// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */
/*
 * On-disk super block.
 * Only 512 B of this structure are meaningful, but a full 4 KB block is
 * consumed on disk. The super block is followed on disk by the chunk to
 * zone mapping table and then by the bitmap blocks indicating zone block
 * validity. The resulting metadata layout is:
 *   (1) Super block (1 block)
 *   (2) Chunk mapping table (nr_map_blocks)
 *   (3) Bitmap blocks (nr_bitmap_blocks)
 * All metadata blocks are stored in conventional zones, starting from
 * the first conventional zone found on disk.
 */
struct dmz_super {
	/* Magic number */
	__le32 magic;			/*   4 */

	/* Metadata version number */
	__le32 version;			/*   8 */

	/* Generation number */
	__le64 gen;			/*  16 */

	/* This block number */
	__le64 sb_block;		/*  24 */

	/* The number of metadata blocks, including this super block */
	__le32 nr_meta_blocks;		/*  28 */

	/* The number of sequential zones reserved for reclaim */
	__le32 nr_reserved_seq;		/*  32 */

	/* The number of entries in the mapping table */
	__le32 nr_chunks;		/*  36 */

	/* The number of blocks used for the chunk mapping table */
	__le32 nr_map_blocks;		/*  40 */

	/* The number of blocks used for the block bitmaps */
	__le32 nr_bitmap_blocks;	/*  44 */

	/* Checksum */
	__le32 crc;			/*  48 */

	/* DM-Zoned label */
	u8 dmz_label[32];		/*  80 */

	/* DM-Zoned UUID */
	u8 dmz_uuid[16];		/*  96 */

	/* Device UUID */
	u8 dev_uuid[16];		/* 112 */

	/* Padding to full 512B sector */
	u8 reserved[400];		/* 512 */
};
/*
 * Chunk mapping entry: entries are indexed by chunk number and give the
 * ID of the zone (dzone_id) storing the chunk data on disk. That zone
 * may be sequential or random. For a sequential zone, a second zone
 * (bzone_id) may be set as a write buffer; the buffer zone is always a
 * randomly writeable zone.
 */
struct dmz_map {
	__le32 dzone_id;
	__le32 bzone_id;
};
for (i = 0; i < zmd->nr_devs; i++) { if (dmz_bdev_is_dying(&zmd->dev[i])) returntrue;
} returnfalse;
}
/*
 * Take the chunk mapping table mutex.
 * This lock also protects all of the zone lists.
 */
void dmz_lock_map(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->map_lock);
}
/*
 * Take metadata access as a reader. The read side of this semaphore
 * keeps metadata flush from running while metadata are being modified;
 * actual write mutual exclusion is provided by the map lock together
 * with zone state management (active and reclaim states are mutually
 * exclusive).
 */
void dmz_lock_metadata(struct dmz_metadata *zmd)
{
	down_read(&zmd->mblk_sem);
}
/*
 * Take the flush mutex: prevents concurrent executions of
 * dmz_flush_metadata() as well as metadata modification in reclaim
 * while a flush is running.
 */
void dmz_lock_flush(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->mblk_flush_lock);
}
/* Figure out where to put the new node */ while (*new) {
b = container_of(*new, struct dmz_mblock, node);
parent = *new; new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
}
/* Add new node and rebalance tree */
rb_link_node(&mblk->node, parent, new);
rb_insert_color(&mblk->node, root);
}
/* * Lookup a metadata block in the rbtree. If the block is found, increment * its reference count.
*/ staticstruct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
sector_t mblk_no)
{ struct rb_root *root = &zmd->mblk_rbtree; struct rb_node *node = root->rb_node; struct dmz_mblock *mblk;
while (node) {
mblk = container_of(node, struct dmz_mblock, node); if (mblk->no == mblk_no) { /* * If this is the first reference to the block, * remove it from the LRU list.
*/
mblk->ref++; if (mblk->ref == 1 &&
!test_bit(DMZ_META_DIRTY, &mblk->state))
list_del_init(&mblk->link); return mblk;
}
node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
}
return NULL;
}
/* * Metadata block BIO end callback.
*/ staticvoid dmz_mblock_bio_end_io(struct bio *bio)
{ struct dmz_mblock *mblk = bio->bi_private; int flag;
if (bio->bi_status)
set_bit(DMZ_META_ERROR, &mblk->state);
if (bio_op(bio) == REQ_OP_WRITE)
flag = DMZ_META_WRITING; else
flag = DMZ_META_READING;
/* * Read an uncached metadata block from disk and add it to the cache.
*/ staticstruct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
sector_t mblk_no)
{ struct dmz_mblock *mblk, *m;
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; struct bio *bio;
if (dmz_bdev_is_dying(dev)) return ERR_PTR(-EIO);
/* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no); if (!mblk) return ERR_PTR(-ENOMEM);
bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO,
GFP_NOIO);
spin_lock(&zmd->mblk_lock);
/* * Make sure that another context did not start reading * the block already.
*/
m = dmz_get_mblock_fast(zmd, mblk_no); if (m) {
spin_unlock(&zmd->mblk_lock);
dmz_free_mblock(zmd, mblk);
bio_put(bio); return m;
}
/*
 * Shrinker count callback: report how many metadata blocks are
 * currently held in the cache.
 */
static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct dmz_metadata *zmd = shrink->private_data;

	return atomic_read(&zmd->nr_mblks);
}
/* * For mblock shrinker: scan unused metadata blocks and shrink the cache.
*/ staticunsignedlong dmz_mblock_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
{ struct dmz_metadata *zmd = shrink->private_data; unsignedlong count;
/* * Get a metadata block from the rbtree. If the block * is not present, read it from disk.
*/ staticstruct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
sector_t mblk_no)
{ struct dmz_mblock *mblk; struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
/* Write dirty blocks to the log */
ret = dmz_write_dirty_mblocks(zmd, write_list, log_set); if (ret) return ret;
/* * No error so far: now validate the log by updating the * log index super block generation.
*/
ret = dmz_write_sb(zmd, log_set); if (ret) return ret;
/* * Make sure that metadata blocks are stable before logging: take * the write lock on the metadata semaphore to prevent target BIOs * from modifying metadata.
*/
down_write(&zmd->mblk_sem);
dev = zmd->sb[zmd->mblk_primary].dev;
/* * This is called from the target flush work and reclaim work. * Concurrent execution is not allowed.
*/
dmz_lock_flush(zmd);
if (dmz_bdev_is_dying(dev)) {
ret = -EIO; goto out;
}
/* Get dirty blocks */
spin_lock(&zmd->mblk_lock);
list_splice_init(&zmd->mblk_dirty_list, &write_list);
spin_unlock(&zmd->mblk_lock);
/* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) {
ret = blkdev_issue_flush(dev->bdev); goto err;
}
/* * The primary metadata set is still clean. Keep it this way until * all updates are successful in the secondary set. That is, use * the secondary set as a log.
*/
ret = dmz_log_dirty_mblocks(zmd, &write_list); if (ret) goto err;
/* * The log is on disk. It is now safe to update in place * in the primary metadata set.
*/
ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); if (ret) goto err;
ret = dmz_write_sb(zmd, zmd->mblk_primary); if (ret) goto err;
while (!list_empty(&write_list)) {
mblk = list_first_entry(&write_list, struct dmz_mblock, link);
list_del_init(&mblk->link);
if (tertiary) { /* * Generation number should be 0, but it doesn't * really matter if it isn't.
*/ if (gen != 0)
dmz_dev_warn(dev, "Invalid generation %llu",
gen); return 0;
}
}
/* * Read the first or second super block from disk.
*/ staticint dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
dmz_zmd_debug(zmd, "read superblock set %d dev %pg block %llu",
set, sb->dev->bdev, sb->block);
/* * Determine the position of the secondary super blocks on disk. * This is used only if a corruption of the primary super block * is detected.
*/ staticint dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
{ unsignedint zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; unsignedint zone_id = zmd->sb[0].zone->id; int i;
/* Allocate a block */
mblk = dmz_alloc_mblock(zmd, 0); if (!mblk) return -ENOMEM;
/* Bad first super block: search for the second one */
zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
zmd->sb[1].dev = zmd->sb[0].dev; for (i = 1; i < zmd->nr_rnd_zones; i++) { if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0) break; if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) return 0;
zmd->sb[1].block += zone_nr_blocks;
zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
}
page = alloc_page(GFP_NOIO); if (!page) return -ENOMEM;
/* Copy metadata blocks */ for (i = 1; i < zmd->nr_meta_blocks; i++) {
ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
zmd->sb[src_set].block + i, page); if (ret) goto out;
ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
zmd->sb[dst_set].block + i, page); if (ret) goto out;
}
/* Finalize with the super block */ if (!zmd->sb[dst_set].mblk) {
zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0); if (!zmd->sb[dst_set].mblk) {
ret = -ENOMEM; goto out;
}
zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
}
ret = dmz_write_sb(zmd, dst_set);
out:
__free_pages(page, 0);
return ret;
}
/* * Get super block from disk.
*/ staticint dmz_load_sb(struct dmz_metadata *zmd)
{ bool sb_good[2] = {false, false};
u64 sb_gen[2] = {0, 0}; int ret;
if (!zmd->sb[0].zone) {
dmz_zmd_err(zmd, "Primary super block zone not set"); return -ENXIO;
}
/* Read and check the primary super block */
zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
zmd->sb[0].dev = zmd->sb[0].zone->dev;
ret = dmz_get_sb(zmd, &zmd->sb[0], 0); if (ret) {
dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); return ret;
}
ret = dmz_check_sb(zmd, &zmd->sb[0], false);
/* Read and check secondary super block */ if (ret == 0) {
sb_good[0] = true; if (!zmd->sb[1].zone) { unsignedint zone_id =
zmd->sb[0].zone->id + zmd->nr_meta_zones;
dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev, "Using super block %u (gen %llu)",
zmd->mblk_primary, zmd->sb_gen);
if (zmd->sb_version > 1) { int i; struct dmz_sb *sb;
sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL); if (!sb) return -ENOMEM; for (i = 1; i < zmd->nr_devs; i++) {
sb->block = 0;
sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
sb->dev = &zmd->dev[i]; if (!dmz_is_meta(sb->zone)) {
dmz_dev_err(sb->dev, "Tertiary super block zone %u not marked as metadata zone",
sb->zone->id);
ret = -EINVAL; goto out_kfree;
}
ret = dmz_get_sb(zmd, sb, i + 1); if (ret) {
dmz_dev_err(sb->dev, "Read tertiary super block failed");
dmz_free_mblock(zmd, sb->mblk); goto out_kfree;
}
ret = dmz_check_sb(zmd, sb, true);
dmz_free_mblock(zmd, sb->mblk); if (ret == -EINVAL) goto out_kfree;
}
out_kfree:
kfree(sb);
} return ret;
}
/* * Initialize a zone descriptor.
*/ staticint dmz_init_zone(struct blk_zone *blkz, unsignedint num, void *data)
{ struct dmz_dev *dev = data; struct dmz_metadata *zmd = dev->metadata; int idx = num + dev->zone_offset; struct dm_zone *zone;
zone = dmz_insert(zmd, idx, dev); if (IS_ERR(zone)) return PTR_ERR(zone);
if (blkz->len != zmd->zone_nr_sectors) { if (zmd->sb_version > 1) { /* Ignore the eventual runt (smaller) zone */
set_bit(DMZ_OFFLINE, &zone->flags); return 0;
} elseif (blkz->start + blkz->len == dev->capacity) return 0; return -ENXIO;
}
/* * Devices that have zones with a capacity smaller than the zone size * (e.g. NVMe zoned namespaces) are not supported.
*/ if (blkz->capacity != blkz->len) return -ENXIO;
switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL:
set_bit(DMZ_RND, &zone->flags); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF:
set_bit(DMZ_SEQ, &zone->flags); break; default: return -ENXIO;
}
if (blkz->cond == BLK_ZONE_COND_OFFLINE)
set_bit(DMZ_OFFLINE, &zone->flags); elseif (blkz->cond == BLK_ZONE_COND_READONLY)
set_bit(DMZ_READ_ONLY, &zone->flags); else {
zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) {
zmd->nr_rnd_zones++; if (zmd->nr_devs == 1 && !zmd->sb[0].zone) { /* Primary super block zone */
zmd->sb[0].zone = zone;
}
} if (zmd->nr_devs > 1 && num == 0) { /* * Tertiary superblock zones are always at the * start of the zoned devices, so mark them * as metadata zone.
*/
set_bit(DMZ_META, &zone->flags);
}
} return 0;
}
/* * Allocate and initialize zone descriptors using the zone * information from disk.
*/ staticint dmz_init_zones(struct dmz_metadata *zmd)
{ int i, ret; struct dmz_dev *zoned_dev = &zmd->dev[0];
if (!zmd->nr_zones) {
DMERR("(%s): No zones found", zmd->devname); return -ENXIO;
}
xa_init(&zmd->zones);
DMDEBUG("(%s): Using %zu B for zone information",
zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);
if (zmd->nr_devs > 1) {
ret = dmz_emulate_zones(zmd, &zmd->dev[0]); if (ret < 0) {
DMDEBUG("(%s): Failed to emulate zones, error %d",
zmd->devname, ret);
dmz_drop_zones(zmd); return ret;
}
/* * Primary superblock zone is always at zone 0 when multiple * drives are present.
*/
zmd->sb[0].zone = dmz_get(zmd, 0);
for (i = 1; i < zmd->nr_devs; i++) {
zoned_dev = &zmd->dev[i];
ret = blkdev_report_zones(zoned_dev->bdev, 0,
BLK_ALL_ZONES,
dmz_init_zone, zoned_dev); if (ret < 0) {
DMDEBUG("(%s): Failed to report zones, error %d",
zmd->devname, ret);
dmz_drop_zones(zmd); return ret;
}
} return 0;
}
/* * Get zone information and initialize zone descriptors. At the same * time, determine where the super block should be: first block of the * first randomly writable zone.
*/
ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
dmz_init_zone, zoned_dev); if (ret < 0) {
DMDEBUG("(%s): Failed to report zones, error %d",
zmd->devname, ret);
dmz_drop_zones(zmd); return ret;
}
/*
 * Refresh the cached information of @zone from the device.
 * Returns 0 on success or a negative error code.
 */
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int noio_flag;
	int ret;

	/* Regular (non-zoned) devices have no zone report to fetch */
	if (dev->flags & DMZ_BDEV_REGULAR)
		return 0;

	/*
	 * blkdev_report_zones() uses GFP_KERNEL by default for memory
	 * allocations; set the per-task PF_MEMALLOC_NOIO flag so that
	 * all allocations are done as if GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
				  dmz_update_zone_cb, zone);
	memalloc_noio_restore(noio_flag);

	/* A report of zero zones means the device gave us nothing */
	if (ret == 0)
		ret = -EIO;
	if (ret < 0) {
		dmz_dev_err(dev, "Get zone %u report failed",
			    zone->id);
		dmz_check_bdev(dev);
		return ret;
	}

	return 0;
}
/* * Check a zone write pointer position when the zone is marked * with the sequential write error flag.
*/ staticint dmz_handle_seq_write_err(struct dmz_metadata *zmd, struct dm_zone *zone)
{ struct dmz_dev *dev = zone->dev; unsignedint wp = 0; int ret;
wp = zone->wp_block;
ret = dmz_update_zone(zmd, zone); if (ret) return ret;
/* Metadata block array for the chunk mapping table */
zmd->map_mblk = kcalloc(zmd->nr_map_blocks, sizeof(struct dmz_mblk *), GFP_KERNEL); if (!zmd->map_mblk) return -ENOMEM;
/* Get chunk mapping table blocks and initialize zone mapping */ while (chunk < zmd->nr_chunks) { if (!dmap_mblk) { /* Get mapping block */
dmap_mblk = dmz_get_mblock(zmd, i + 1); if (IS_ERR(dmap_mblk)) return PTR_ERR(dmap_mblk);
zmd->map_mblk[i] = dmap_mblk;
dmap = dmap_mblk->data;
i++;
e = 0;
}
/* Check data zone */
dzone_id = le32_to_cpu(dmap[e].dzone_id); if (dzone_id == DMZ_MAP_UNMAPPED) goto next;
if (dzone_id >= zmd->nr_zones) {
dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
chunk, dzone_id); return -EIO;
}
dzone = dmz_get(zmd, dzone_id); if (!dzone) {
dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
chunk, dzone_id); return -EIO;
}
set_bit(DMZ_DATA, &dzone->flags);
dzone->chunk = chunk;
dmz_get_zone_weight(zmd, dzone);
/* * At this point, only meta zones and mapped data zones were * fully initialized. All remaining zones are unmapped data * zones. Finish initializing those here.
*/ for (i = 0; i < zmd->nr_zones; i++) {
dzone = dmz_get(zmd, i); if (!dzone) continue; if (dmz_is_meta(dzone)) continue; if (dmz_is_offline(dzone)) continue;
if (dmz_is_cache(dzone))
zmd->nr_cache++; elseif (dmz_is_rnd(dzone))
dzone->dev->nr_rnd++; else
dzone->dev->nr_seq++;
if (dmz_is_data(dzone)) { /* Already initialized */ continue;
}
/* * The list of mapped zones is maintained in LRU order. * This rotates a zone at the end of its map list.
*/ staticvoid __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{ if (list_empty(&zone->link)) return;
list_del_init(&zone->link); if (dmz_is_seq(zone)) { /* LRU rotate sequential zone */
list_add_tail(&zone->link, &zone->dev->map_seq_list);
} elseif (dmz_is_cache(zone)) { /* LRU rotate cache zone */
list_add_tail(&zone->link, &zmd->map_cache_list);
} else { /* LRU rotate random zone */
list_add_tail(&zone->link, &zone->dev->map_rnd_list);
}
}
/*
 * LRU-rotate a data zone and, when it has one, its buffer zone.
 */
static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	__dmz_lru_zone(zmd, zone);
	if (zone->bzone)
		__dmz_lru_zone(zmd, zone->bzone);
}
/* * Wait for any zone to be freed.
*/ staticvoid dmz_wait_for_free_zones(struct dmz_metadata *zmd)
{
DEFINE_WAIT(wait);
/*
 * Lock a zone for reclaim (set the zone RECLAIM bit).
 * Returns 0 if the zone cannot be locked or if it is already locked,
 * and 1 otherwise.
*/ int dmz_lock_zone_reclaim(struct dm_zone *zone)
{ /* Active zones cannot be reclaimed */ if (dmz_is_active(zone)) return 0;
/*
 * Wait for the reclaim of @zone to complete. The map and metadata locks
 * are dropped while waiting so that reclaim can make progress, and
 * re-acquired in the same order before returning.
 */
static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);

	/* Ask reclaim to stop and wait (bounded) for the flag to clear */
	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);

	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
}
/* * Select a cache or random write zone for reclaim.
*/ staticstruct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, unsignedint idx, bool idle)
{ struct dm_zone *dzone = NULL; struct dm_zone *zone, *maxw_z = NULL; struct list_head *zone_list;
/* If we have cache zones select from the cache zone list */ if (zmd->nr_cache) {
zone_list = &zmd->map_cache_list; /* Try to relaim random zones, too, when idle */ if (idle && list_empty(zone_list))
zone_list = &zmd->dev[idx].map_rnd_list;
} else
zone_list = &zmd->dev[idx].map_rnd_list;
/* * Find the buffer zone with the heaviest weight or the first (oldest) * data zone that can be reclaimed.
*/
list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) {
dzone = zone->bzone; if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) continue; if (!maxw_z || maxw_z->weight < dzone->weight)
maxw_z = dzone;
} else {
dzone = zone; if (dmz_lock_zone_reclaim(dzone)) return dzone;
}
}
if (maxw_z && dmz_lock_zone_reclaim(maxw_z)) return maxw_z;
/* * If we come here, none of the zones inspected could be locked for * reclaim. Try again, being more aggressive, that is, find the * first zone that can be reclaimed regardless of its weitght.
*/
list_for_each_entry(zone, zone_list, link) { if (dmz_is_buf(zone)) {
dzone = zone->bzone; if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) continue;
} else
dzone = zone; if (dmz_lock_zone_reclaim(dzone)) return dzone;
}
return NULL;
}
/* * Select a buffered sequential zone for reclaim.
*/ staticstruct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd, unsignedint idx)
{ struct dm_zone *zone;
list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) { if (!zone->bzone) continue; if (dmz_lock_zone_reclaim(zone)) return zone;
}
return NULL;
}
/*
 * Select a zone to reclaim. Two cases are possible:
 * (1) No free sequential zone remains: a random data zone cannot be
 *     reclaimed, so pick a sequential zone so that a random zone can
 *     be reclaimed afterwards.
 * (2) At least one free sequential zone is available: pick the oldest
 *     random zone (data or buffer) that can be locked.
 */
struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
					 unsigned int dev_idx, bool idle)
{
	struct dm_zone *zone = NULL;

	dmz_lock_map(zmd);
	if (list_empty(&zmd->reserved_seq_zones_list))
		zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
	if (!zone)
		zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
	dmz_unlock_map(zmd);

	return zone;
}
/* * Get the zone mapping a chunk, if the chunk is mapped already. * If no mapping exist and the operation is WRITE, a zone is * allocated and used to map the chunk. * The zone returned will be set to the active state.
*/ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsignedint chunk, enum req_op op)
{ struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; struct dmz_map *dmap = dmap_mblk->data; int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK; unsignedint dzone_id; struct dm_zone *dzone = NULL; int ret = 0; int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
dmz_lock_map(zmd);
again: /* Get the chunk mapping */
dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id); if (dzone_id == DMZ_MAP_UNMAPPED) { /* * Read or discard in unmapped chunks are fine. But for * writes, we need a mapping, so get one.
*/ if (op != REQ_OP_WRITE) goto out;
/* Allocate a random zone */
dzone = dmz_alloc_zone(zmd, 0, alloc_flags); if (!dzone) { if (dmz_dev_is_dying(zmd)) {
dzone = ERR_PTR(-EIO); goto out;
}
dmz_wait_for_free_zones(zmd); goto again;
}
dmz_map_zone(zmd, dzone, chunk);
} else { /* The chunk is already mapped: get the mapping zone */
dzone = dmz_get(zmd, dzone_id); if (!dzone) {
dzone = ERR_PTR(-EIO); goto out;
} if (dzone->chunk != chunk) {
dzone = ERR_PTR(-EIO); goto out;
}
/* Repair write pointer if the sequential dzone has error */ if (dmz_seq_write_err(dzone)) {
ret = dmz_handle_seq_write_err(zmd, dzone); if (ret) {
dzone = ERR_PTR(-EIO); goto out;
}
clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
}
}
/* * If the zone is being reclaimed, the chunk mapping may change * to a different zone. So wait for reclaim and retry. Otherwise, * activate the zone (this will prevent reclaim from touching it).
*/ if (dmz_in_reclaim(dzone)) {
dmz_wait_for_reclaim(zmd, dzone); goto again;
}
dmz_activate_zone(dzone);
dmz_lru_zone(zmd, dzone);
out:
dmz_unlock_map(zmd);
return dzone;
}
/*
 * Writes and discards change the block validity of data zones and their
 * buffer zones. If either zone has no valid block left, it can be
 * unmapped and freed on the fly here, without waiting for reclaim.
 */
void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
{
	struct dm_zone *bzone;

	dmz_lock_map(zmd);

	bzone = dzone->bzone;
	if (bzone) {
		if (dmz_weight(bzone))
			dmz_lru_zone(zmd, bzone);
		else {
			/* Buffer zone has no valid block: reclaim it now */
			dmz_unmap_zone(zmd, bzone);
			dmz_free_zone(zmd, bzone);
			bzone = NULL;
		}
	}

	/* Deactivate the data zone */
	dmz_deactivate_zone(dzone);
	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
		dmz_lru_zone(zmd, dzone);
	else {
		/* Inactive, unbuffered and empty data zone: reclaim it now */
		dmz_unmap_zone(zmd, dzone);
		dmz_free_zone(zmd, dzone);
	}

	dmz_unlock_map(zmd);
}
/* * Allocate and map a random zone to buffer a chunk * already mapped to a sequential zone.
*/ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, struct dm_zone *dzone)
{ struct dm_zone *bzone; int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
dmz_lock_map(zmd);
again:
bzone = dzone->bzone; if (bzone) goto out;
/* Allocate a random zone */
bzone = dmz_alloc_zone(zmd, 0, alloc_flags); if (!bzone) { if (dmz_dev_is_dying(zmd)) {
bzone = ERR_PTR(-EIO); goto out;
}
dmz_wait_for_free_zones(zmd); goto again;
}
/* Update the chunk mapping */
dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);
/* * Get an unmapped (free) zone. * This must be called with the mapping lock held.
*/ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsignedint dev_idx, unsignedlong flags)
{ struct list_head *list; struct dm_zone *zone; int i;
/* Schedule reclaim to ensure free zones are available */ if (!(flags & DMZ_ALLOC_RECLAIM)) { for (i = 0; i < zmd->nr_devs; i++)
dmz_schedule_reclaim(zmd->dev[i].reclaim);
}
i = 0;
again: if (flags & DMZ_ALLOC_CACHE)
list = &zmd->unmap_cache_list; elseif (flags & DMZ_ALLOC_RND)
list = &zmd->dev[dev_idx].unmap_rnd_list; else
list = &zmd->dev[dev_idx].unmap_seq_list;
if (list_empty(list)) { /* * No free zone: return NULL if this is for not reclaim.
*/ if (!(flags & DMZ_ALLOC_RECLAIM)) return NULL; /* * Try to allocate from other devices
*/ if (i < zmd->nr_devs) {
dev_idx = (dev_idx + 1) % zmd->nr_devs;
i++; goto again;
}
/* * Fallback to the reserved sequential zones
*/
zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list, struct dm_zone, link); if (zone) {
list_del_init(&zone->link);
atomic_dec(&zmd->nr_reserved_seq_zones);
} return zone;
}
zone = list_first_entry(list, struct dm_zone, link);
list_del_init(&zone->link);
if (dmz_is_cache(zone))
atomic_dec(&zmd->unmap_nr_cache); elseif (dmz_is_rnd(zone))
atomic_dec(&zone->dev->unmap_nr_rnd); else
atomic_dec(&zone->dev->unmap_nr_seq);
if (dmz_is_offline(zone)) {
dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
zone = NULL; goto again;
} if (dmz_is_meta(zone)) {
dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
zone = NULL; goto again;
} return zone;
}
/*
 * Free a zone, returning it to its per-type unmapped list and waking up
 * any waiter blocked on zone availability.
 * This must be called with the mapping lock held.
 */
void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	/* Sequential zones must be reset before they can be reused */
	if (dmz_is_seq(zone))
		dmz_reset_zone(zmd, zone);

	/* Return the zone to the unmap list matching its type */
	if (dmz_is_cache(zone)) {
		list_add_tail(&zone->link, &zmd->unmap_cache_list);
		atomic_inc(&zmd->unmap_nr_cache);
	} else if (dmz_is_rnd(zone)) {
		list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
		atomic_inc(&zone->dev->unmap_nr_rnd);
	} else if (dmz_is_reserved(zone)) {
		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
		atomic_inc(&zmd->nr_reserved_seq_zones);
	} else {
		list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
		atomic_inc(&zone->dev->unmap_nr_seq);
	}

	wake_up_all(&zmd->free_wq);
}
/* * Map a chunk to a zone. * This must be called with the mapping lock held.
*/ void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, unsignedint chunk)
{ /* Set the chunk mapping */
dmz_set_chunk_mapping(zmd, chunk, dzone->id,
DMZ_MAP_UNMAPPED);
dzone->chunk = chunk; if (dmz_is_cache(dzone))
list_add_tail(&dzone->link, &zmd->map_cache_list); elseif (dmz_is_rnd(dzone))
list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); else
list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
}
/* * Unmap a zone. * This must be called with the mapping lock held.
*/ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{ unsignedint chunk = zone->chunk; unsignedint dzone_id;
if (test_and_clear_bit(DMZ_BUF, &zone->flags)) { /* * Unmapping the chunk buffer zone: clear only * the chunk buffer mapping
*/
dzone_id = zone->bzone->id;
zone->bzone->bzone = NULL;
zone->bzone = NULL;
} else { /* * Unmapping the chunk data zone: the zone must * not be buffered.
*/ if (WARN_ON(zone->bzone)) {
zone->bzone->bzone = NULL;
zone->bzone = NULL;
}
dzone_id = DMZ_MAP_UNMAPPED;
}
/* * Set @nr_bits bits in @bitmap starting from @bit. * Return the number of bits changed from 0 to 1.
*/ staticunsignedint dmz_set_bits(unsignedlong *bitmap, unsignedint bit, unsignedint nr_bits)
{ unsignedlong *addr; unsignedint end = bit + nr_bits; unsignedint n = 0;
while (bit < end) { if (((bit & (BITS_PER_LONG - 1)) == 0) &&
((end - bit) >= BITS_PER_LONG)) { /* Try to set the whole word at once */
addr = bitmap + BIT_WORD(bit); if (*addr == 0) {
*addr = ULONG_MAX;
n += BITS_PER_LONG;
bit += BITS_PER_LONG; continue;
}
}
if (!test_and_set_bit(bit, bitmap))
n++;
bit++;
}
return n;
}
/* * Get the bitmap block storing the bit for chunk_block in zone.
*/ staticstruct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, struct dm_zone *zone,
sector_t chunk_block)
{
sector_t bitmap_block = 1 + zmd->nr_map_blocks +
(sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
(chunk_block >> DMZ_BLOCK_SHIFT_BITS);
return dmz_get_mblock(zmd, bitmap_block);
}
/* * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
*/ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, struct dm_zone *to_zone)
{ struct dmz_mblock *from_mblk, *to_mblk;
sector_t chunk_block = 0;
/* Get the zones bitmap blocks */ while (chunk_block < zmd->zone_nr_blocks) {
from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); if (IS_ERR(from_mblk)) return PTR_ERR(from_mblk);
to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block); if (IS_ERR(to_mblk)) {
dmz_release_mblock(zmd, from_mblk); return PTR_ERR(to_mblk);
}
/*
 * Merge the valid blocks bitmap of @from_zone into the bitmap of
 * @to_zone, starting at @chunk_block.
 */
int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
			   struct dm_zone *to_zone, sector_t chunk_block)
{
	unsigned int nr_blocks;
	int ret;

	/* Walk the valid regions of the source zone */
	while (chunk_block < zmd->zone_nr_blocks) {
		ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
		if (ret <= 0)
			return ret;
		nr_blocks = ret;

		/* Mark the same region valid in the destination zone */
		ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
		if (ret)
			return ret;

		chunk_block += nr_blocks;
	}

	return 0;
}
/*
 * Validate all blocks in the range [chunk_block..chunk_block+nr_blocks-1]
 * of @zone and increase the zone weight by the number of newly set bits.
 */
int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
			sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int count, bit, nr_bits;
	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
	struct dmz_mblock *mblk;
	unsigned int n = 0;

	while (nr_blocks) {
		/* Get the bitmap block covering chunk_block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Set the bits that fall inside this bitmap block */
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

		count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
		if (count) {
			/* Bits actually changed: the block must be flushed */
			dmz_dirty_mblock(zmd, mblk);
			n += count;
		}
		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	/* Update the zone weight, guarding against overflow */
	if (likely(zone->weight + n <= zone_nr_blocks))
		zone->weight += n;
	else {
		dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
			     zone->id, zone->weight,
			     zone_nr_blocks - n);
		zone->weight = zone_nr_blocks;
	}

	return 0;
}
/* * Clear nr_bits bits in bitmap starting from bit. * Return the number of bits cleared.
*/ staticint dmz_clear_bits(unsignedlong *bitmap, int bit, int nr_bits)
{ unsignedlong *addr; int end = bit + nr_bits; int n = 0;
while (bit < end) { if (((bit & (BITS_PER_LONG - 1)) == 0) &&
((end - bit) >= BITS_PER_LONG)) { /* Try to clear whole word at once */
addr = bitmap + BIT_WORD(bit); if (*addr == ULONG_MAX) {
*addr = 0;
n += BITS_PER_LONG;
bit += BITS_PER_LONG; continue;
}
}
if (test_and_clear_bit(bit, bitmap))
n++;
bit++;
}
return n;
}
/*
 * Invalidate all the blocks in the range [block..block+nr_blocks-1]
 * of @zone and decrease the zone weight by the number of bits cleared.
 *
 * NOTE(review): the loop header that fetches each bitmap block and
 * computes @mblk, @bit and @nr_bits (cf. dmz_validate_blocks()) appears
 * to be missing from this text — those variables are used below without
 * being set. Presumably lost in extraction; confirm against the
 * original source.
 */ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
sector_t chunk_block, unsignedint nr_blocks)
{ unsignedint count, bit, nr_bits; struct dmz_mblock *mblk; unsignedint n = 0;
/* Clear the validity bits covered by this bitmap block */
count = dmz_clear_bits((unsignedlong *)mblk->data,
bit, nr_bits); if (count) {
/* Bits actually changed: the bitmap block must be flushed */
dmz_dirty_mblock(zmd, mblk);
n += count;
}
dmz_release_mblock(zmd, mblk);
nr_blocks -= nr_bits;
chunk_block += nr_bits;
}
/* Decrement the zone weight, guarding against underflow */
if (zone->weight >= n)
zone->weight -= n; else {
dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
zone->id, zone->weight, n);
zone->weight = 0;
}
return 0;
}
/*
 * Return the validity bit value of @chunk_block in @zone (0 or 1),
 * or a negative error code if the bitmap block cannot be read.
 */
static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t chunk_block)
{
	struct dmz_mblock *mblk;
	int ret;

	WARN_ON(chunk_block >= zmd->zone_nr_blocks);

	/* Get the bitmap block covering chunk_block */
	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
	if (IS_ERR(mblk))
		return PTR_ERR(mblk);

	/* Test the bit of the block within this bitmap block */
	ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
		       (unsigned long *)mblk->data) != 0;

	dmz_release_mblock(zmd, mblk);

	return ret;
}
/* * Return the number of blocks from chunk_block to the first block with a bit * value specified by set. Search at most nr_blocks blocks from chunk_block.
*/ staticint dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
sector_t chunk_block, unsignedint nr_blocks, int set)
{ struct dmz_mblock *mblk; unsignedint bit, set_bit, nr_bits; unsignedint zone_bits = zmd->zone_bits_per_mblk; unsignedlong *bitmap; int n = 0;
n += set_bit - bit; if (set_bit < zone_bits) break;
nr_blocks -= nr_bits;
chunk_block += nr_bits;
}
return n;
}
/* * Test if chunk_block is valid. If it is, the number of consecutive * valid blocks from chunk_block will be returned.
*/ int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
sector_t chunk_block)
{ int valid;
/* The block is valid: get the number of valid blocks from block */ return dmz_to_next_set_block(zmd, zone, chunk_block,
zmd->zone_nr_blocks - chunk_block, 0);
}
/* * Find the first valid block from @chunk_block in @zone. * If such a block is found, its number is returned using * @chunk_block and the total number of valid blocks from @chunk_block * is returned.
*/ int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
sector_t *chunk_block)
{
sector_t start_block = *chunk_block; int ret;
ret = dmz_to_next_set_block(zmd, zone, start_block,
zmd->zone_nr_blocks - start_block, 1); if (ret < 0) return ret;
/*
 * Count the bits set in @bitmap within [bit..bit+nr_bits-1].
 */
static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
{
	unsigned long *addr;
	int end = bit + nr_bits;
	int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			/* Fast path: count a fully set aligned word at once */
			addr = (unsigned long *)bitmap + BIT_WORD(bit);
			if (*addr == ULONG_MAX) {
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (test_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}
/* * Get a zone weight.
*/
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.28 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.