// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define	DM_MSG_PREFIX		"zoned"

#define DMZ_MIN_BIOS		8192
/*
 * Zone BIO context.
 */
struct dmz_bioctx {
	struct dmz_dev		*dev;
	struct dm_zone		*zone;
	struct bio		*bio;
	refcount_t		ref;
};
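
/*
 * Editorial illustration (not part of the original driver): the context
 * above is not allocated separately. Device-mapper reserves
 * ti->per_io_data_size bytes with each BIO (see the constructor fragment
 * below), and the handlers recover the context directly from the BIO, e.g.:
 *
 *	struct dmz_bioctx *bioctx =
 *		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
 */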
/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

	if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
		bio->bi_status = status;
	if (bioctx->dev && bio->bi_status != BLK_STS_OK)
		bioctx->dev->flags |= DMZ_CHECK_BDEV;

	if (refcount_dec_and_test(&bioctx->ref)) {
		struct dm_zone *zone = bioctx->zone;

		if (zone) {
			if (bio->bi_status != BLK_STS_OK &&
			    bio_op(bio) == REQ_OP_WRITE &&
			    dmz_is_seq(zone))
				set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
			dmz_deactivate_zone(zone);
		}
		bio_endio(bio);
	}
}
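
/*
 * Editorial illustration (not part of the original driver): bioctx->ref
 * pins the parent BIO while internal clones are in flight. dmz_map()
 * initializes the count to 1, each clone submission is expected to take an
 * extra reference before the clone is issued, and both the clone completion
 * path and the final dmz_bio_endio() call from the BIO handler drop one
 * reference, e.g.:
 *
 *	refcount_inc(&bioctx->ref);
 *	submit_bio_noacct(clone);	// dmz_clone_endio() drops this ref
 *	...
 *	dmz_bio_endio(bio, errno_to_blk_status(ret));
 *
 * bio_endio() on the parent therefore runs only once, after the last
 * reference is gone.
 */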
/*
 * Completion callback for an internally cloned target BIO. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_clone_endio(struct bio *clone)
{
	struct dmz_bioctx *bioctx = clone->bi_private;
	blk_status_t status = clone->bi_status;

	bio_put(clone);
	dmz_bio_endio(bioctx->bio, status);
}
/*
 * Issue a clone of a target BIO. The clone may only partially process the
 * original target BIO.
 */
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
			  struct bio *bio, sector_t chunk_block,
			  unsigned int nr_blocks)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_dev *dev = zone->dev;
	struct bio *clone;

	if (dev->flags & DMZ_BDEV_DYING)
		return -EIO;

	clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set);
	if (!clone)
		return -ENOMEM;
	/* Check block validity to determine the read location */
	bzone = zone->bzone;
	while (chunk_block < end_block) {
		nr_blocks = 0;
		if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
		    chunk_block < zone->wp_block) {
			/* Test block validity in the data zone */
			ret = dmz_block_valid(zmd, zone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read data zone blocks */
				nr_blocks = ret;
				rzone = zone;
			}
		}

		/*
		 * No valid blocks found in the data zone.
		 * Check the buffer zone, if there is one.
		 */
		if (!nr_blocks && bzone) {
			ret = dmz_block_valid(zmd, bzone, chunk_block);
			if (ret < 0)
				return ret;
			if (ret > 0) {
				/* Read buffer zone blocks */
				nr_blocks = ret;
				rzone = bzone;
			}
		}

		if (nr_blocks) {
			/* Valid blocks found: read them */
			nr_blocks = min_t(unsigned int, nr_blocks,
					  end_block - chunk_block);
			ret = dmz_submit_bio(dmz, rzone, bio,
					     chunk_block, nr_blocks);
			if (ret)
				return ret;
			chunk_block += nr_blocks;
		} else {
			/* No valid block: zeroout the current BIO block */
			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
			chunk_block++;
		}
	}

	return 0;
}
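
/*
 * Editorial note (illustration, not part of the original driver): the read
 * loop above relies on dmz_block_valid() returning a negative errno on
 * failure, 0 when chunk_block holds no valid data in the zone, and, as used
 * here, a positive count of consecutive valid blocks starting at
 * chunk_block. A multi-block read can thus be served piecewise, e.g.:
 *
 *	ret = dmz_block_valid(zmd, zone, chunk_block);	// e.g. returns 2
 *	nr_blocks = min_t(unsigned int, ret, end_block - chunk_block);
 *	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
 *
 * with any remaining blocks read from the buffer zone or zero-filled.
 */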
/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the blocks written
 * in place.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
				   struct dm_zone *zone, struct bio *bio,
				   sector_t chunk_block,
				   unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone = zone->bzone;
	int ret;

	if (dmz_is_readonly(zone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the data zone and invalidate
	 * in the buffer zone, if there is one.
	 */
	ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && bzone)
		ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

	return ret;
}
/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
				     struct dm_zone *zone, struct bio *bio,
				     sector_t chunk_block,
				     unsigned int nr_blocks)
{
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *bzone;
	int ret;

	/* Get the buffer zone. One will be allocated if needed */
	bzone = dmz_get_chunk_buffer(zmd, zone);
	if (IS_ERR(bzone))
		return PTR_ERR(bzone);

	if (dmz_is_readonly(bzone))
		return -EROFS;

	/* Submit write */
	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/*
	 * Validate the blocks in the buffer zone
	 * and invalidate in the data zone.
	 */
	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
	if (ret == 0 && chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

	return ret;
}
	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block == zone->wp_block) {
		/*
		 * zone is a random zone or it is a sequential zone
		 * and the BIO is aligned to the zone write pointer:
		 * direct write the zone.
		 */
		return dmz_handle_direct_write(dmz, zone, bio,
					       chunk_block, nr_blocks);
	}

	/*
	 * This is an unaligned write in a sequential zone:
	 * use buffered write.
	 */
	return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}
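
/*
 * Editorial example (not part of the original driver): assuming a sequential
 * data zone whose write pointer block is 100, a write to chunk blocks
 * 100..103 is aligned with the write pointer and is issued directly into the
 * data zone, while a write to chunk blocks 200..203 of the same chunk cannot
 * be written at the pointer and is redirected by the dispatch above to the
 * chunk's buffer zone:
 *
 *	// chunk_block == zone->wp_block: direct write
 *	ret = dmz_handle_direct_write(dmz, zone, bio, 100, 4);
 *	// chunk_block != zone->wp_block on a sequential zone: buffered write
 *	ret = dmz_handle_buffered_write(dmz, zone, bio, 200, 4);
 */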
/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
			      struct bio *bio)
{
	struct dmz_metadata *zmd = dmz->metadata;
	sector_t block = dmz_bio_block(bio);
	unsigned int nr_blocks = dmz_bio_blocks(bio);
	sector_t chunk_block = dmz_chunk_block(zmd, block);
	int ret = 0;

	/* For unmapped chunks, there is nothing to do */
	if (!zone)
		return 0;

	/*
	 * Invalidate blocks in the data zone and its
	 * buffer zone if one is mapped.
	 */
	if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
	    chunk_block < zone->wp_block)
		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
	if (ret == 0 && zone->bzone)
		ret = dmz_invalidate_blocks(zmd, zone->bzone,
					    chunk_block, nr_blocks);
	return ret;
}
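
/*
 * Editorial note (illustration, not part of the original driver): a discard
 * only updates the block validity bitmaps; no command is issued to the
 * backing device here. Once dmz_invalidate_blocks() has cleared the bits,
 * later reads of those chunk blocks find no valid location and are
 * zero-filled by the read path above:
 *
 *	dmz_handle_read_zero(dmz, bio, chunk_block, 1);
 */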
/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
			   struct bio *bio)
{
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	struct dmz_metadata *zmd = dmz->metadata;
	struct dm_zone *zone;
	int ret;

	dmz_lock_metadata(zmd);

	/*
	 * Get the data zone mapping the chunk. There may be no
	 * mapping for read and discard. If a mapping is obtained,
	 * the zone returned will be set to active state.
	 */
	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
				     bio_op(bio));
	if (IS_ERR(zone)) {
		ret = PTR_ERR(zone);
		goto out;
	}

	/* Process the BIO */
	if (zone) {
		dmz_activate_zone(zone);
		bioctx->zone = zone;
		dmz_reclaim_bio_acc(zone->dev->reclaim);
	}

	switch (bio_op(bio)) {
	case REQ_OP_READ:
		ret = dmz_handle_read(dmz, zone, bio);
		break;
	case REQ_OP_WRITE:
		ret = dmz_handle_write(dmz, zone, bio);
		break;
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		ret = dmz_handle_discard(dmz, zone, bio);
		break;
	default:
		DMERR("(%s): Unsupported BIO operation 0x%x",
		      dmz_metadata_label(dmz->metadata), bio_op(bio));
		ret = -EIO;
	}

	/*
	 * Release the chunk mapping. This will check that the mapping
	 * is still valid, that is, that the zone used still has valid blocks.
	 */
	if (zone)
		dmz_put_chunk_mapping(zmd, zone);
out:
	dmz_bio_endio(bio, errno_to_blk_status(ret));

	dmz_unlock_metadata(zmd);
}
/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
	if (refcount_dec_and_test(&cw->refcount)) {
		WARN_ON(!bio_list_empty(&cw->bio_list));
		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
		kfree(cw);
	}
}
/*
 * Chunk BIO work function.
 */
static void dmz_chunk_work(struct work_struct *work)
{
	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
	struct dmz_target *dmz = cw->target;
	struct bio *bio;

	mutex_lock(&dmz->chunk_lock);

	/* Process the chunk BIOs */
	while ((bio = bio_list_pop(&cw->bio_list))) {
		mutex_unlock(&dmz->chunk_lock);
		dmz_handle_bio(dmz, cw, bio);
		mutex_lock(&dmz->chunk_lock);
		dmz_put_chunk_work(cw);
	}

	/* Queueing the work incremented the work refcount */
	dmz_put_chunk_work(cw);

	mutex_unlock(&dmz->chunk_lock);
}
/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
	unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
	struct dm_chunk_work *cw;
	int ret = 0;

	mutex_lock(&dmz->chunk_lock);

	/* Get the BIO chunk work. If one is not active yet, create one */
	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
	if (cw) {
		dmz_get_chunk_work(cw);
	} else {
		/* Create a new chunk work */
		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
		if (unlikely(!cw)) {
			ret = -ENOMEM;
			goto out;
		}

		INIT_WORK(&cw->work, dmz_chunk_work);
		refcount_set(&cw->refcount, 1);
		cw->target = dmz;
		cw->chunk = chunk;
		bio_list_init(&cw->bio_list);

		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
		if (unlikely(ret)) {
			kfree(cw);
			goto out;
		}
	}

	bio_list_add(&cw->bio_list, bio);

	if (queue_work(dmz->chunk_wq, &cw->work))
		dmz_get_chunk_work(cw);
out:
	mutex_unlock(&dmz->chunk_lock);
	return ret;
}
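
/*
 * Editorial illustration (not part of the original driver): cw->refcount
 * holds one reference per queued-but-unprocessed BIO plus one while the
 * work item itself is pending. dmz_chunk_work() drops one reference for
 * every BIO it pops off cw->bio_list and a final one for the queueing
 * reference, so the work is freed (and removed from the radix tree) only
 * after its last BIO has been handled:
 *
 *	bio_list_add(&cw->bio_list, bio);
 *	if (queue_work(dmz->chunk_wq, &cw->work))
 *		dmz_get_chunk_work(cw);		// queueing reference
 */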
/*
 * Check if the backing device is being removed. If it's on the way out,
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
	if (dmz_dev->flags & DMZ_BDEV_DYING)
		return true;

	if (dmz_dev->flags & DMZ_CHECK_BDEV)
		return !dmz_check_bdev(dmz_dev);
/*
 * Check the backing device availability. This detects such events as
 * backing device going offline due to errors, media removals, etc.
 * This check is less efficient than dmz_bdev_is_dying() and should
 * only be performed as a part of error handling.
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
	struct gendisk *disk;

	dmz_dev->flags &= ~DMZ_CHECK_BDEV;
	if (dmz_bdev_is_dying(dmz_dev))
		return false;

	disk = dmz_dev->bdev->bd_disk;
	if (disk->fops->check_events &&
	    disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
		dmz_dev_warn(dmz_dev, "Backing device offline");
		dmz_dev->flags |= DMZ_BDEV_DYING;
	}

	return !(dmz_dev->flags & DMZ_BDEV_DYING);
}
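
/*
 * Editorial note (illustration, not part of the original driver): the two
 * checks above work as a pair. I/O completion errors only set the cheap
 * DMZ_CHECK_BDEV hint in dmz_bio_endio(); the next dmz_bdev_is_dying()
 * caller then pays for the full dmz_check_bdev() probe. A hypothetical
 * caller in the reclaim or metadata code would simply do:
 *
 *	if (dmz_bdev_is_dying(dev))
 *		return -EIO;
 */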
/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_metadata *zmd = dmz->metadata;
	struct dmz_bioctx *bioctx =
		dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
	sector_t sector = bio->bi_iter.bi_sector;
	unsigned int nr_sectors = bio_sectors(bio);
	sector_t chunk_sector;
	int ret;

	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
		return DM_MAPIO_REMAPPED;

	/* The BIO should be block aligned */
	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) ||
	    (sector & DMZ_BLOCK_SECTORS_MASK))
		return DM_MAPIO_KILL;

	/* Initialize the BIO context */
	bioctx->dev = NULL;
	bioctx->zone = NULL;
	bioctx->bio = bio;
	refcount_set(&bioctx->ref, 1);

	/* Set the BIO pending in the flush list */
	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
		spin_lock(&dmz->flush_lock);
		bio_list_add(&dmz->flush_list, bio);
		spin_unlock(&dmz->flush_lock);
		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
		return DM_MAPIO_SUBMITTED;
	}

	/* Split zone BIOs to fit entirely into a zone */
	chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
	if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
		dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

	/* Now ready to handle this BIO */
	ret = dmz_queue_chunk_work(dmz, bio);
	if (ret) {
		DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
			dmz_metadata_label(zmd),
			bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
			ret);
		return DM_MAPIO_REQUEUE;
	}

	return DM_MAPIO_SUBMITTED;
}
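
/*
 * Editorial example (not part of the original driver): dm_accept_partial_bio()
 * is how BIOs spanning a chunk boundary are handled. Assuming 256 MiB zones
 * (524288 sectors), a 1 MiB BIO starting 4 KiB (8 sectors) before the end of
 * a chunk is trimmed here to the 8 sectors that still fit:
 *
 *	chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);	// 524280
 *	dm_accept_partial_bio(bio, 524288 - 524280);		// keep 8 sectors
 *
 * The device-mapper core then resubmits the remainder as a new BIO, which
 * takes the normal dmz_map() path for the next chunk.
 */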
/*
 * Get zoned device information.
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
				int idx, int nr_devs)
{
	struct dmz_target *dmz = ti->private;
	struct dm_dev *ddev;
	struct dmz_dev *dev;
	int ret;
	struct block_device *bdev;

	/* Get the target device */
	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
	if (ret) {
		ti->error = "Get target device failed";
		return ret;
	}

	bdev = ddev->bdev;
	if (!bdev_is_zoned(bdev)) {
		if (nr_devs == 1) {
			ti->error = "Invalid regular device";
			goto err;
		}
		if (idx != 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		if (dmz->ddev[0]) {
			ti->error = "Too many regular devices";
			goto err;
		}
		dev = &dmz->dev[idx];
		dev->flags = DMZ_BDEV_REGULAR;
	} else {
		if (dmz->ddev[idx]) {
			ti->error = "Too many zoned devices";
			goto err;
		}
		if (nr_devs > 1 && idx == 0) {
			ti->error = "First device must be a regular device";
			goto err;
		}
		dev = &dmz->dev[idx];
	}

	dev->bdev = bdev;
	dev->dev_idx = idx;

	dev->capacity = bdev_nr_sectors(bdev);
	if (ti->begin) {
		ti->error = "Partial mapping is not supported";
		goto err;
	}

	dmz->ddev[idx] = ddev;

	return 0;
err:
	dm_put_device(ti, ddev);
	return -EINVAL;
}
	/*
	 * When we have more than one device, the first one must be a
	 * regular block device and the others zoned block devices.
	 */
	if (dmz->nr_ddevs > 1) {
		reg_dev = &dmz->dev[0];
		if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
			ti->error = "Primary disk is not a regular device";
			return -EINVAL;
		}
		for (i = 1; i < dmz->nr_ddevs; i++) {
			struct dmz_dev *zoned_dev = &dmz->dev[i];
			struct block_device *bdev = zoned_dev->bdev;

			if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
				ti->error = "Secondary disk is not a zoned device";
				return -EINVAL;
			}
			if (zone_nr_sectors &&
			    zone_nr_sectors != bdev_zone_sectors(bdev)) {
				ti->error = "Zone nr sectors mismatch";
				return -EINVAL;
			}
			zone_nr_sectors = bdev_zone_sectors(bdev);
			zoned_dev->zone_nr_sectors = zone_nr_sectors;
			zoned_dev->nr_zones = bdev_nr_zones(bdev);
		}
	} else {
		struct dmz_dev *zoned_dev = &dmz->dev[0];
		struct block_device *bdev = zoned_dev->bdev;

		if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
			ti->error = "Disk is not a zoned device";
			return -EINVAL;
		}
		zoned_dev->zone_nr_sectors = bdev_zone_sectors(bdev);
		zoned_dev->nr_zones = bdev_nr_zones(bdev);
	}
	/* Allocate and initialize the target descriptor */
	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
	if (!dmz) {
		ti->error = "Unable to allocate the zoned target descriptor";
		return -ENOMEM;
	}
	dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
	if (!dmz->dev) {
		ti->error = "Unable to allocate the zoned device descriptors";
		kfree(dmz);
		return -ENOMEM;
	}
	dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
	if (!dmz->ddev) {
		ti->error = "Unable to allocate the dm device descriptors";
		ret = -ENOMEM;
		goto err;
	}
	dmz->nr_ddevs = argc;

	ti->private = dmz;

	/* Get the target zoned block device */
	for (i = 0; i < argc; i++) {
		ret = dmz_get_zoned_device(ti, argv[i], i, argc);
		if (ret)
			goto err_dev;
	}
	ret = dmz_fixup_devices(ti);
	if (ret)
		goto err_dev;

	/* Set target (no write same support) */
	ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_zeroes_bios = 1;
	ti->per_io_data_size = sizeof(struct dmz_bioctx);
	ti->flush_supported = true;
	ti->discards_supported = true;

	/* The exposed capacity is the number of chunks that can be mapped */
	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
		dmz_zone_nr_sectors_shift(dmz->metadata);

	/* Zone BIO */
	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
	if (ret) {
		ti->error = "Create BIO set failed";
		goto err_meta;
	}

	/* Chunk BIO work */
	mutex_init(&dmz->chunk_lock);
	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
					WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
					dmz_metadata_label(dmz->metadata));
	if (!dmz->chunk_wq) {
		ti->error = "Create chunk workqueue failed";
		ret = -ENOMEM;
		goto err_bio;
	}
/*
 * Stop works on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	flush_workqueue(dmz->chunk_wq);
	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_suspend_reclaim(dmz->dev[i].reclaim);
	cancel_delayed_work_sync(&dmz->flush_work);
}
/*
 * Restart works on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;
	int i;

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
	for (i = 0; i < dmz->nr_ddevs; i++)
		dmz_resume_reclaim(dmz->dev[i].reclaim);
}
static int dmz_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct dmz_target *dmz = ti->private;
	unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
	sector_t capacity;
	int i, r = 0;

	for (i = 0; i < dmz->nr_ddevs; i++) {
		capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
		r = fn(ti, dmz->ddev[i], 0, capacity, data);
		if (r)
			break;
	}
	return r;
}
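
/*
 * Editorial note (illustration, not part of the original driver): the
 * capacity reported for each device is rounded down to a whole number of
 * zones, so a trailing partial zone is never exposed. For example, with
 * 524288-sector zones:
 *
 *	capacity = 1572096 & ~(524288 - 1);	// -> 1048576, i.e. 2 full zones
 */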