// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL.
*/
/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
/*
 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
 * dm_io into one list, and reuse bio->bi_private as the list head. Before
 * ending this fs bio, we will recover its ->bi_private.
 */
#define REQ_DM_POLL_LIST REQ_DRV
/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;		/* live table the bio is mapped against */
	struct bio *bio;		/* original bio being split */
	struct dm_io *io;		/* per-submission bookkeeping */
	sector_t sector;		/* next sector to map */
	unsigned int sector_count;	/* sectors remaining to map */
	bool is_abnormal_io:1;		/* discard/secure-erase/write-zeroes/... */
	bool submit_as_polled:1;	/* original bio carried REQ_POLLED */
};
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS	16
static unsigned int reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
staticint __dm_get_module_param_int(int *module_param, int min, int max)
{ int param = READ_ONCE(*module_param); int modified_param = 0; bool modified = true;
#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." " Duplicate IMA measurements will not be recorded in the IMA log."); #endif
for (i = 0; i < count; i++) {
r = _inits[i](); if (r) goto bad;
}
return 0;
bad: while (i--)
_exits[i]();
return r;
}
static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	/* Tear down in reverse order of the _inits[] calls made by dm_init(). */
	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}
md = disk->private_data; if (WARN_ON(!md)) goto out;
if (atomic_dec_and_test(&md->open_count) &&
(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
queue_work(deferred_remove_workqueue, &deferred_remove_work);
dm_put(md);
out:
spin_unlock(&_minor_lock);
}
/*
 * Report how many openers currently hold this mapped device.
 */
int dm_open_count(struct mapped_device *md)
{
	int count = atomic_read(&md->open_count);

	return count;
}
/*
 * Guarantees nothing is using the device before it's deleted.
 *
 * @md: the device to lock for deletion
 * @mark_deferred: if busy, flag the device for deferred removal
 * @only_deferred: only succeed if deferred removal was already requested
 *
 * Returns 0 on success, -EBUSY if the device is still open, -EEXIST if
 * only_deferred was requested but no deferred removal is pending.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}
int dm_cancel_deferred_remove(struct mapped_device *md)
{ int r = 0;
spin_lock(&_minor_lock);
if (test_bit(DMF_DELETING, &md->flags))
r = -EBUSY; else
clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); if (!forward || r < 0) goto out;
if (r > 0) { /* * Target determined this ioctl is being issued against a * subset of the parent bdev; require extra privileges.
*/ if (!capable(CAP_SYS_RAWIO)) {
DMDEBUG_LIMIT( "%s: sending ioctl %x to DM device without required privilege.",
current->comm, cmd);
r = -ENOIOCTLCMD; goto out;
}
}
if (!bdev->bd_disk->fops->ioctl)
r = -ENOTTY; else
r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
out:
dm_unprepare_ioctl(md, srcu_idx); return r;
}
u64 dm_start_time_ns_from_clone(struct bio *bio)
{ return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
/* Number of sectors to account for @io when accounting starts/ends. */
static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
{
	/*
	 * If REQ_PREFLUSH set, don't account payload, it will be
	 * submitted (and accounted) after this flush completes.
	 */
	if (bio_is_flush_with_data(bio))
		return 0;
	/* A split dm_io accounts only the part it mapped. */
	if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
		return io->sectors;
	return bio_sectors(bio);
}
staticvoid dm_start_io_acct(struct dm_io *io, struct bio *clone)
{ /* * Ensure IO accounting is only ever started once.
*/ if (dm_io_flagged(io, DM_IO_ACCOUNTED)) return;
/* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
dm_io_set_flag(io, DM_IO_ACCOUNTED);
} else { unsignedlong flags; /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
spin_lock_irqsave(&io->lock, flags); if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
spin_unlock_irqrestore(&io->lock, flags); return;
}
dm_io_set_flag(io, DM_IO_ACCOUNTED);
spin_unlock_irqrestore(&io->lock, flags);
}
/* one ref is for submission, the other is for completion */
atomic_set(&io->io_count, 2);
this_cpu_inc(*md->pending_io);
io->orig_bio = bio;
io->md = md;
spin_lock_init(&io->lock);
io->start_time = jiffies;
io->flags = 0; if (blk_queue_io_stat(md->queue))
dm_io_set_flag(io, DM_IO_BLK_STAT);
if (static_branch_unlikely(&stats_enabled) &&
unlikely(dm_stats_used(&md->stats)))
dm_stats_record_start(&md->stats, &io->stats_aux);
/* Set default bdev, but target must bio_set_dev() before issuing IO */
clone->bi_bdev = md->disk->part0; if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev))
bio_set_dev(clone, md->disk->part0);
if (len) {
clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(clone))
bio_integrity_trim(clone);
}
return clone;
}
static void free_tio(struct bio *clone)
{
	/* A tio embedded in its dm_io is freed along with the dm_io, not here. */
	if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
		return;
	bio_put(clone);
}
/* * Add the bio to the list of deferred io.
*/ staticvoid queue_io(struct mapped_device *md, struct bio *bio)
{ unsignedlong flags;
/* * Everyone (including functions in this file), should use this * function to access the md->map field, and make sure they call * dm_put_live_table() when finished.
*/ struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
*srcu_idx = srcu_read_lock(&md->io_barrier);
/* * A fast alternative to dm_get_live_table/dm_put_live_table. * The caller must not block between these two functions.
*/ staticstruct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
rcu_read_lock(); return rcu_dereference(md->map);
}
/* Holder token passed when DM claims underlying block devices. */
static char *_dm_claim_ptr = "I belong to device-mapper";
/* * Open a table device so we can use it as a map destination.
*/ staticstruct table_device *open_table_device(struct mapped_device *md,
dev_t dev, blk_mode_t mode)
{ struct table_device *td; struct file *bdev_file; struct block_device *bdev;
u64 part_off; int r;
bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL); if (IS_ERR(bdev_file)) {
r = PTR_ERR(bdev_file); goto out_free_td;
}
bdev = file_bdev(bdev_file);
/* * We can be called before the dm disk is added. In that case we can't * register the holder relation here. It will be done once add_disk was * called.
*/ if (md->disk->slave_dir) {
r = bd_link_disk_holder(bdev, md->disk); if (r) goto out_blkdev_put;
}
/* * Close a table device that we've been using.
*/ staticvoid close_table_device(struct table_device *td, struct mapped_device *md)
{ if (md->disk->slave_dir)
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
/* Leverage async fput() if DMF_DEFERRED_REMOVE set */ if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
fput(td->dm_dev.bdev_file); else
__fput_sync(td->dm_dev.bdev_file);
mutex_lock(&md->table_devices_lock); if (refcount_dec_and_test(&td->count))
close_table_device(td, md);
mutex_unlock(&md->table_devices_lock);
}
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	/* Copy out the cached geometry; this cannot fail. */
	*geo = md->geometry;

	return 0;
}
/* * Set the geometry of a device.
*/ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
if (geo->start > sz) {
DMERR("Start sector is beyond the geometry limits."); return -EINVAL;
}
/* * Return true if the dm_io's original bio is requeued. * io->status is updated with error if requeue disallowed.
*/ staticbool dm_handle_requeue(struct dm_io *io, bool first_stage)
{ struct bio *bio = io->orig_bio; bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE); bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
(bio->bi_opf & REQ_POLLED)); struct mapped_device *md = io->md; bool requeued = false;
if (handle_requeue || handle_polled_eagain) { unsignedlong flags;
if (bio->bi_opf & REQ_POLLED) { /* * Upper layer won't help us poll split bio * (io->orig_bio may only reflect a subset of the * pre-split original) so clear REQ_POLLED.
*/
bio_clear_polled(bio);
}
/* * Target requested pushing back the I/O or * polled IO hit BLK_STS_AGAIN.
*/
spin_lock_irqsave(&md->deferred_lock, flags); if ((__noflush_suspending(md) &&
!WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
handle_polled_eagain || first_stage) {
dm_requeue_add_io(io, first_stage);
requeued = true;
} else { /* * noflush suspend was interrupted or this is * a write to a zoned target.
*/
io->status = BLK_STS_IOERR;
}
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
requeued = dm_handle_requeue(io, first_stage); if (requeued && first_stage) return;
io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED))
dm_end_io_acct(io); elseif (!io_error) { /* * Must handle target that DM_MAPIO_SUBMITTED only to * then bio_endio() rather than dm_submit_bio_remap()
*/
__dm_start_io_acct(io);
dm_end_io_acct(io);
}
free_io(io);
smp_wmb();
this_cpu_dec(*md->pending_io);
/* nudge anyone waiting on suspend queue */ if (unlikely(wq_has_sleeper(&md->wait)))
wake_up(&md->wait);
/* Return early if the original bio was requeued */ if (requeued) return;
if (bio_is_flush_with_data(bio)) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH.
*/
bio->bi_opf &= ~REQ_PREFLUSH;
queue_io(md, bio);
} else { /* done with normal IO or empty flush */ if (io_error)
bio->bi_status = io_error;
bio_endio(bio);
}
}
/*
 * Two staged requeue:
 *
 * 1) io->orig_bio points to the real original bio, and the part mapped to
 *    this io must be requeued, instead of other parts of the original bio.
 *
 * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
 */
static inline void dm_io_complete(struct dm_io *io)
{
	/*
	 * Only dm_io that has been split needs two stage requeue, otherwise
	 * we may run into long bio clone chain during suspend and OOM could
	 * be triggered.
	 *
	 * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they
	 * also aren't handled via the first stage requeue.
	 */
	__dm_io_complete(io, dm_io_flagged(io, DM_IO_WAS_SPLIT));
}
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
static inline void __dm_io_dec_pending(struct dm_io *io)
{
	if (atomic_dec_and_test(&io->io_count))
		dm_io_complete(io);
}
/* * The queue_limits are only valid as long as you have a reference * count on 'md'. But _not_ imposing verification to avoid atomic_read(),
*/ staticinlinestruct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{ return &md->queue->limits;
}
if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD &&
!bdev_max_discard_sectors(bio->bi_bdev))
blk_queue_disable_discard(md->queue); elseif (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bdev_write_zeroes_sectors(bio->bi_bdev))
blk_queue_disable_write_zeroes(md->queue);
}
if (static_branch_unlikely(&zoned_enabled) &&
unlikely(bdev_is_zoned(bio->bi_bdev)))
dm_zone_endio(io, bio);
if (endio) { int r = endio(ti, bio, &error);
switch (r) { case DM_ENDIO_REQUEUE: if (static_branch_unlikely(&zoned_enabled)) { /* * Requeuing writes to a sequential zone of a zoned * target will break the sequential write pattern: * fail such IO.
*/ if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
error = BLK_STS_IOERR; else
error = BLK_STS_DM_REQUEUE;
} else
error = BLK_STS_DM_REQUEUE;
fallthrough; case DM_ENDIO_DONE: break; case DM_ENDIO_INCOMPLETE: /* The target will handle the io */ return; default:
DMCRIT("unimplemented target endio return value: %d", r);
BUG();
}
}
if (static_branch_unlikely(&swap_bios_enabled) &&
likely(ti != NULL) && unlikely(swap_bios_limit(ti, bio)))
up(&md->swap_bios_semaphore);
free_tio(bio);
dm_io_dec_pending(io, error);
}
/* * Return maximum size of I/O possible at the supplied sector up to the current * target boundary.
*/ staticinline sector_t max_io_len_target_boundary(struct dm_target *ti,
sector_t target_offset)
{ return ti->len - target_offset;
}
/* * Does the target need to split IO even further? * - varied (per target) IO splitting is a tenet of DM; this * explains why stacked chunk_sectors based splitting via * bio_split_to_limits() isn't possible here.
*/ if (!max_granularity) return len; return min_t(sector_t, len,
min(max_sectors ? : queue_max_sectors(ti->table->md->queue),
blk_boundary_sectors_left(target_offset, max_granularity)));
}
map = dm_get_live_table(md, srcu_idx); if (!map) return NULL;
ti = dm_table_find_target(map, sector); if (!ti) return NULL;
return ti;
}
/*
 * DAX entry point: translate @pgoff into the target's direct-access
 * mapping, clamped to the target boundary. Returns the number of pages
 * accessible (via the target's ->direct_access) or -EIO.
 */
static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
		long nr_pages, enum dax_access_mode mode, void **kaddr,
		unsigned long *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	/* Clamp the request to what fits inside this target. */
	len = max_io_len(ti, sector) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);

 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}
staticint dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages)
{ struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS; struct dm_target *ti; int ret = -EIO; int srcu_idx;
ti = dm_dax_get_live_target(md, sector, &srcu_idx);
if (!ti) goto out; if (WARN_ON(!ti->type->dax_zero_page_range)) { /* * ->zero_page_range() is mandatory dax operation. If we are * here, something is wrong.
*/ goto out;
}
ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
out:
dm_put_live_table(md, srcu_idx);
/* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management * operations, zone append writes (native with REQ_OP_ZONE_APPEND or emulated * with write BIOs flagged with BIO_EMULATES_ZONE_APPEND) and any bio serviced * by __send_duplicate_bios(). * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be * sent in a next bio. * * A diagram that explains the arithmetics: * +--------------------+---------------+-------+ * | 1 | 2 | 3 | * +--------------------+---------------+-------+ * * <-------------- *tio->len_ptr ---------------> * <----- bio_sectors -----> * <-- n_sectors --> * * Region 1 was already iterated over with bio_advance or similar function. * (it may be empty if the target doesn't use bio_advance) * Region 2 is the remaining bio size that the target wants to process. * (it may be empty if region 1 is non-empty, although there is no reason * to make it empty) * The target requires that region 3 is to be sent in the next bio. * * If the target wants to receive multiple copies of the bio (via num_*bios, etc), * the partially processed part (the sum of regions 1+2) must be the same for all * copies of the bio.
*/ void dm_accept_partial_bio(struct bio *bio, unsignedint n_sectors)
{ struct dm_target_io *tio = clone_to_tio(bio); struct dm_io *io = tio->io; unsignedint bio_sectors = bio_sectors(bio);
/* * __split_and_process_bio() may have already saved mapped part * for accounting but it is being reduced so update accordingly.
*/
dm_io_set_flag(io, DM_IO_WAS_SPLIT);
io->sectors = n_sectors;
io->sector_offset = bio_sectors(io->orig_bio);
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
/* * @clone: clone bio that DM core passed to target's .map function * @tgt_clone: clone of @clone bio that target needs submitted * * Targets should use this interface to submit bios they take * ownership of when returning DM_MAPIO_SUBMITTED. * * Target should also enable ti->accounts_remapped_io
*/ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
{ struct dm_target_io *tio = clone_to_tio(clone); struct dm_io *io = tio->io;
/* establish bio that will get submitted */ if (!tgt_clone)
tgt_clone = clone;
/* * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED.
*/
dm_start_io_acct(io, clone);
if (WARN_ON_ONCE(num_bios == 0)) /* num_bios = 0 is a bug in caller */ return 0;
/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ if (len)
setup_split_accounting(ci, *len);
/* * Using alloc_multiple_bios(), even if num_bios is 1, to consistently * support allocating using GFP_NOWAIT with GFP_NOIO fallback.
*/
alloc_multiple_bios(&blist, ci, ti, num_bios, len); while ((clone = bio_list_pop(&blist))) { if (num_bios > 1)
dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
__map_bio(clone);
ret += 1;
}
/* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s).
*/
bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf);
if (!t->flush_bypasses_map) { for (unsignedint i = 0; i < t->num_targets; i++) { unsignedint bios; struct dm_target *ti = dm_table_get_target(t, i);
if (unlikely(ti->num_flush_bios == 0)) continue;
atomic_add(ti->num_flush_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
NULL);
atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
}
} else { /* * Note that there's no need to grab t->devices_lock here * because the targets that support flush optimization don't * modify the list of devices.
*/ struct list_head *devices = dm_table_get_devices(t); unsignedint len = 0; struct dm_dev_internal *dd;
list_for_each_entry(dd, devices, list) { struct bio *clone; /* * Note that the structure dm_target_io is not * associated with any target (because the device may be * used by multiple targets), so we set tio->ti = NULL. * We must check for NULL in the I/O processing path, to * avoid NULL pointer dereference.
*/
clone = alloc_tio(ci, NULL, 0, &len, GFP_NOIO);
atomic_add(1, &ci->io->io_count);
bio_set_dev(clone, dd->dm_dev->bdev);
clone->bi_end_io = clone_endio;
dm_submit_bio_remap(clone, NULL);
}
}
/* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following subtraction
*/
atomic_sub(1, &ci->io->io_count);
len = min_t(sector_t, ci->sector_count,
__max_io_len(ti, ci->sector, max_granularity, max_sectors));
atomic_add(num_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, num_bios, &len); /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following (+1) subtraction
*/
atomic_sub(num_bios - bios + 1, &ci->io->io_count);
ci->sector += len;
ci->sector_count -= len;
}
staticbool is_abnormal_io(struct bio *bio)
{ switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: case REQ_OP_FLUSH: returnfalse; case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET_ALL: returntrue; default: returnfalse;
}
}
switch (bio_op(ci->bio)) { case REQ_OP_DISCARD:
num_bios = ti->num_discard_bios;
max_sectors = limits->max_discard_sectors; if (ti->max_discard_granularity)
max_granularity = max_sectors; break; case REQ_OP_SECURE_ERASE:
num_bios = ti->num_secure_erase_bios;
max_sectors = limits->max_secure_erase_sectors; break; case REQ_OP_WRITE_ZEROES:
num_bios = ti->num_write_zeroes_bios;
max_sectors = limits->max_write_zeroes_sectors; break; default: break;
}
/* * Even though the device advertised support for this type of * request, that does not mean every target supports it, and * reconfiguration might also have changed that since the * check was performed.
*/ if (unlikely(!num_bios)) return BLK_STS_NOTSUPP;
/* * Reuse ->bi_private as dm_io list head for storing all dm_io instances * associated with this bio, and this bio's bi_private needs to be * stored in dm_io->data before the reuse. * * bio->bi_private is owned by fs or upper layer, so block layer won't * touch it after splitting. Meantime it won't be changed by anyone after * bio is submitted. So this reuse is safe.
*/ staticinlinestruct dm_io **dm_poll_list_head(struct bio *bio)
{ return (struct dm_io **)&bio->bi_private;
}
if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
bio->bi_opf |= REQ_DM_POLL_LIST; /* * Save .bi_private into dm_io, so that we can reuse * .bi_private as dm_io list head for storing dm_io list
*/
io->data = bio->bi_private;
/* tell block layer to poll for completion */
bio->bi_cookie = ~BLK_QC_T_NONE;
io->next = NULL;
} else { /* * bio recursed due to split, reuse original poll list, * and save bio->bi_private too.
*/
io->data = (*head)->data;
io->next = *head;
}
*head = io;
}
/* * Select the correct strategy for processing a non-flush bio.
*/ static blk_status_t __split_and_process_bio(struct clone_info *ci)
{ struct bio *clone; struct dm_target *ti; unsignedint len;
ti = dm_table_find_target(ci->map, ci->sector); if (unlikely(!ti)) return BLK_STS_IOERR;
if (unlikely(ci->is_abnormal_io)) return __process_abnormal_io(ci, ti);
/* * Only support bio polling for normal IO, and the target io is * exactly inside the dm_io instance (verified in dm_poll_dm_io)
*/
ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count) return BLK_STS_IOERR;
setup_split_accounting(ci, len);
if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) { if (unlikely(!dm_target_supports_nowait(ti->type))) return BLK_STS_NOTSUPP;
/* Shouldn't happen but sector_count was being set to 0 so... */ if (static_branch_unlikely(&zoned_enabled) &&
WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
ci->sector_count = 0;
}
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool dm_zone_bio_needs_split(struct bio *bio)
{
	/*
	 * Special case the zone operations that cannot or should not be split.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
		return false;
	default:
		break;
	}

	/*
	 * When mapped devices use the block layer zone write plugging, we must
	 * split any large BIO to the mapped device limits to not submit BIOs
	 * that span zone boundaries and to avoid potential deadlocks with
	 * queue freeze operations.
	 */
	return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio);
}
staticinlinebool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
{ if (!bio_needs_zone_write_plugging(bio)) returnfalse; return blk_zone_plug_bio(bio, 0);
}
ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin,
nr_zones, need_reset); if (ret) {
sts = BLK_STS_IOERR; goto free_bitmap;
}
/* If we have no zone to reset, we are done. */
nr_reset = bitmap_weight(need_reset, nr_zones); if (!nr_reset) goto free_bitmap;
atomic_add(nr_zones, &ci->io->io_count);
for (i = 0; i < nr_zones; i++) {
if (!test_bit(i, need_reset)) {
sector += zone_sectors; continue;
}
if (bio_list_empty(&blist)) { /* This may take a while, so be nice to others */ if (num_bios)
cond_resched();
/* * We may need to reset thousands of zones, so let's * not go crazy with the clone allocation.
*/
alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32),
NULL);
}
/* Get a clone and change it to a regular reset operation. */
clone = bio_list_pop(&blist);
clone->bi_opf &= ~REQ_OP_MASK;
clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC;
clone->bi_iter.bi_sector = sector;
clone->bi_iter.bi_size = 0;
__map_bio(clone);
/* * Entry point to split a bio into clones and submit them to the targets.
*/ staticvoid dm_split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio)
{ struct clone_info ci; struct dm_io *io;
blk_status_t error = BLK_STS_OK; bool is_abnormal, need_split;
if (unlikely(need_split)) { /* * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. * Also split the BIO for mapped devices needing zone append * emulation to ensure that the BIO does not cross zone * boundaries.
*/
bio = bio_split_to_limits(bio); if (!bio) return;
}
/* * Use the block layer zone write plugging for mapped devices that * need zone append emulation (e.g. dm-crypt).
*/ if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio)) return;
/* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { /* * Don't support NOWAIT for FLUSH because it may allocate * multiple bios and there's no easy way how to undo the * allocations.
*/ if (bio->bi_opf & REQ_PREFLUSH) {
bio_wouldblock_error(bio); return;
}
io = alloc_io(md, bio, GFP_NOWAIT); if (unlikely(!io)) { /* Unable to do anything without dm_io. */
bio_wouldblock_error(bio); return;
}
} else {
io = alloc_io(md, bio, GFP_NOIO);
}
init_clone_info(&ci, io, map, bio, is_abnormal);
if (bio->bi_opf & REQ_PREFLUSH) {
__send_empty_flush(&ci); /* dm_io_complete submits any data associated with flush */ goto out;
}
error = __split_and_process_bio(&ci); if (error || !ci.sector_count) goto out; /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed.
*/
bio_trim(bio, io->sectors, ci.sector_count);
trace_block_split(bio, bio->bi_iter.bi_sector);
bio_inc_remaining(bio);
submit_bio_noacct(bio);
out: /* * Drop the extra reference count for non-POLLED bio, and hold one * reference for POLLED bio, which will be released in dm_poll_bio * * Add every dm_io instance into the dm_io list head which is stored * in bio->bi_private, so that dm_poll_bio can poll them all.
*/ if (error || !ci.submit_as_polled) { /* * In case of submission failure, the extra reference for * submitting io isn't consumed yet
*/ if (error)
atomic_dec(&io->io_count);
dm_io_dec_pending(io, error);
} else
dm_queue_poll_io(bio, io);
}
staticvoid dm_submit_bio(struct bio *bio)
{ struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; int srcu_idx; struct dm_table *map;
/* If suspended, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio); elseif (bio->bi_opf & REQ_RAHEAD)
bio_io_error(bio); else
queue_io(md, bio); goto out;
}
/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ if (!(bio->bi_opf & REQ_DM_POLL_LIST)) return 0;
WARN_ON_ONCE(!list);
/* * Restore .bi_private before possibly completing dm_io. * * bio_poll() is only possible once @bio has been completely * submitted via submit_bio_noacct()'s depth-first submission. * So there is no dm_queue_poll_io() race associated with * clearing REQ_DM_POLL_LIST here.
*/
bio->bi_opf &= ~REQ_DM_POLL_LIST;
bio->bi_private = list->data;
for (curr = list, next = curr->next; curr; curr = next, next =
curr ? curr->next : NULL) { if (dm_poll_dm_io(curr, iob, flags)) { /* * clone_endio() has already occurred, so no * error handling is needed here.
*/
__dm_io_dec_pending(curr);
} else {
curr->next = tmp;
tmp = curr;
}
}
/* Not done? */ if (tmp) {
bio->bi_opf |= REQ_DM_POLL_LIST; /* Reset bio->bi_private to dm_io list head */
*head = tmp; return 0;
} return 1;
}
/*
 *---------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------
 */
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}
/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	/* -ENOSPC here means the minor is already taken, report -EBUSY. */
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}
staticint next_free_minor(int *minor)
{ int r;
idr_preload(GFP_KERNEL);
spin_lock(&_minor_lock);
r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
/* * Allocate and initialise a blank device with a given minor.
*/ staticstruct mapped_device *alloc_dev(int minor)
{ int r, numa_node_id = dm_get_numa_node(); struct dax_device *dax_dev; struct mapped_device *md; void *old_md;
md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) {
DMERR("unable to allocate device, out of memory."); return NULL;
}
if (!try_module_get(THIS_MODULE)) goto bad_module_get;
/* get a minor number for the dev */ if (minor == DM_ANY_MINOR)
r = next_free_minor(&minor); else
r = specific_minor(minor); if (r < 0) goto bad_minor;
r = init_srcu_struct(&md->io_barrier); if (r < 0) goto bad_io_barrier;
ret = dm_table_set_restrictions(t, md->queue, limits); if (ret) {
set_capacity(md->disk, old_size);
old_map = ERR_PTR(ret); goto out;
}
/* * Wipe any geometry if the size of the table changed.
*/ if (size != old_size)
memset(&md->geometry, 0, sizeof(md->geometry));
dm_table_event_callback(t, event_callback, md);
if (dm_table_request_based(t)) { /* * Leverage the fact that request-based DM targets are * immutable singletons - used to optimize dm_mq_queue_rq.
*/
md->immutable_target = dm_table_get_immutable_target(t);
/* * There is no need to reload with request-based dm because the * size of front_pad doesn't change. * * Note for future: If you are to reload bioset, prep-ed * requests in the queue may refer to bio from the old bioset, * so you must walk through the queue to unprep.
*/ if (!md->mempools)
md->mempools = t->mempools; else
dm_free_md_mempools(t->mempools);
} else { /* * The md may already have mempools that need changing. * If so, reload bioset because front_pad may have changed * because a different table was loaded.
*/
dm_free_md_mempools(md->mempools);
md->mempools = t->mempools;
}
t->mempools = NULL;
/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md = alloc_dev(minor);

	if (!md)
		return -ENXIO;

	/* Seed IMA-related state for the freshly allocated device. */
	dm_ima_reset_data(md);

	*result = md;
	return 0;
}
/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}
/* * Setup the DM device's queue based on md's type
*/ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{ enum dm_queue_mode type = dm_table_get_type(t); struct queue_limits limits; struct table_device *td; int r;
WARN_ON_ONCE(type == DM_TYPE_NONE);
if (type == DM_TYPE_REQUEST_BASED) {
md->disk->fops = &dm_rq_blk_dops;
r = dm_mq_init_request_queue(md, t); if (r) {
DMERR("Cannot initialize queue for request-based dm mapped device"); return r;
}
}
r = dm_calculate_queue_limits(t, &limits); if (r) {
DMERR("Cannot calculate initial queue limits"); return r;
}
r = dm_table_set_restrictions(t, md->queue, &limits); if (r) return r;
/* * Hold lock to make sure add_disk() and del_gendisk() won't concurrent * with open_table_device() and close_table_device().
*/
mutex_lock(&md->table_devices_lock);
r = add_disk(md->disk);
mutex_unlock(&md->table_devices_lock); if (r) return r;
/* * Register the holder relationship for devices added before the disk * was live.
*/
list_for_each_entry(td, &md->table_devices, list) {
r = bd_link_disk_holder(td->dm_dev.bdev, md->disk); if (r) goto out_undo_holders;
}
r = dm_sysfs_init(md); if (r) goto out_undo_holders;
/* * Take suspend_lock so that presuspend and postsuspend methods * do not race with internal suspend.
*/
mutex_lock(&md->suspend_lock);
map = dm_get_live_table(md, &srcu_idx); if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
set_bit(DMF_SUSPENDED, &md->flags);
set_bit(DMF_POST_SUSPENDING, &md->flags);
dm_table_postsuspend_targets(map);
} /* dm_put_live_table must be before fsleep, otherwise deadlock is possible */
dm_put_live_table(md, srcu_idx);
mutex_unlock(&md->suspend_lock);
/* * Rare, but there may be I/O requests still going to complete, * for example. Wait for all references to disappear.
 * NOTE(review): the source was truncated here by the extraction tool
 * ("maximum size reached"); the remainder of this file is missing and
 * trailing web-viewer boilerplate was removed.
 */