/*
 * Module parameter "multipath": opt-in for native NVMe multipath support.
 * Read-only from sysfs (0444); changes go through multipath_param_ops.
 */
module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem");
/*
 * Module parameter "multipath_always_on": per its description below, forces
 * creation of a multipath node (except for private namespaces with
 * non-unique NSIDs) and implicitly enables native multipath support.
 * Also read-only from sysfs (0444).
 */
module_param_cb(multipath_always_on, &multipath_always_on_ops,
&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on, "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
/* * If we got back an ANA error, we know the controller is alive but not * ready to serve this namespace. Kick of a re-read of the ANA * information page, and just try any other available path for now.
*/ if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
set_bit(NVME_NS_ANA_PENDING, &ns->flags);
queue_work(nvme_wq, &ns->ctrl->ana_work);
}
spin_lock_irqsave(&ns->head->requeue_lock, flags); for (bio = req->bio; bio; bio = bio->bi_next) {
bio_set_dev(bio, ns->head->disk->part0); if (bio->bi_opf & REQ_POLLED) {
bio->bi_opf &= ~REQ_POLLED;
bio->bi_cookie = BLK_QC_T_NONE;
} /* * The alternate request queue that we may end up submitting * the bio to may be frozen temporarily, in this case REQ_NOWAIT * will fail the I/O immediately with EAGAIN to the issuer. * We are not in the issuer context which cannot block. Clear * the flag to avoid spurious EAGAIN I/O failures.
*/
bio->bi_opf &= ~REQ_NOWAIT;
}
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
staticbool nvme_path_is_disabled(struct nvme_ns *ns)
{ enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
/* * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should * still be able to complete assuming that the controller is connected. * Otherwise it will fail immediately and return to the requeue list.
*/ if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING) returntrue; if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
!test_bit(NVME_NS_READY, &ns->flags)) returntrue; returnfalse;
}
if (unlikely(!old)) return __nvme_find_path(head, node);
if (list_is_singular(&head->list)) { if (nvme_path_is_disabled(old)) return NULL; return old;
}
for (ns = nvme_next_ns(head, old);
ns && ns != old;
ns = nvme_next_ns(head, ns)) { if (nvme_path_is_disabled(ns)) continue;
if (ns->ana_state == NVME_ANA_OPTIMIZED) {
found = ns; goto out;
} if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
found = ns;
}
/* * The loop above skips the current path for round-robin semantics. * Fall back to the current path if either: * - no other optimized path found and current is optimized, * - no other usable path found and current is usable.
*/ if (!nvme_path_is_disabled(old) &&
(old->ana_state == NVME_ANA_OPTIMIZED ||
(!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) return old;
if (!found) return NULL;
out:
rcu_assign_pointer(head->current_path[node], found); return found;
}
if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) returnfalse;
list_for_each_entry_srcu(ns, &head->list, siblings,
srcu_read_lock_held(&head->srcu)) { if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) continue; switch (nvme_ctrl_state(ns->ctrl)) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: returntrue; default: break;
}
}
/* * If "head->delayed_removal_secs" is configured (i.e., non-zero), do * not immediately fail I/O. Instead, requeue the I/O for the configured * duration, anticipating that if there's a transient link failure then * it may recover within this time window. This parameter is exported to * userspace via sysfs, and its default value is zero. It is internally * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is * non-zero, this flag is set to true. When zero, the flag is cleared.
*/ return nvme_mpath_queue_if_no_path(head);
}
staticvoid nvme_ns_head_submit_bio(struct bio *bio)
{ struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); struct nvme_ns *ns; int srcu_idx;
/* * The namespace might be going away and the bio might be moved to a * different queue via blk_steal_bios(), so we need to use the bio_split * pool from the original queue to allocate the bvecs from.
*/
bio = bio_split_to_limits(bio); if (!bio) return;
spin_lock_irq(&head->requeue_lock);
next = bio_list_get(&head->requeue_list);
spin_unlock_irq(&head->requeue_lock);
while ((bio = next) != NULL) {
next = bio->bi_next;
bio->bi_next = NULL;
submit_bio_noacct(bio);
}
}
staticvoid nvme_remove_head(struct nvme_ns_head *head)
{ if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { /* * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared * to allow multipath to fail all I/O.
*/
kblockd_schedule_work(&head->requeue_work);
/* * If "multipath_always_on" is enabled, a multipath node is added * regardless of whether the disk is single/multi ported, and whether * the namespace is shared or private. If "multipath_always_on" is not * enabled, a multipath node is added only if the subsystem supports * multiple controllers and the "multipath" option is configured. In * either case, for private namespaces, we ensure that the NSID is * unique.
*/ if (!multipath_always_on) { if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
!multipath) return 0;
}
/* * We need to suppress the partition scan from occuring within the * controller's scan_work context. If a path error occurs here, the IO * will wait until a path becomes available or all paths are torn down, * but that action also occurs within scan_work, so it would deadlock. * Defer the partition scan to a different context that does not block * scan_work.
*/
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
nvme_tryget_ns_head(head); return 0;
}
/* * test_and_set_bit() is used because it is protecting against two nvme * paths simultaneously calling device_add_disk() on the same namespace * head.
*/ if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
rc = device_add_disk(&head->subsys->dev, head->disk,
nvme_ns_attr_groups); if (rc) {
clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags); return;
}
nvme_add_ns_head_cdev(head);
kblockd_schedule_work(&head->partition_scan_work);
}
nvme_mpath_add_sysfs_link(ns->head);
mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { int node, srcu_idx;
error = cb(ctrl, desc, data); if (error) return error;
offset += nsid_buf_size;
}
return 0;
}
staticinlinebool nvme_state_is_live(enum nvme_ana_state state)
{ return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}
/*
 * Apply the ANA group descriptor @desc to namespace @ns: record the group id
 * and state, clear the pending flag, and either mark the path live or (for
 * non-live states) make sure the sysfs link to the head node exists.
 */
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device. However we cannot accept this I/O
	 * if the controller is not live. This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing. For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
	else {
		/*
		 * Add sysfs link from multipath head gendisk node to path
		 * device gendisk node.
		 * If path's ana state is live (i.e. state is either optimized
		 * or non-optimized) while we alloc the ns then sysfs link would
		 * be created from nvme_mpath_set_live(). In that case we would
		 * not fallthrough this code path. However for the path's ana
		 * state other than live, we call nvme_mpath_set_live() only
		 * after ana state transitioned to the live state. But we still
		 * want to create the sysfs link from head node to a path device
		 * irrespective of the path's ana state.
		 * If we reach through here then it means that path's ana state
		 * is not live but still create the sysfs link to this path from
		 * head node if head node of the path has already come alive.
		 */
		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
			nvme_mpath_add_sysfs_link(ns->head);
	}
}
mutex_lock(&ctrl->ana_lock);
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); goto out_unlock;
}
error = nvme_parse_ana_log(ctrl, &nr_change_groups,
nvme_update_ana_state); if (error) goto out_unlock;
/* * In theory we should have an ANATT timer per group as they might enter * the change state at different times. But that is a lot of overhead * just to protect against a target that keeps entering new changes * states while never finishing previous ones. But we'll still * eventually time out once all groups are in change state, so this * isn't a big deal. * * We also double the ANATT value to provide some slack for transports * or AEN processing overhead.
*/ if (nr_change_groups)
mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); else
timer_delete_sync(&ctrl->anatt_timer);
out_unlock:
mutex_unlock(&ctrl->ana_lock); return error;
}
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
nvme_subsys_iopolicy_update(subsys, i); return count;
}
}
ret = kstrtouint(buf, 0, &sec); if (ret < 0) return ret;
mutex_lock(&head->subsys->lock);
head->delayed_removal_secs = sec; if (sec)
set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); else
clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
mutex_unlock(&head->subsys->lock); /* * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen * by its reader.
*/
synchronize_srcu(&head->srcu);
/* * Ensure head disk node is already added otherwise we may get invalid * kobj for head disk node
*/ if (!test_bit(GD_ADDED, &head->disk->state)) return;
kobj = &disk_to_dev(head->disk)->kobj;
/* * loop through each ns chained through the head->list and create the * sysfs link from head node to the ns path node
*/
srcu_idx = srcu_read_lock(&head->srcu);
list_for_each_entry_srcu(ns, &head->list, siblings,
srcu_read_lock_held(&head->srcu)) { /* * Ensure that ns path disk node is already added otherwise we * may get invalid kobj name for target
*/ if (!test_bit(GD_ADDED, &ns->disk->state)) continue;
/* * Avoid creating link if it already exists for the given path. * When path ana state transitions from optimized to non- * optimized or vice-versa, the nvme_mpath_set_live() is * invoked which in truns call this function. Now if the sysfs * link already exists for the given path and we attempt to re- * create the link then sysfs code would warn about it loudly. * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure * that we're not creating duplicate link. * The test_and_set_bit() is used because it is protecting * against multiple nvme paths being simultaneously added.
*/ if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) continue;
target = disk_to_dev(ns->disk); /* * Create sysfs link from head gendisk kobject @kobj to the * ns path gendisk kobject @target->kobj.
*/
rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
&target->kobj, dev_name(target)); if (unlikely(rc)) {
dev_err(disk_to_dev(ns->head->disk), "failed to create link to %s\n",
dev_name(target));
clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
}
}
mutex_lock(&head->subsys->lock); /* * We are called when all paths have been removed, and at that point * head->list is expected to be empty. However, nvme_remove_ns() and * nvme_init_ns_head() can run concurrently and so if head->delayed_ * removal_secs is configured, it is possible that by the time we reach * this point, head->list may no longer be empty. Therefore, we recheck * head->list here. If it is no longer empty then we skip enqueuing the * delayed head removal work.
*/ if (!list_empty(&head->list)) goto out;
if (head->delayed_removal_secs) { /* * Ensure that no one could remove this module while the head * remove work is pending.
*/ if (!try_module_get(THIS_MODULE)) goto out;
mod_delayed_work(nvme_wq, &head->remove_work,
head->delayed_removal_secs * HZ);
} else {
list_del_init(&head->entry);
remove = true;
}
out:
mutex_unlock(&head->subsys->lock); if (remove)
nvme_remove_head(head);
}
/*
 * Drop the multipath head's gendisk reference. Flushes the requeue and
 * partition-scan work first so no pending bios or deferred work reference
 * the disk after the final put.
 */
void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	flush_work(&head->partition_scan_work);
	put_disk(head->disk);
}
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
size_t ana_log_size; int error = 0;
/* check if multipath is enabled and we have the capability */ if (!multipath || !ctrl->subsys ||
!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0;
/* initialize this in the identify path to cover controller resets */
atomic_set(&ctrl->nr_active, 0);
if (!ctrl->max_namespaces ||
ctrl->max_namespaces > le32_to_cpu(id->nn)) {
dev_err(ctrl->device, "Invalid MNAN value %u\n", ctrl->max_namespaces); return -EINVAL;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.