unsignedint admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);
unsignedint nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);
staticunsignedchar shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
staticunsignedlong default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us, "max power saving latency for new devices; use PM QOS to change per device");
staticbool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
/* * Older kernels didn't enable protection information if it was at an offset. * Newer kernels do, so it breaks reads on the upgrade if such formats were * used in prior kernels since the metadata written did not contain a valid * checksum.
*/ staticbool disable_pi_offsets = false;
module_param(disable_pi_offsets, bool, 0444);
MODULE_PARM_DESC(disable_pi_offsets, "disable protection information if it has an offset");
/*
 * nvme_wq - hosts nvme related works that are not reset or delete
 * nvme_reset_wq - hosts nvme reset works
 * nvme_delete_wq - hosts nvme delete works
 *
 * nvme_wq hosts works such as scan, AEN handling, firmware activation,
 * keep-alive, periodic reconnects, etc.  nvme_reset_wq runs reset works,
 * which also flush the works hosted on nvme_wq for serialization purposes.
 * nvme_delete_wq hosts controller deletion works, which flush reset works
 * for serialization.
 */
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
/*
 * Queue the controller's scan work on nvme_wq.
 *
 * Nothing is queued unless the controller is in the LIVE state and has an
 * I/O tagset, i.e. unless both the admin and the I/O queues are alive.
 */
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		return;
	if (!ctrl->tagset)
		return;

	queue_work(nvme_wq, &ctrl->scan_work);
}
/*
 * Schedule reset_work for a controller that has already been moved to the
 * RESETTING state.  Meant for code paths that must not be interrupted by
 * other reset attempts.  A hot removal may prevent this from succeeding.
 *
 * Returns 0 when the work was queued, or -EBUSY when the controller is not
 * in the RESETTING state or the work could not be queued.
 */
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
		return -EBUSY;

	return queue_work(nvme_reset_wq, &ctrl->reset_work) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
/*
 * Move the controller to the DELETING state and queue its delete work on
 * nvme_delete_wq.
 *
 * Returns 0 on success, or -EBUSY if the state change is refused or the
 * delete work could not be queued.
 */
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
	/* The work is only queued once the state transition has succeeded. */
	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING) &&
	    queue_work(nvme_delete_wq, &ctrl->delete_work))
		return 0;

	return -EBUSY;
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{ /* * Keep a reference until nvme_do_delete_ctrl() complete, * since ->delete_ctrl can free the controller.
*/
nvme_get_ctrl(ctrl); if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
nvme_do_delete_ctrl(ctrl);
nvme_put_ctrl(ctrl);
}
/*
 * Translate an NVMe completion status into a block layer status.
 *
 * Only the bits selected by NVME_SCT_SC_MASK are examined.  Any status code
 * without an explicit mapping is reported as BLK_STS_IOERR.
 */
static blk_status_t nvme_error_status(u16 status)
{
	blk_status_t sts;

	switch (status & NVME_SCT_SC_MASK) {
	case NVME_SC_SUCCESS:
		sts = BLK_STS_OK;
		break;
	case NVME_SC_CAP_EXCEEDED:
		sts = BLK_STS_NOSPC;
		break;
	case NVME_SC_LBA_RANGE:
	case NVME_SC_CMD_INTERRUPTED:
	case NVME_SC_NS_NOT_READY:
		sts = BLK_STS_TARGET;
		break;
	case NVME_SC_BAD_ATTRIBUTES:
	case NVME_SC_INVALID_OPCODE:
	case NVME_SC_INVALID_FIELD:
	case NVME_SC_INVALID_NS:
		sts = BLK_STS_NOTSUPP;
		break;
	case NVME_SC_WRITE_FAULT:
	case NVME_SC_READ_ERROR:
	case NVME_SC_UNWRITTEN_BLOCK:
	case NVME_SC_ACCESS_DENIED:
	case NVME_SC_READ_ONLY:
	case NVME_SC_COMPARE_FAILED:
		sts = BLK_STS_MEDIUM;
		break;
	case NVME_SC_GUARD_CHECK:
	case NVME_SC_APPTAG_CHECK:
	case NVME_SC_REFTAG_CHECK:
	case NVME_SC_INVALID_PI:
		sts = BLK_STS_PROTECTION;
		break;
	case NVME_SC_RESERVATION_CONFLICT:
		sts = BLK_STS_RESV_CONFLICT;
		break;
	case NVME_SC_HOST_PATH_ERROR:
		sts = BLK_STS_TRANSPORT;
		break;
	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
		sts = BLK_STS_ZONE_ACTIVE_RESOURCE;
		break;
	case NVME_SC_ZONE_TOO_MANY_OPEN:
		sts = BLK_STS_ZONE_OPEN_RESOURCE;
		break;
	default:
		sts = BLK_STS_IOERR;
		break;
	}

	return sts;
}
/* The mask and shift result must be <= 3 */
crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11; if (crd)
delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
/* * Completions of long-running commands should not be able to * defer sending of periodic keep alives, since the controller * may have completed processing such commands a long time ago * (arbitrarily close to command submission time). * req->deadline - req->timeout is the command submission time * in jiffies.
*/ if (ctrl->kas &&
req->deadline - req->timeout >= ctrl->ka_last_check_time)
ctrl->comp_seen = true;
switch (nvme_decide_disposition(req)) { case COMPLETE:
nvme_end_req(req); return; case RETRY:
nvme_retry_req(req); return; case FAILOVER:
nvme_failover_req(req); return; case AUTHENTICATE: #ifdef CONFIG_NVME_HOST_AUTH
queue_work(nvme_wq, &ctrl->dhchap_auth_work);
nvme_retry_req(req); #else
nvme_end_req(req); #endif return;
}
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
/*
 * Called to unwind from ->queue_rq on a failed command submission so that the
 * multipathing code gets called to potentially fail over to another path.
 * The caller needs to unwind all transport specific resource allocations and
 * must propagate the return value.
 *
 * The request is marked with NVME_SC_HOST_PATH_ERROR and completed here, so
 * BLK_STS_OK is returned to the caller.
 */
blk_status_t nvme_host_path_error(struct request *req)
{
	/* The status must be recorded before the request is completed. */
	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
	blk_mq_set_request_complete(req);
	nvme_complete_rq(req);
	return BLK_STS_OK;
}
EXPORT_SYMBOL_GPL(nvme_host_path_error);
/* * Waits for the controller state to be resetting, or returns false if it is * not possible to ever transition to that state.
*/ bool nvme_wait_reset(struct nvme_ctrl *ctrl)
{
wait_event(ctrl->state_wq,
nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
nvme_state_terminal(ctrl)); return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);
/* * For something we're not in a state to send to the device the default action * is to busy it and retry it after the controller state is recovered. However, * if the controller is deleting or if anything is marked for failfast or * nvme multipath it is immediately failed. * * Note: commands used to initialize the controller will be marked for failfast. * Note: nvme cli/ioctl commands are marked for failfast.
*/
blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, struct request *rq)
{ enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
if (state != NVME_CTRL_DELETING_NOIO &&
state != NVME_CTRL_DELETING &&
state != NVME_CTRL_DEAD &&
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE;
if (!(rq->rq_flags & RQF_DONTPREP))
nvme_clear_nvme_request(rq);
/* * currently we have a problem sending passthru commands * on the admin_q if the controller is not LIVE because we can't * make sure that they are going out after the admin connect, * controller enable and/or other commands in the initialization * sequence. until the controller will be LIVE, fail with * BLK_STS_RESOURCE so that they will be rescheduled.
*/ if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) returnfalse;
if (ctrl->ops->flags & NVME_F_FABRICS) { /* * Only allow commands on a live queue, except for the connect * command, which is require to set the queue live in the * appropinquate states.
*/ switch (state) { case NVME_CTRL_CONNECTING: if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
(req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive)) returntrue; break; default: break; case NVME_CTRL_DEAD: returnfalse;
}
}
/* * Some devices do not consider the DSM 'Number of Ranges' field when * determining how much data to DMA. Always allocate memory for maximum * number of segments to prevent device reading beyond end of buffer.
*/ staticconst size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); if (!range) { /* * If we fail allocation our range, fallback to the controller * discard page. If that's also busy, it's safe to return * busy, as we know we can make progress once that's freed.
*/ if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) return BLK_STS_RESOURCE;
/* only type1 and type 2 PI formats have a reftag */ switch (ns->head->pi_type) { case NVME_NS_DPS_PI_TYPE1: case NVME_NS_DPS_PI_TYPE2: break; default: return;
}
/* both rw and write zeroes share the same reftag format */ switch (ns->head->guard_type) { case NVME_NVM_NS_16B_GUARD:
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; case NVME_NVM_NS_64B_GUARD:
ref48 = ext_pi_ref_tag(req);
lower = lower_32_bits(ref48);
upper = upper_32_bits(ref48);
if (nvme_ns_has_pi(ns->head)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
nvme_set_ref_tag(ns, cmnd, req);
}
return BLK_STS_OK;
}
/* * NVMe does not support a dedicated command to issue an atomic write. A write * which does adhere to the device atomic limits will silently be executed * non-atomically. The request issuer should ensure that the write is within * the queue atomic writes limits, but just validate this in case it is not.
*/ staticbool nvme_valid_atomic_write(struct request *req)
{ struct request_queue *q = req->q;
u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) returnfalse;
if (ns->head->ms) { /* * If formatted with metadata, the block layer always provides a * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else * we enable the PRACT bit for protection information or set the * namespace capacity to zero to prevent any I/O.
*/ if (!blk_integrity_rq(req)) { if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head))) return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
nvme_set_ref_tag(ns, cmnd, req);
}
if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
control |= NVME_RW_PRINFO_PRCHK_GUARD; if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
control |= NVME_RW_PRINFO_PRCHK_REF; if (op == nvme_cmd_zone_append)
control |= NVME_RW_APPEND_PIREMAP;
nvme_set_ref_tag(ns, cmnd, req);
} if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
control |= NVME_RW_PRINFO_PRCHK_APP;
nvme_set_app_tag(req, cmnd);
}
}
if (!(req->rq_flags & RQF_DONTPREP))
nvme_clear_nvme_request(req);
switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: /* these are setup prior to execution in nvme_init_request() */ break; case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd); break; case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_RESET:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); break; case REQ_OP_ZONE_OPEN:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); break; case REQ_OP_ZONE_CLOSE:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); break; case REQ_OP_ZONE_FINISH:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); break; case REQ_OP_WRITE_ZEROES:
ret = nvme_setup_write_zeroes(ns, req, cmd); break; case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); break; case REQ_OP_WRITE:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); break; case REQ_OP_ZONE_APPEND:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default:
WARN_ON_ONCE(1); return BLK_STS_IOERR;
}
/*
 * Execute a request through blk_execute_rq() and report how it finished.
 *
 * Return values:
 *   0: success
 *  >0: nvme controller's cqe status response
 *  <0: kernel error in lieu of controller response (-EINTR if the request
 *      was flagged NVME_REQ_CANCELLED)
 */
int nvme_execute_rq(struct request *rq, bool at_head)
{
	blk_status_t blk_sts = blk_execute_rq(rq, at_head);
	int ret;

	/* Cancellation takes precedence over any recorded NVMe status. */
	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;
	else if (nvme_req(rq)->status)
		ret = nvme_req(rq)->status;
	else
		ret = blk_status_to_errno(blk_sts);

	return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");
/* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code
*/ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, int qid, nvme_submit_flags_t flags)
{ struct request *req; int ret;
blk_mq_req_flags_t blk_flags = 0;
if (ns) {
effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
dev_warn_once(ctrl->device, "IO command:%02x has unusual effects:%08x\n",
opcode, effects);
/* * NVME_CMD_EFFECTS_CSE_MASK causes a freeze all I/O queues, * which would deadlock when done on an I/O command. Note that * We already warn about an unusual effect above.
*/
effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
} else {
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
/* Ignore execution restrictions if any relaxation bits are set */ if (effects & NVME_CMD_EFFECTS_CSER_MASK)
effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
}
/* * For simplicity, IO to all namespaces is quiesced even if the command * effects say only one namespace is affected.
*/ if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
mutex_lock(&ctrl->scan_lock);
mutex_lock(&ctrl->subsys->lock);
nvme_mpath_start_freeze(ctrl->subsys);
nvme_mpath_wait_freeze(ctrl->subsys);
nvme_start_freeze(ctrl);
nvme_wait_freeze(ctrl);
} return effects;
}
EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
/*
 * Finish a passthrough command according to its command effects.
 *
 * @ctrl:    controller the command was sent to
 * @ns:      namespace the command targeted, or NULL
 * @effects: command effects bits that steer the actions below
 * @cmd:     the command that was executed
 * @status:  result of the command; the keep-alive update is only done when
 *           this is zero
 */
void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
		       struct nvme_command *cmd, int status)
{
	/* Unfreeze the queues and release the subsystem and scan locks. */
	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
		nvme_unfreeze(ctrl);
		nvme_mpath_unfreeze(ctrl->subsys);
		mutex_unlock(&ctrl->subsys->lock);
		mutex_unlock(&ctrl->scan_lock);
	}

	/* Log only the first time the dirty-capability flag gets set. */
	if ((effects & NVME_CMD_EFFECTS_CCC) &&
	    !test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags))
		dev_info(ctrl->device,
"controller capabilities changed, reset may be required to take effect.\n");

	/* Queue a scan and wait for the scan work to finish. */
	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
		nvme_queue_scan(ctrl);
		flush_work(&ctrl->scan_work);
	}

	/* The opcode inspection below only runs when ns is NULL. */
	if (ns)
		return;
	if (cmd->common.opcode != nvme_admin_set_features)
		return;
	if ((le32_to_cpu(cmd->common.cdw10) & 0xFF) != NVME_FEAT_KATO)
		return;

	/*
	 * Keep alive commands interval on the host should be updated when
	 * KATO is modified by Set Features commands.
	 */
	if (!status)
		nvme_update_keep_alive(ctrl, cmd);
}
EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");
/*
 * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
 *
 *   The host should send Keep Alive commands at half of the Keep Alive Timeout
 *   accounting for transport roundtrip times [..].
 *
 * Returns the delay between two runs of the keep-alive work, computed as
 * ctrl->kato * HZ / 2 (presumably kato is in seconds, which would make the
 * result a jiffies count - confirm against the definition of kato).
 */
static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
{
	unsigned long delay = ctrl->kato * HZ / 2;

	/*
	 * When using Traffic Based Keep Alive, we need to run
	 * nvme_keep_alive_work at twice the normal frequency, as one
	 * command completion can postpone sending a keep alive command
	 * by up to twice the delay between runs.
	 */
	if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
		delay /= 2;
	return delay;
}
staticbool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
{ /* * The CNS field occupies a full byte starting with NVMe 1.2
*/ if (ctrl->vs >= NVME_VS(1, 2, 0)) returntrue;
/* * NVMe 1.1 expanded the CNS value to two bits, which means values * larger than that could get truncated and treated as an incorrect * value. * * Qemu implemented 1.0 behavior for controllers claiming 1.1 * compliance, so they need to be quirked here.
*/ if (ctrl->vs >= NVME_VS(1, 1, 0) &&
!(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) return cns <= 3;
/* * NVMe 1.0 used a single bit for the CNS value.
*/ return cns <= 1;
}
staticint nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{ struct nvme_command c = { }; int error;
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CTRL;
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); if (!*id) return -ENOMEM;
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
u32 q_count = (*count - 1) | ((*count - 1) << 16);
u32 result; int status, nr_io_queues;
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
&result);
/* * It's either a kernel error or the host observed a connection * lost. In either case it's not possible communicate with the * controller and thus enter the error code path.
*/ if (status < 0 || status == NVME_SC_HOST_PATH_ERROR) return status;
/* * Degraded controllers might return an error when setting the queue * count. We still want to be able to bring them online and offer * access to the admin queue, as that might be only way to fix them up.
*/ if (status > 0) {
dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
*count = 0;
} else {
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
*count = min(*count, nr_io_queues);
}
status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
NULL, 0, &result); if (status)
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
supported_aens);
queue_work(nvme_wq, &ctrl->async_event_work);
}
staticint nvme_ns_open(struct nvme_ns *ns)
{
/* should never be called due to GENHD_FL_HIDDEN */ if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) goto fail; if (!nvme_get_ns(ns)) goto fail; if (!try_module_get(ns->ctrl->ops->module)) goto fail_put_ns;
/* * PI can always be supported as we can ask the controller to simply * insert/strip it, which is not possible for other kinds of metadata.
*/ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
!(head->features & NVME_NS_METADATA_SUPPORTED)) return nvme_ns_has_pi(head);
if (head->pi_size && head->ms >= head->pi_size)
head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; if (!(id->dps & NVME_NS_DPS_PI_FIRST)) { if (disable_pi_offsets)
head->pi_type = 0; else
info->pi_offset = head->ms - head->pi_size;
}
if (ctrl->ops->flags & NVME_F_FABRICS) { /* * The NVMe over Fabrics specification only supports metadata as * part of the extended data LBA. We rely on HCA/HBA support to * remap the separate metadata buffer from the block layer.
*/ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) return;
head->features |= NVME_NS_EXT_LBAS;
/* * The current fabrics transport drivers support namespace * metadata formats only if nvme_ns_has_pi() returns true. * Suppress support for all other formats so the namespace will * have a 0 capacity and not be usable through the block stack. * * Note, this check will need to be modified if any drivers * gain the ability to use other metadata formats.
*/ if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
head->features |= NVME_NS_METADATA_SUPPORTED;
} else { /* * For PCIe controllers, we can't easily remap the separate * metadata buffer from the block layer and thus require a * separate metadata buffer for block layer metadata/PI support. * We allow extended LBAs for the passthrough interface, though.
*/ if (id->flbas & NVME_NS_FLBAS_META_EXT)
head->features |= NVME_NS_EXT_LBAS; else
head->features |= NVME_NS_METADATA_SUPPORTED;
}
}
/* * We do not support an offset for the atomic boundaries.
*/ if (id->nabo) return bs;
if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) { /* * Use the per-namespace atomic write unit when available.
*/
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; if (id->nabspf)
boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
} else { /* * Use the controller wide atomic write unit. This sucks * because the limit is defined in terms of logical blocks while * namespaces can have different formats, and because there is * no clear language in the specification prohibiting different * values for different controllers in the subsystem.
*/
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
}
/* * The block layer can't support LBA sizes larger than the page size * or smaller than a sector size yet, so catch this early and don't * allow block I/O.
*/ if (blk_validate_block_size(bs)) {
bs = (1 << 9);
valid = false;
}
/* * Linux filesystems assume writing a single physical block is * an atomic operation. Hence limit the physical block size to the * value of the Atomic Write Unit Power Fail parameter.
*/
lim->logical_block_size = bs;
lim->physical_block_size = min(phys_bs, atomic_bs);
lim->io_min = phys_bs;
lim->io_opt = io_opt; if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
(ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
lim->max_write_zeroes_sectors = UINT_MAX; else
lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; return valid;
}
n = le16_to_cpu(h->numfdpc) + 1; if (fdp_idx > n) {
dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
fdp_idx, n); /* Proceed without registering FDP streams */
ret = 0; goto out;
}
log = h + 1;
desc = log;
end = log + size - sizeof(*h); for (i = 0; i < fdp_idx; i++) {
log += le16_to_cpu(desc->dsze);
desc = log; if (log >= end) {
dev_warn(ctrl->device, "FDP invalid config descriptor list\n");
ret = 0; goto out;
}
}
if (le32_to_cpu(desc->nrg) > 1) {
dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
ret = 0; goto out;
}
/* * The FDP configuration is static for the lifetime of the namespace, * so return immediately if we've already registered this namespace's * streams.
*/ if (head->nr_plids) return 0;
ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
&fdp); if (ret) {
dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret); return ret;
}
if (!(fdp.flags & FDPCFG_FDPE)) return 0;
ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx); if (!info->runs) return ret;
if (info->is_rotational)
lim.features |= BLK_FEAT_ROTATIONAL;
/* * Register a metadata profile for PI, or the plain non-integrity NVMe * metadata masquerading as Type 0 if supported, otherwise reject block * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case.
*/ if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
/* * Only set the DEAC bit if the device guarantees that reads from * deallocated data return zeroes. While the DEAC bit does not * require that, it must be a no-op if reads from deallocated data * do not return zeroes.
*/ if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) {
ns->head->features |= NVME_NS_DEAC;
lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
}
ret = queue_limits_commit_update(ns->disk->queue, &lim); if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue, memflags); goto out;
}
switch (info->ids.csi) { case NVME_CSI_ZNS: if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device, "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
ret = nvme_update_ns_info_generic(ns, info); break;
}
ret = nvme_update_ns_info_block(ns, info); break; case NVME_CSI_NVM:
ret = nvme_update_ns_info_block(ns, info); break; default:
dev_info(ns->ctrl->device, "block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
ret = nvme_update_ns_info_generic(ns, info); break;
}
/* * If probing fails due an unsupported feature, hide the block device, * but still allow other access.
*/ if (ret == -ENODEV) {
ns->disk->flags |= GENHD_FL_HIDDEN;
set_bit(NVME_NS_READY, &ns->flags);
unsupported = true;
ret = 0;
}
lim = queue_limits_start_update(ns->head->disk->queue);
memflags = blk_mq_freeze_queue(ns->head->disk->queue); /* * queue_limits mixes values that are the hardware limitations * for bio splitting with what is the device configuration. * * For NVMe the device configuration can change after e.g. a * Format command, and we really want to pick up the new format * value here. But we must still stack the queue limits to the * least common denominator for multipathing to split the bios * properly. * * To work around this, we explicitly set the device * configuration to those that we just queried, but only stack * the splitting limits in to make sure we still obey possibly * lower limitations of other controllers.
*/
lim.logical_block_size = ns_lim->logical_block_size;
lim.physical_block_size = ns_lim->physical_block_size;
lim.io_min = ns_lim->io_min;
lim.io_opt = ns_lim->io_opt;
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name); if (unsupported)
ns->head->disk->flags |= GENHD_FL_HIDDEN; else
nvme_init_integrity(ns->head, &lim, info);
lim.max_write_streams = ns_lim->max_write_streams;
lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
/* * Setting CRIME results in CSTS.RDY before the media is ready. This * makes it possible for media related commands to return the error * NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY. Until the driver is * restructured to handle retries, disable CC.CRIME.
*/
ctrl->ctrl_config &= ~NVME_CC_CRIME;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto); if (ret) {
dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
ret); return ret;
}
/* * CRTO should always be greater or equal to CAP.TO, but some * devices are known to get this wrong. Use the larger of the * two values.
*/
ready_timeout = NVME_CRTO_CRWMT(crto);
/* Don't bother enabling the feature if retry delay is not reported */ if (ctrl->crdt[0])
acre = NVME_ENABLE_ACRE; if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
lbafee = NVME_ENABLE_LBAFEE;
if (!acre && !lbafee) return 0;
host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) return 0;
/* * The function checks whether the given total (exlat + enlat) latency of * a power state allows the latter to be used as an APST transition target. * It does so by comparing the latency to the primary and secondary latency * tolerances defined by module params. If there's a match, the corresponding * timeout value is returned and the matching tolerance index (1 or 2) is * reported.
*/ staticbool nvme_apst_get_transition_time(u64 total_latency,
u64 *transition_time, unsigned *last_index)
{ if (total_latency <= apst_primary_latency_tol_us) { if (*last_index == 1) returnfalse;
*last_index = 1;
*transition_time = apst_primary_timeout_ms; returntrue;
} if (apst_secondary_timeout_ms &&
total_latency <= apst_secondary_latency_tol_us) { if (*last_index <= 2) returnfalse;
*last_index = 2;
*transition_time = apst_secondary_timeout_ms; returntrue;
} returnfalse;
}
/* * APST (Autonomous Power State Transition) lets us program a table of power * state transitions that the controller will perform automatically. * * Depending on module params, one of the two supported techniques will be used: * * - If the parameters provide explicit timeouts and tolerances, they will be * used to build a table with up to 2 non-operational states to transition to. * The default parameter values were selected based on the values used by * Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic * regeneration of the APST table in the event of switching between external * and battery power, the timeouts and tolerances reflect a compromise * between values used by Microsoft for AC and battery scenarios. * - If not, we'll configure the table with a simple heuristic: we are willing * to spend at most 2% of the time transitioning between power states. * Therefore, when running in any given state, we will enter the next * lower-power non-operational state after waiting 50 * (enlat + exlat) * microseconds, as long as that state's exit latency is under the requested * maximum latency. * * We will not autonomously enter any non-operational state for which the total * latency exceeds ps_max_latency_us. * * Users can set ps_max_latency_us to zero to turn off APST.
*/ staticint nvme_configure_apst(struct nvme_ctrl *ctrl)
{ struct nvme_feat_auto_pst *table; unsigned apste = 0;
u64 max_lat_us = 0;
__le64 target = 0; int max_ps = -1; int state; int ret; unsigned last_lt_index = UINT_MAX;
/* * If APST isn't supported or if we haven't been initialized yet, * then don't do anything.
*/ if (!ctrl->apsta) return 0;
if (ctrl->npss > 31) {
dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); return 0;
}
table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return 0;
if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */
dev_dbg(ctrl->device, "APST disabled\n"); goto done;
}
/* * Walk through all states from lowest- to highest-power. * According to the spec, lower-numbered states use more power. NPSS, * despite the name, is the index of the lowest-power state, not the * number of states.
*/ for (state = (int)ctrl->npss; state >= 0; state--) {
u64 total_latency_us, exit_latency_us, transition_ms;
if (target)
table->entries[state] = target;
/* * Don't allow transitions to the deepest state if it's quirked * off.
*/ if (state == ctrl->npss &&
(ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) continue;
/* * Is this state a useful non-operational state for higher-power * states to autonomously transition to?
*/ if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) continue;
exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); if (exit_latency_us > ctrl->ps_max_latency_us) continue;
/* * This state is good. It can be used as the APST idle target * for higher power states.
*/ if (apst_primary_timeout_ms && apst_primary_latency_tol_us) { if (!nvme_apst_get_transition_time(total_latency_us,
&transition_ms, &last_lt_index)) continue;
} else {
transition_ms = total_latency_us + 19;
do_div(transition_ms, 20); if (transition_ms > (1 << 24) - 1)
transition_ms = (1 << 24) - 1;
}
switch (val) { case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: case PM_QOS_LATENCY_ANY:
latency = U64_MAX; break;
default:
latency = val;
}
if (ctrl->ps_max_latency_us != latency) {
ctrl->ps_max_latency_us = latency; if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
nvme_configure_apst(ctrl);
}
}
/*
 * One entry of the core quirk table.  An entry names the identify data to
 * match on and the quirk flags that are OR'ed into ctrl->quirks when the
 * entry matches.
 */
struct nvme_core_quirk_entry {
	/*
	 * NVMe model and firmware strings are padded with spaces.  For
	 * simplicity, strings in the quirk table are padded with NULLs
	 * instead.
	 */
	u16 vid;		/* vendor ID to match */
	const char *mn;		/* model number; not set by every table entry */
	const char *fr;		/* firmware revision; not set by every table entry */
	unsigned long quirks;	/* NVME_QUIRK_* flags applied on a match */
};
staticconststruct nvme_core_quirk_entry core_quirks[] = {
{ /* * This Toshiba device seems to die using any APST states. See: * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
*/
.vid = 0x1179,
.mn = "THNSF5256GPUK TOSHIBA",
.quirks = NVME_QUIRK_NO_APST,
},
{ /* * This LiteON CL1-3D*-Q11 firmware version has a race * condition associated with actions related to suspend to idle * LiteON has resolved the problem in future firmware
*/
.vid = 0x14a4,
.fr = "22301111",
.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
},
{ /* * This Kioxia CD6-V Series / HPE PE8030 device times out and * aborts I/O during any load, but more easily reproducible * with discards (fstrim). * * The device is left in a state where it is also not possible * to use "nvme set-feature" to disable APST, but booting with * nvme_core.default_ps_max_latency=0 works.
*/
.vid = 0x1e0f,
.mn = "KCD6XVUL6T40",
.quirks = NVME_QUIRK_NO_APST,
},
{ /* * The external Samsung X5 SSD fails initialization without a * delay before checking if it is ready and has a whole set of * other problems. To make this even more interesting, it * shares the PCI ID with internal Samsung 970 Evo Plus that * does not need or want these quirks.
*/
.vid = 0x144d,
.mn = "Samsung Portable SSD X5",
.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN,
}
};
/* match is null-terminated but idstr is space-padded. */ staticbool string_matches(constchar *idstr, constchar *match, size_t len)
{
size_t matchlen;
if (ctrl->vs >= NVME_VS(1, 2, 1))
dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
}
/* * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe * Base Specification 2.0. It is slightly different from the format * specified there due to historic reasons, and we can't change it now.
*/
off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, "nqn.2014.08.org.nvmexpress:%04x%04x",
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
off += sizeof(id->sn);
memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
off += sizeof(id->mn);
memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
}
/* * Fail matches for discovery subsystems. This results * in each discovery controller bound to a unique subsystem. * This avoids issues with validating controller values * that can only be true when there is a single unique subsystem. * There may be multiple and completely independent entities * that provide discovery controllers.
*/ if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) return NULL;
list_for_each_entry(subsys, &nvme_subsystems, entry) { if (strcmp(subsys->subnqn, subsysnqn)) continue; if (!kref_get_unless_zero(&subsys->ref)) continue; return subsys;
}
mutex_lock(&nvme_subsystems_lock);
found = __nvme_find_get_subsystem(subsys->subnqn); if (found) {
put_device(&subsys->dev);
subsys = found;
if (!nvme_validate_cntlid(subsys, ctrl, id)) {
ret = -EINVAL; goto out_put_subsystem;
}
} else {
ret = device_add(&subsys->dev); if (ret) {
dev_err(ctrl->device, "failed to register subsystem device.\n");
put_device(&subsys->dev); goto out_unlock;
}
ida_init(&subsys->ns_ida);
list_add_tail(&subsys->entry, &nvme_subsystems);
}
ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
dev_name(ctrl->device)); if (ret) {
dev_err(ctrl->device, "failed to create sysfs link from subsystem.\n"); goto out_put_subsystem;
}
if (check_shl_overflow(1U, units + page_shift - 9, &val)) return UINT_MAX; return val;
}
staticint nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
{ struct nvme_command c = { }; struct nvme_id_ctrl_nvm *id; int ret;
/* * Even though NVMe spec explicitly states that MDTS is not applicable * to the write-zeroes, we are cautious and limit the size to the * controllers max_hw_sectors value, which is based on the MDTS field * and possibly other limiting factors.
*/ if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
!(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
ctrl->max_zeroes_sectors = ctrl->max_hw_sectors; else
ctrl->max_zeroes_sectors = 0;
/* * The spec says the result of a security receive command depends on * the previous security send command. As such, many vendors log this * command as one to submitted only when no other commands to the same * namespace are outstanding. The intention is to tell the host to * prevent mixing security send and receive. * * This driver can only enforce such exclusive access against IO * queues, though. We are not readily able to enforce such a rule for * two commands to the admin queue, which is the only queue that * matters for this command. * * Rather than blindly freezing the IO queues for this effect that * doesn't even apply to IO, mask it off.
*/
log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
ret = nvme_identify_ctrl(ctrl, &id); if (ret) {
dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); return -EIO;
}
if (!(ctrl->ops->flags & NVME_F_FABRICS))
ctrl->cntlid = le16_to_cpu(id->cntlid);
if (!ctrl->identified) { unsignedint i;
/* * Check for quirks. Quirk can depend on firmware version, * so, in principle, the set of quirks present can change * across a reset. As a possible future enhancement, we * could re-scan for quirks every time we reinitialize * the device, but we'd have to make sure that the driver * behaves intelligently if the quirks change.
*/ for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { if (quirk_matches(id, &core_quirks[i]))
ctrl->quirks |= core_quirks[i].quirks;
}
ret = nvme_init_subsystem(ctrl, id); if (ret) goto out_free;
ret = nvme_init_effects(ctrl, id); if (ret) goto out_free;
}
memcpy(ctrl->subsys->firmware_rev, id->fr, sizeof(ctrl->subsys->firmware_rev));
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
}
/* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as * the admin queue is fully up and running.
*/ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{ int ret;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) {
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); return ret;
}
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
ret = nvme_init_identify(ctrl); if (ret) return ret;
if (nvme_admin_ctrl(ctrl)) { /* * An admin controller has one admin queue, but no I/O queues. * Override queue_count so it only creates an admin queue.
*/
dev_dbg(ctrl->device, "Subsystem %s is an administrative controller",
ctrl->subsys->subnqn);
ctrl->queue_count = 1;
}
ret = nvme_configure_apst(ctrl); if (ret < 0) return ret;
ret = nvme_configure_timestamp(ctrl); if (ret < 0) return ret;
ret = nvme_configure_host_options(ctrl); if (ret < 0) return ret;
nvme_configure_opal(ctrl, was_suspended);
if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { /* * Do not return errors unless we are in a controller reset, * the controller works perfectly fine without hwmon.
*/
ret = nvme_hwmon_init(ctrl); if (ret == -EINTR) return ret;
}
list_for_each_entry(h, &ctrl->subsys->nsheads, entry) { /* * Private namespaces can share NSIDs under some conditions. * In that case we can't use the same ns_head for namespaces * with the same NSID.
*/ if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h)) continue; if (nvme_tryget_ns_head(h)) return h;
}
return head;
out_cleanup_srcu:
cleanup_srcu_struct(&head->srcu);
out_ida_remove:
ida_free(&ctrl->subsys->ns_ida, head->instance);
out_free_head:
kfree(head);
out: if (ret > 0)
ret = blk_status_to_errno(nvme_error_status(ret)); return ERR_PTR(ret);
}
staticint nvme_global_check_duplicate_ids(struct nvme_subsystem *this, struct nvme_ns_ids *ids)
{ struct nvme_subsystem *s; int ret = 0;
/* * Note that this check is racy as we try to avoid holding the global * lock over the whole ns_head creation. But it is only intended as * a sanity check anyway.
*/
mutex_lock(&nvme_subsystems_lock);
list_for_each_entry(s, &nvme_subsystems, entry) { if (s == this) continue;
mutex_lock(&s->lock);
ret = nvme_subsys_check_duplicate_ids(s, ids);
mutex_unlock(&s->lock); if (ret) break;
}
mutex_unlock(&nvme_subsystems_lock);
ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids); if (ret) { /* * We've found two different namespaces on two different * subsystems that report the same ID. This is pretty nasty * for anything that actually requires unique device * identification. In the kernel we need this for multipathing, * and in user space the /dev/disk/by-id/ links rely on it. * * If the device also claims to be multi-path capable back off * here now and refuse the probe the second device as this is a * recipe for data corruption. If not this is probably a * cheap consumer device if on the PCIe bus, so let the user * proceed and use the shiny toy, but warn that with changing * probing order (which due to our async probing could just be * device taking longer to startup) the other device could show * up at any time.
*/
nvme_print_device_info(ctrl); if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */
((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
info->is_shared)) {
dev_err(ctrl->device, "ignoring nsid %d because of duplicate IDs\n",
info->nsid); return ret;
}
dev_err(ctrl->device, "clearing duplicate IDs for nsid %d\n", info->nsid);
dev_err(ctrl->device, "use of /dev/disk/by-id/ may cause data corruption\n");
memset(&info->ids.nguid, 0, sizeof(info->ids.nguid));
memset(&info->ids.uuid, 0, sizeof(info->ids.uuid));
memset(&info->ids.eui64, 0, sizeof(info->ids.eui64));
ctrl->quirks |= NVME_QUIRK_BOGUS_NID;
}
mutex_lock(&ctrl->subsys->lock);
head = nvme_find_ns_head(ctrl, info->nsid); if (!head) {
ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids); if (ret) {
dev_err(ctrl->device, "duplicate IDs in subsystem for nsid %d\n",
info->nsid); goto out_unlock;
}
head = nvme_alloc_ns_head(ctrl, info); if (IS_ERR(head)) {
ret = PTR_ERR(head); goto out_unlock;
}
} else {
ret = -EINVAL; if ((!info->is_shared || !head->shared) &&
!list_empty(&head->list)) {
dev_err(ctrl->device, "Duplicate unshared namespace %d\n",
info->nsid); goto out_put_ns_head;
} if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
dev_err(ctrl->device, "IDs don't match for shared namespace %d\n",
info->nsid); goto out_put_ns_head;
}
if (!multipath) {
dev_warn(ctrl->device, "Found shared namespace %d, but multipathing not supported.\n",
info->nsid);
dev_warn_once(ctrl->device, "Shared namespace support requires core_nvme.multipath=Y.\n");
}
}
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
srcu_read_lock_held(&ctrl->srcu)) { if (ns->head->ns_id == nsid) { if (!nvme_get_ns(ns)) continue;
ret = ns; break;
} if (ns->head->ns_id > nsid) break;
}
srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, "NVME_TARGET_PASSTHRU");
/* * Add the namespace to the controller list while keeping the list ordered.
*/ staticvoid nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
{ struct nvme_ns *tmp;
if (nvme_init_ns_head(ns, info)) goto out_cleanup_disk;
/* * If multipathing is enabled, the device name for all disks and not * just those that represent shared namespaces needs to be based on the * subsystem instance. Using the controller instance for private * namespaces could lead to naming collisions between shared and private * namespaces if they don't use a common numbering scheme. * * If multipathing is not enabled, disk names must use the controller * instance as shared namespaces will show up as multiple block * devices.
*/ if (nvme_ns_head_multipath(ns->head)) {
sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
ctrl->instance, ns->head->instance);
disk->flags |= GENHD_FL_HIDDEN;
} elseif (multipath) {
sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
ns->head->instance);
} else {
sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
ns->head->instance);
}
if (nvme_update_ns_info(ns, info)) goto out_unlink_ns;
mutex_lock(&ctrl->namespaces_lock); /* * Ensure that no namespaces are added to the ctrl list after the queues * are frozen, thereby avoiding a deadlock between scan and reset.
*/ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
mutex_unlock(&ctrl->namespaces_lock); goto out_unlink_ns;
}
nvme_ns_add_to_ctrl_list(ns);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
nvme_get_ctrl(ctrl);
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups)) goto out_cleanup_ns_from_list;
if (!nvme_ns_head_multipath(ns->head))
nvme_add_ns_cdev(ns);
/* * Set ns->disk->device->driver_data to ns so we can access * ns->head->passthru_err_log_enabled in * nvme_io_passthru_err_log_enabled_[store | show]().
*/
dev_set_drvdata(disk_to_dev(ns->disk), ns);
return;
out_cleanup_ns_from_list:
nvme_put_ctrl(ctrl);
mutex_lock(&ctrl->namespaces_lock);
list_del_rcu(&ns->list);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings); if (list_empty(&ns->head->list)) {
list_del_init(&ns->head->entry); /* * If multipath is not configured, we still create a namespace * head (nshead), but head->disk is not initialized in that * case. As a result, only a single reference to nshead is held * (via kref_init()) when it is created. Therefore, ensure that * we do not release the reference to nshead twice if head->disk * is not present.
*/ if (ns->head->disk)
last_path = true;
}
mutex_unlock(&ctrl->subsys->lock); if (last_path)
nvme_put_ns_head(ns->head);
nvme_put_ns_head(ns->head);
out_cleanup_disk:
put_disk(disk);
out_free_ns:
kfree(ns);
}
if (ns) {
nvme_ns_remove(ns);
nvme_put_ns(ns);
}
}
/*
 * Validate an existing namespace against the supplied namespace info.
 *
 * If the identifiers in @info differ from the ones stored in the namespace
 * head, the namespace is handled as if the update had failed with
 * NVME_SC_INVALID_NS | NVME_STATUS_DNR.  Otherwise the namespace is updated
 * through nvme_update_ns_info() and its result decides what happens next.
 */
static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
	int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;

	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
		dev_err(ns->ctrl->device,
			"identifiers changed for nsid %d\n", ns->head->ns_id);
		goto out;
	}

	ret = nvme_update_ns_info(ns, info);
out:
	/*
	 * Only remove the namespace if we got a fatal error back from the
	 * device, otherwise ignore the error and just move on.
	 *
	 * TODO: we should probably schedule a delayed retry here.
	 */
	if (ret > 0 && (ret & NVME_STATUS_DNR))
		nvme_ns_remove(ns);
}
staticvoid nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{ struct nvme_ns_info info = { .nsid = nsid }; struct nvme_ns *ns; int ret = 1;
if (nvme_identify_ns_descs(ctrl, &info)) return;
if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
dev_warn(ctrl->device, "command set not reported for nsid: %d\n", nsid); return;
}
/* * If available try to use the Command Set Independent Identify Namespace * data structure to find all the generic information that is needed to * set up a namespace. If not fall back to the legacy version.
*/ if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
(info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
ctrl->vs >= NVME_VS(2, 0, 0))
ret = nvme_ns_info_from_id_cs_indep(ctrl, &info); if (ret > 0)
ret = nvme_ns_info_from_identify(ctrl, &info);
if (info.is_removed)
nvme_ns_remove_by_nsid(ctrl, nsid);
/* * Ignore the namespace if it is not ready. We will get an AEN once it * becomes ready and restart the scan.
*/ if (ret || !info.is_ready) return;
/**
 * struct async_scan_info - keeps track of controller & NSIDs to scan
 * @ctrl:	Controller on which namespaces are being scanned
 * @next_nsid:	Index of next NSID to scan in ns_list
 * @ns_list:	Pointer to list of NSIDs to scan
 *
 * Note: There is a single async_scan_info structure shared by all instances
 * of nvme_scan_ns_async() scanning a given controller, so the atomic
 * operations on next_nsid are critical to ensure each instance scans a unique
 * NSID.
 */
struct async_scan_info {
	struct nvme_ctrl *ctrl;
	atomic_t next_nsid;
	__le32 *ns_list;
};
log = kzalloc(log_size, GFP_KERNEL); if (!log) return;
/* * We need to read the log to clear the AEN, but we don't want to rely * on it for the changed namespace information as userspace could have * raced with us in reading the log page, which could cause us to miss * updates.
*/
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
NVME_CSI_NVM, log, log_size, 0); if (error)
dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error);
/* No tagset on a live ctrl means IO queues could not created */ if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset) return;
/* * Identify controller limits can change at controller reset due to * new firmware download, even though it is not common we cannot ignore * such scenario. Controller's non-mdts limits are reported in the unit * of logical blocks that is dependent on the format of attached * namespace. Hence re-read the limits at the time of ns allocation.
*/
ret = nvme_init_non_mdts_limits(ctrl); if (ret < 0) {
dev_warn(ctrl->device, "reading non-mdts-limits failed: %d\n", ret); return;
}
if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
dev_info(ctrl->device, "rescanning namespaces.\n");
nvme_clear_changed_ns_log(ctrl);
}
mutex_lock(&ctrl->scan_lock); if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) {
nvme_scan_ns_sequential(ctrl);
} else { /* * Fall back to sequential scan if DNR is set to handle broken * devices which should support Identify NS List (as per the VS * they report) but don't actually support it.
*/
ret = nvme_scan_ns_list(ctrl); if (ret > 0 && ret & NVME_STATUS_DNR)
nvme_scan_ns_sequential(ctrl);
}
mutex_unlock(&ctrl->scan_lock);
/* Requeue if we have missed AENs */ if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events))
nvme_queue_scan(ctrl); #ifdef CONFIG_NVME_MULTIPATH elseif (ctrl->ana_log_buf) /* Re-read the ANA log page to not miss updates */
queue_work(nvme_wq, &ctrl->ana_work); #endif
}
/* * This function iterates the namespace list unlocked to allow recovery from * controller failure. It is up to the caller to ensure the namespace list is * not modified by scan work while this function is executing.
*/ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{ struct nvme_ns *ns, *next;
LIST_HEAD(ns_list);
/* * make sure to requeue I/O to all namespaces as these * might result from the scan itself and must complete * for the scan_work to make progress
*/
nvme_mpath_clear_ctrl_paths(ctrl);
/* * Unquiesce io queues so any pending IO won't hang, especially * those submitted from scan work
*/
nvme_unquiesce_io_queues(ctrl);
/* prevent racing with ns scanning */
flush_work(&ctrl->scan_work);
/* * The dead states indicates the controller was not gracefully * disconnected. In that case, we won't be able to flush any data while * removing the namespaces' disks; fail all the queues now to avoid * potentially having to clean up the failed sync later.
*/ if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
nvme_mark_namespaces_dead(ctrl);
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
/* * The transport drivers must guarantee AER submission here is safe by * flushing ctrl async_event_work after changing the controller state * from LIVE and before freeing the admin queue.
*/ if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
ctrl->ops->submit_async_event(ctrl);
}
switch (aer_notice_type) { case NVME_AER_NOTICE_NS_CHANGED:
set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
nvme_queue_scan(ctrl); break; case NVME_AER_NOTICE_FW_ACT_STARTING: /* * We are (ab)using the RESETTING state to prevent subsequent * recovery actions from interfering with the controller's * firmware activation.
*/ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
requeue = false;
queue_work(nvme_wq, &ctrl->fw_act_work);
} break; #ifdef CONFIG_NVME_MULTIPATH case NVME_AER_NOTICE_ANA: if (!ctrl->ana_log_buf) break;
queue_work(nvme_wq, &ctrl->ana_work); break; #endif case NVME_AER_NOTICE_DISC_CHANGED:
ctrl->aen_result = result; break; default:
dev_warn(ctrl->device, "async event result %08x\n", result);
} return requeue;
}
/*
 * React to a persistent internal error reported through an asynchronous
 * event: log a warning and request a controller reset.  The return value of
 * nvme_reset_ctrl() is not checked.
 */
static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
{
	dev_warn(ctrl->device,
		 "resetting controller due to persistent internal error\n");
	nvme_reset_ctrl(ctrl);
}
if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return;
trace_nvme_async_event(ctrl, result); switch (aer_type) { case NVME_AER_NOTICE:
requeue = nvme_handle_aen_notice(ctrl, result); break; case NVME_AER_ERROR: /* * For a persistent internal error, don't run async_event_work * to submit a new AER. The controller reset will do it.
*/ if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
nvme_handle_aer_persistent_error(ctrl); return;
}
fallthrough; case NVME_AER_SMART: case NVME_AER_CSS: case NVME_AER_VS:
ctrl->aen_result = result; break; default: break;
}
if (requeue)
queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, conststruct blk_mq_ops *ops, unsignedint cmd_size)
{ struct queue_limits lim = {}; int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; if (ctrl->ops->flags & NVME_F_FABRICS) /* Reserved for fabric connect and keep alive */
set->reserved_tags = 2;
set->numa_node = ctrl->numa_node; if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
ret = blk_mq_alloc_tag_set(set); if (ret) return ret;
ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL); if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q); goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q); goto out_cleanup_admin_q;
}
}
/*
 * Tear down the admin queue, the fabrics queue (for controllers flagged
 * NVME_F_FABRICS) and finally the admin tag set of @ctrl.
 */
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
	/*
	 * Keep-alive work must not be running any more once the queue is
	 * destroyed and the tag set is freed, so stop it first.
	 */
	nvme_stop_keep_alive(ctrl);

	blk_mq_destroy_queue(ctrl->admin_q);
	blk_put_queue(ctrl->admin_q);

	/* The fabrics queue is only handled for fabrics controllers. */
	if ((ctrl->ops->flags & NVME_F_FABRICS) != 0) {
		blk_mq_destroy_queue(ctrl->fabrics_q);
		blk_put_queue(ctrl->fabrics_q);
	}

	blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, conststruct blk_mq_ops *ops, unsignedint nr_maps, unsignedint cmd_size)
{ int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1); /* * Some Apple controllers requires tags to be unique across admin and * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
*/ if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH; elseif (ctrl->ops->flags & NVME_F_FABRICS) /* Reserved for fabric connect */
set->reserved_tags = 1;
set->numa_node = ctrl->numa_node; if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
set->nr_maps = nr_maps;
ret = blk_mq_alloc_tag_set(set); if (ret) return ret;
/* * persistent discovery controllers need to send indication to userspace * to re-read the discovery log page to learn about possible changes * that were missed. We identify persistent discovery controllers by * checking that they started once before, hence are reconnecting back.
*/ if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
nvme_discovery_ctrl(ctrl)) { if (!ctrl->kato) {
nvme_stop_keep_alive(ctrl);
ctrl->kato = NVME_DEFAULT_KATO;
nvme_start_keep_alive(ctrl);
}
nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
}
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
nvme_unquiesce_io_queues(ctrl);
nvme_mpath_update(ctrl);
}
if (subsys) {
mutex_lock(&nvme_subsystems_lock);
list_del(&ctrl->subsys_entry);
sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
mutex_unlock(&nvme_subsystems_lock);
}
ctrl->ops->free_ctrl(ctrl);
if (subsys)
nvme_put_subsystem(subsys);
}
/* * Initialize a NVMe controller structures. This needs to be called during * earliest initialization so that we have the initialized structured around * during probing. * * On success, the caller must use the nvme_put_ctrl() to release this when * needed, which also invokes the ops->free_ctrl() callback.
*/ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, conststruct nvme_ctrl_ops *ops, unsignedlong quirks)
{ int ret;
/* * On success, returns with an elevated controller reference and caller must * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
*/ int nvme_add_ctrl(struct nvme_ctrl *ctrl)
{ int ret;
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); if (ret) return ret;
cdev_init(&ctrl->cdev, &nvme_dev_fops);
ctrl->cdev.owner = ctrl->ops->module;
ret = cdev_device_add(&ctrl->cdev, ctrl->device); if (ret) return ret;
/* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST.
*/
ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
dev_pm_qos_update_user_latency_tolerance(ctrl->device,
min(default_ps_max_latency_us, (unsignedlong)S32_MAX));
/* let I/O to all namespaces fail in preparation for surprise removal */ void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{ struct nvme_ns *ns; int srcu_idx;
void nvme_start_freeze(struct nvme_ctrl *ctrl)
{ struct nvme_ns *ns; int srcu_idx;
set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
srcu_read_lock_held(&ctrl->srcu)) /* * Typical non_owner use case is from pci driver, in which * start_freeze is called from timeout work function, but * unfreeze is done in reset work context
*/
blk_freeze_queue_start_non_owner(ns->queue);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
/*
 * Quiesce the I/O tag set of @ctrl.  Does nothing when the controller has no
 * I/O tag set.
 */
void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
{
	if (!ctrl->tagset)
		return;

	if (test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) {
		/*
		 * NVME_CTRL_STOPPED was already set, so the quiesce is not
		 * issued again; only wait for it to be done.
		 */
		blk_mq_wait_quiesce_done(ctrl->tagset);
		return;
	}

	/* This call set NVME_CTRL_STOPPED, so it issues the quiesce. */
	blk_mq_quiesce_tagset(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
/*
 * Unquiesce the I/O tag set of @ctrl.  Does nothing when the controller has
 * no I/O tag set or when NVME_CTRL_STOPPED was not set.
 */
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
{
	/*
	 * Short-circuit evaluation keeps the flag untouched when there is no
	 * tag set; the unquiesce is only issued by the caller that actually
	 * cleared NVME_CTRL_STOPPED.
	 */
	if (ctrl->tagset &&
	    test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
		blk_mq_unquiesce_tagset(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert. 0.121 Bemerkung:
(vorverarbeitet am 2026-04-25)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.