/*
 * Data SGLs are limited to a single descriptor's worth of SGL entries.
 * Segment counts are irrelevant for PRPs.
 */
#define NVME_MAX_SEGS	(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * Metadata SGLs only ever use the small descriptor, and its first entry is
 * consumed by the segment descriptor (which, for the data pointer, sits in
 * the SQE instead).
 */
#define NVME_MAX_META_SEGS \
	((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)

/*
 * One slot per page is held back: the last entry links to the next
 * descriptor.
 */
#define PRPS_PER_PAGE	((NVME_CTRL_PAGE_SIZE / sizeof(__le64)) - 1)

/*
 * An I/O may be unaligned at both its start and its end, so allow for up to
 * one partial controller page on either side.
 */
#define MAX_PRP_RANGE	(NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
/*
 * Cap, in MiB, on the Host Memory Buffer size per controller.  Registered
 * with mode 0444.
 */
static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

/*
 * Average request segment size at or above which SGLs are used; 0 disables
 * SGLs.  Registered with mode 0644.
 */
static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

/*
 * Number of queues to use for writes.  When left at zero, reads and writes
 * share one queue set.  Updates go through io_queue_count_ops.
 */
static unsigned int write_queues;
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

/*
 * Number of queues to use for polled I/O.  Updates go through
 * io_queue_count_ops.
 */
static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP)) return;
if (dev->dbbuf_dbs) { /* * Clear the dbbuf memory so the driver doesn't observe stale * values from the previous instantiation.
*/
memset(dev->dbbuf_dbs, 0, mem_size);
memset(dev->dbbuf_eis, 0, mem_size); return;
}
if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); /* Free memory and continue on */
nvme_dbbuf_dma_free(dev);
for (i = 1; i <= dev->online_queues; i++)
nvme_dbbuf_free(&dev->queues[i]);
}
}
/* * Ensure that the doorbell is updated before reading the event * index from memory. The controller needs to provide similar * ordering to ensure the event index is updated before reading * the doorbell.
*/
mb();
event_idx = le32_to_cpu(*dbbuf_ei); if (!nvme_dbbuf_need_event(event_idx, value, old_value)) returnfalse;
}
/* * The poll queue(s) doesn't have an IRQ (and hence IRQ * affinity), so use the regular blk-mq cpu mapping
*/
map->queue_offset = qoff; if (i != HCTX_TYPE_POLL && offset)
blk_mq_map_hw_queues(map, dev->dev, offset); else
blk_mq_map_queues(map);
qoff += map->nr_queues;
offset += map->nr_queues;
}
}
/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 *
 * @nvmeq:    submission queue whose tail may be published
 * @write_sq: true to publish the tail unconditionally; false to publish only
 *            when one more entry would make the tail reach last_sq_tail
 *
 * The MMIO doorbell write itself is skipped when
 * nvme_dbbuf_update_and_check_event() returns false.  last_sq_tail is
 * updated whenever this function gets past the early return.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		/* The tail index wraps at the queue depth. */
		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		/* Keep batching unless the next entry would hit the last written tail. */
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}
for (i = 0; i < iod->nr_dma_vecs; i++)
dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
iod->dma_vecs[i].len, rq_dma_dir(req));
mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
}
/* * PRP1 always points to the start of the DMA transfers. * * This is the only PRP (except for the list entries) that could be * non-aligned.
*/
prp1_dma = iter->addr;
prp_len = min(length, NVME_CTRL_PAGE_SIZE -
(iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
iod->total_len += prp_len;
iter->addr += prp_len;
iter->len -= prp_len;
length -= prp_len; if (!length) goto done;
if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { if (WARN_ON_ONCE(!iter->status)) goto bad_sgl; goto done;
}
/* * PRP2 is usually a list, but can point to data if all data to be * transferred fits into PRP1 + PRP2:
*/ if (length <= NVME_CTRL_PAGE_SIZE) {
prp2_dma = iter->addr;
iod->total_len += length; goto done;
}
if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
NVME_SMALL_POOL_SIZE / sizeof(__le64))
iod->flags |= IOD_SMALL_DESCRIPTOR;
if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { if (WARN_ON_ONCE(!iter->status)) goto bad_sgl; goto done;
}
/* * If we've filled the entire descriptor, allocate a new one that is * pointed to by the last entry in the previous PRP list. To * accommodate that, move the last actual entry to the new * descriptor.
*/ if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
dma_addr_t prp_list_dma;
done: /* * nvme_unmap_data uses the DPT field in the SQE to tear down the * mapping, so initialize it even for failures.
*/
iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma); if (unlikely(iter->status))
nvme_unmap_data(req); return iter->status;
bad_sgl:
dev_err_once(nvmeq->dev->dev, "Incorrectly formed request for payload:%d nents:%d\n",
blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req)); return BLK_STS_IOERR;
}
/* * Try to skip the DMA iterator for single segment requests, as that * significantly improves performance for small I/O sizes.
*/ if (blk_rq_nr_phys_segments(req) == 1) {
ret = nvme_pci_setup_data_simple(req, use_sgl); if (ret != BLK_STS_AGAIN) return ret;
}
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status;
/* * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue.
*/ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR;
if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) return nvme_fail_nonready_command(&dev->ctrl, req);
ret = nvme_prep_rq(req); if (unlikely(ret)) return ret;
spin_lock(&nvmeq->sq_lock);
nvme_sq_copy_cmd(nvmeq, &iod->cmd);
nvme_write_sq_db(nvmeq, bd->last);
spin_unlock(&nvmeq->sq_lock); return BLK_STS_OK;
}
staticbool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
{ /* * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue.
*/ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) returnfalse; if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) returnfalse;
/* We read the CQE phase first to check if the rest of the entry is valid */ staticinlinebool nvme_cqe_pending(struct nvme_queue *nvmeq)
{ struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];
/* * AEN requests are special as they don't time out and can * survive any kind of queue freeze and often don't respond to * aborts. We don't even bother to allocate a struct request * for them but rather special case them here.
*/ if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
nvme_complete_async_event(&nvmeq->dev->ctrl,
cqe->status, &cqe->result); return;
}
req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) {
dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n",
command_id, le16_to_cpu(cqe->sq_id)); return;
}
while (nvme_cqe_pending(nvmeq)) {
found = true; /* * load-load control dependency between phase and the rest of * the cqe requires a full read memory barrier
*/
dma_rmb();
nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
nvme_update_cq_head(nvmeq);
}
if (found)
nvme_ring_cq_doorbell(nvmeq); return found;
}
if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE;
}
/* * Poll for completions for any interrupt driven queue * Can be called from any context.
*/ staticvoid nvme_poll_irqdisable(struct nvme_queue *nvmeq)
{ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
staticint nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
{ struct nvme_dev *dev = to_nvme_dev(ctrl); int ret = 0;
/* * Taking the shutdown_lock ensures the BAR mapping is not being * altered by reset_work. Holding this lock before the RESETTING state * change, if successful, also ensures nvme_remove won't be able to * proceed to iounmap until we're done.
*/
mutex_lock(&dev->shutdown_lock); if (!dev->bar_mapped_size) {
ret = -ENODEV; goto unlock;
}
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
ret = -EBUSY; goto unlock;
}
if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
flags |= NVME_CQ_IRQ_ENABLED;
/* * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request.
*/
c.create_cq.opcode = nvme_admin_create_cq;
c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
c.create_cq.cqid = cpu_to_le16(qid);
c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_cq.cq_flags = cpu_to_le16(flags);
c.create_cq.irq_vector = cpu_to_le16(vector);
/* * Some drives have a bug that auto-enables WRRU if MEDIUM isn't * set. Since URGENT priority is zeroes, it makes all queues * URGENT.
*/ if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
flags |= NVME_SQ_PRIO_MEDIUM;
/* * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request.
*/
c.create_sq.opcode = nvme_admin_create_sq;
c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
c.create_sq.sqid = cpu_to_le16(qid);
c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_sq.sq_flags = cpu_to_le16(flags);
c.create_sq.cqid = cpu_to_le16(qid);
staticbool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{ /* If true, indicates loss of adapter communication, possibly by a * NVMe Subsystem reset.
*/ bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
/* If there is a reset/reinit ongoing, we shouldn't reset again. */ switch (nvme_ctrl_state(&dev->ctrl)) { case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: returnfalse; default: break;
}
/* We shouldn't reset unless the controller is on fatal error state * _or_ if we lost the communication with it.
*/ if (!(csts & NVME_CSTS_CFS) && !nssro) returnfalse;
returntrue;
}
/*
 * Log diagnostics before resetting a controller that appears to be down.
 *
 * @dev:  device about to be reset
 * @csts: controller status value that triggered the reset
 *
 * Always prints CSTS together with the PCI_STATUS config word (or the error
 * from reading it).  When CSTS reads back as all ones, two further warnings
 * suggest a power-saving problem and a set of boot parameters to try.
 */
static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);

	/* The hints below are only printed for an all-ones status read. */
	if (csts != ~0)
		return;

	dev_warn(dev->ctrl.device,
		 "Does your device have a faulty power saving mode enabled?\n");
	dev_warn(dev->ctrl.device,
		 "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n");
}
/* * Shutdown the device immediately if we see it is disconnected. This * unblocks PCIe error handling if the nvme driver is waiting in * error_resume for a device that has been removed. We can't unbind the * driver while the driver's error callback is waiting to complete, so * we're relying on a timeout to break that deadlock if a removal * occurs while reset work is running.
*/ if (pci_dev_is_disconnected(pdev))
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); if (nvme_state_terminal(&dev->ctrl)) goto disable;
/* If PCI error recovery process is happening, we cannot reset or * the recovery mechanism will surely fail.
*/
mb(); if (pci_channel_offline(pdev)) return BLK_EH_RESET_TIMER;
/* * Reset immediately if the controller is failed
*/ if (nvme_should_reset(dev, csts)) {
nvme_warn_reset(dev, csts); goto disable;
}
/* * Did we miss an interrupt?
*/ if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
nvme_poll(req->mq_hctx, NULL); else
nvme_poll_irqdisable(nvmeq);
/* * Shutdown immediately if controller times out while starting. The * reset work will see the pci device disabled when it gets the forced * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_DONE.
*/ switch (nvme_ctrl_state(&dev->ctrl)) { case NVME_CTRL_CONNECTING:
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
fallthrough; case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device, "I/O tag %d (%04x) QID %d timeout, disable controller\n",
req->tag, nvme_cid(req), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
nvme_dev_disable(dev, true); return BLK_EH_DONE; case NVME_CTRL_RESETTING: return BLK_EH_RESET_TIMER; default: break;
}
/* * Shutdown the controller immediately and schedule a reset if the * command was already aborted once before and still hasn't been * returned to the driver, or if this is the admin queue.
*/
opcode = nvme_req(req)->cmd->common.opcode; if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device, "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
nvme_req(req)->flags |= NVME_REQ_CANCELLED; goto disable;
}
/* * The aborted req will be completed on receiving the abort req. * We enable the timer again. If hit twice, it'll cause a device reset, * as the device then is in a faulty state.
*/ return BLK_EH_RESET_TIMER;
disable: if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) { if (nvme_state_terminal(&dev->ctrl))
nvme_dev_disable(dev, true); return BLK_EH_DONE;
}
nvme_dev_disable(dev, false); if (nvme_try_sched_reset(&dev->ctrl))
nvme_unquiesce_io_queues(&dev->ctrl); return BLK_EH_DONE;
}
if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) return;
/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
nvme_quiesce_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}
/*
 * Suspend every I/O queue of @dev.
 *
 * Walks queue indices from ctrl.queue_count - 1 down to 1 and calls
 * nvme_suspend_queue() on each; index 0 is deliberately left out of the
 * loop.
 */
static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(dev, i);
}
/*
 * Called only on a device that has been disabled and after all other threads
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
 *
 * Each queue from index ctrl.queue_count - 1 down to 1 is polled once via
 * nvme_poll_cq() while holding that queue's cq_poll_lock.
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
		nvme_poll_cq(&dev->queues[i], NULL);
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
}
staticint nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, int entry_size)
{ int q_depth = dev->q_depth; unsigned q_size_aligned = roundup(q_depth * entry_size,
NVME_CTRL_PAGE_SIZE);
/* * Ensure the reduced q_depth is above some threshold where it * would be better to map queues in system memory with the * original depth
*/ if (q_depth < 64) return -ENOMEM;
}
nvmeq->sq_tail = 0;
nvmeq->last_sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
wmb(); /* ensure the first interrupt sees the initialization */
}
/*
 * Try getting shutdown_lock while setting up IO queues.
 *
 * Returns 0 with dev->shutdown_lock held when the lock was free and the
 * controller is in the CONNECTING state.  Returns -ENODEV with the lock NOT
 * held otherwise, so callers must only unlock on success.
 */
static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
{
	/*
	 * Give up if the lock is being held by nvme_dev_disable.
	 */
	if (!mutex_trylock(&dev->shutdown_lock))
		return -ENODEV;

	/*
	 * Controller is in wrong state, fail early.
	 */
	if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
		mutex_unlock(&dev->shutdown_lock);
		return -ENODEV;
	}

	return 0;
}
staticint nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{ struct nvme_dev *dev = nvmeq->dev; int result;
u16 vector = 0;
clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
/* * A queue's vector matches the queue identifier unless the controller * has only one vector available.
*/ if (!polled)
vector = dev->num_vecs == 1 ? 0 : qid; else
set_bit(NVMEQ_POLLED, &nvmeq->flags);
result = adapter_alloc_cq(dev, qid, nvmeq, vector); if (result) return result;
result = adapter_alloc_sq(dev, qid, nvmeq); if (result < 0) return result; if (result) goto release_cq;
nvmeq->cq_vector = vector;
result = nvme_setup_io_queues_trylock(dev); if (result) return result;
nvme_init_queue(nvmeq, qid); if (!polled) {
result = queue_request_irq(nvmeq); if (result < 0) goto release_sq;
}
/*
 * Tear down the admin tag set of @dev.
 *
 * Does nothing when there is no admin queue or when that queue is already
 * marked dying.
 */
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
		nvme_unquiesce_admin_queue(&dev->ctrl);
		nvme_remove_admin_tag_set(&dev->ctrl);
	}
}
/* * If the device has been passed off to us in an enabled state, just * clear the enabled bit. The spec says we should set the 'shutdown * notification bits', but doing so may cause the device to complete * commands to the admin queue ... and we don't know what memory that * might be pointing at!
*/
result = nvme_disable_ctrl(&dev->ctrl, false); if (result < 0) { struct pci_dev *pdev = to_pci_dev(dev->dev);
/* * The NVMe Controller Reset method did not get an expected * CSTS.RDY transition, so something with the device appears to * be stuck. Use the lower level and bigger hammer PCIe * Function Level Reset to attempt restoring the device to its * initial state, and try again.
*/
result = pcie_reset_flr(pdev, false); if (result < 0) return result;
pci_restore_state(pdev);
result = nvme_disable_ctrl(&dev->ctrl, false); if (result < 0) return result;
dev_info(dev->ctrl.device, "controller reset completed after pcie flr\n");
}
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (result) return result;
for (i = dev->online_queues; i <= max; i++) { bool polled = i > rw_queues;
ret = nvme_create_queue(&dev->queues[i], i, polled); if (ret) break;
}
/* * Ignore failing Create SQ/CQ commands, we can continue with less * than the desired amount of queues, and even a controller without * I/O queues can still be used to issue admin commands. This might * be useful to upgrade a buggy firmware for example.
*/ return ret >= 0 ? 0 : ret;
}
/* * Controllers may support a CMB size larger than their BAR, for * example, due to being behind a bridge. Reduce the CMB to the * reported size of the BAR
*/
size = min(size, bar_size - offset);
if (!IS_ALIGNED(size, memremap_compat_align()) ||
!IS_ALIGNED(pci_resource_start(pdev, bar),
memremap_compat_align())) return;
/* * Tell the controller about the host side address mapping the CMB, * and enable CMB decoding for the NVMe 1.4+ scheme:
*/ if (NVME_CAP_CMBS(dev->ctrl.cap)) {
hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
(pci_bus_address(pdev, bar) + offset),
dev->bar + NVME_REG_CMBMSC);
}
if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
dev_warn(dev->ctrl.device, "failed to register the CMB\n");
hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC); return;
}
/* * If there is an IOMMU that can merge pages, try a virtually * non-contiguous allocation for a single segment first.
*/ if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) { if (!nvme_alloc_host_mem_single(dev, preferred)) return 0;
}
/* start big and work our way down */ for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0;
nvme_free_host_mem(dev);
}
}
return -ENOMEM;
}
staticint nvme_setup_host_mem(struct nvme_dev *dev)
{
u64 max = (u64)max_host_mem_size_mb * SZ_1M;
u64 preferred = (u64)dev->ctrl.hmpre * 4096;
u64 min = (u64)dev->ctrl.hmmin * 4096;
u32 enable_bits = NVME_HOST_MEM_ENABLE; int ret;
/* * If we already have a buffer allocated check if we can reuse it.
*/ if (dev->host_mem_descs) { if (dev->host_mem_size >= min)
enable_bits |= NVME_HOST_MEM_RETURN; else
nvme_free_host_mem(dev);
}
if (!dev->host_mem_descs) { if (nvme_alloc_host_mem(dev, min, preferred)) {
dev_warn(dev->ctrl.device, "failed to allocate host memory buffer.\n"); return 0; /* controller must work without HMB */
}
if (a == &dev_attr_cmb.attr ||
a == &dev_attr_cmbloc.attr ||
a == &dev_attr_cmbsz.attr) { if (!dev->cmbsz) return 0;
} if (a == &dev_attr_hmb.attr && !ctrl->hmpre) return 0;
/* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue.
*/ staticvoid nvme_calc_irq_sets(struct irq_affinity *affd, unsignedint nrirqs)
{ struct nvme_dev *dev = affd->priv; unsignedint nr_read_queues, nr_write_queues = dev->nr_write_queues;
/* * If there is no interrupt available for queues, ensure that * the default queue is set to 1. The affinity set size is * also set to one, but the irq core ignores it for this case. * * If only one interrupt is available or 'write_queue' == 0, combine * write and read queues. * * If 'write_queues' > 0, ensure it leaves room for at least one read * queue.
*/ if (!nrirqs) {
nrirqs = 1;
nr_read_queues = 0;
} elseif (nrirqs == 1 || !nr_write_queues) {
nr_read_queues = 0;
} elseif (nr_write_queues >= nrirqs) {
nr_read_queues = 1;
} else {
nr_read_queues = nrirqs - nr_write_queues;
}
/* * Poll queues don't need interrupts, but we need at least one I/O queue * left over for non-polled I/O.
*/
poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
/* * Initialize for the single interrupt case, will be updated in * nvme_calc_irq_sets().
*/
dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
dev->io_queues[HCTX_TYPE_READ] = 0;
/* * We need interrupts for the admin queue and each non-polled I/O queue, * but some Apple controllers require all queues to use the first * vector.
*/
irq_queues = 1; if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
irq_queues += (nr_io_queues - poll_queues); if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
flags &= ~PCI_IRQ_MSI; return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, flags,
&affd);
}
/*
 * Upper bound on the number of I/O queues to use for @dev.
 *
 * Returns 1 for controllers with the shared-tags quirk; otherwise the value
 * of blk_mq_num_possible_queues(0) plus the configured write and poll queue
 * counts.
 */
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
	/*
	 * If tags are shared with admin queue (Apple bug), then
	 * make sure we only use one IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		return 1;
	return blk_mq_num_possible_queues(0) + dev->nr_write_queues +
		dev->nr_poll_queues;
}
/* * Sample the module parameters once at reset time so that we have * stable values to work with.
*/
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
nr_io_queues = dev->nr_allocated_queues - 1;
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result;
if (nr_io_queues == 0) return 0;
/* * Free IRQ resources as soon as the NVMEQ_ENABLED bit transitions * from set to unset. If there is a window before the IRQ is truly freed, * pci_free_irq_vectors() jumping into this window will crash. * Also take the lock to avoid racing with pci_free_irq_vectors() in the * nvme_dev_disable() path.
*/
result = nvme_setup_io_queues_trylock(dev); if (result) return result; if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
pci_free_irq(pdev, 0, adminq);
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); if (result > 0) {
dev->q_depth = result;
dev->ctrl.sqsize = result - 1;
} else {
dev->cmb_use_sqes = false;
}
}
do {
size = db_bar_size(dev, nr_io_queues);
result = nvme_remap_bar(dev, size); if (!result) break; if (!--nr_io_queues) {
result = -ENOMEM; goto out_unlock;
}
} while (1);
adminq->q_db = dev->dbs;
retry: /* Deregister the admin queue's interrupt */ if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
pci_free_irq(pdev, 0, adminq);
/* * If we enable msix early due to not intx, disable it again before * setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
result = nvme_setup_irqs(dev, nr_io_queues); if (result <= 0) {
result = -EIO; goto out_unlock;
}
dev->num_vecs = result;
result = max(result - 1, 1);
dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
/* * Should investigate if there's a performance win from allocating * more queues than interrupt vectors; it might allow the submission * path to scale better, even if the receive path is limited by the * number of interrupts.
*/
result = queue_request_irq(adminq); if (result) goto out_unlock;
set_bit(NVMEQ_ENABLED, &adminq->flags);
mutex_unlock(&dev->shutdown_lock);
result = nvme_create_io_queues(dev); if (result || dev->online_queues < 2) return result;
/* Give up if we are racing with nvme_dev_disable() */ if (!mutex_trylock(&dev->shutdown_lock)) returnfalse;
/* Check if nvme_dev_disable() has been executed already */ if (!dev->online_queues) {
mutex_unlock(&dev->shutdown_lock); returnfalse;
}
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); /* free previously allocated queues that are no longer usable */
nvme_free_queues(dev, dev->online_queues);
mutex_unlock(&dev->shutdown_lock); returntrue;
}
staticint nvme_pci_enable(struct nvme_dev *dev)
{ int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); unsignedint flags = PCI_IRQ_ALL_TYPES;
if (pci_enable_device_mem(pdev)) return result;
pci_set_master(pdev);
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV; goto disable;
}
/* * Some devices and/or platforms don't advertise or work with INTx * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll * adjust this later.
*/ if (dev->ctrl.quirks & NVME_QUIRK_BROKEN_MSI)
flags &= ~PCI_IRQ_MSI;
result = pci_alloc_irq_vectors(pdev, 1, 1, flags); if (result < 0) goto disable;
/* * Some Apple controllers require a non-standard SQE size. * Interestingly they also seem to ignore the CC:IOSQES register * so we don't bother updating it here.
*/ if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
dev->io_sqes = 7; else
dev->io_sqes = NVME_NVM_IOSQES;
/* * Controllers with the shared tags quirk need the IO queue to be * big enough so that we get 32 tags for the admin queue
*/ if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
(dev->q_depth < (NVME_AQ_DEPTH + 2))) {
dev->q_depth = NVME_AQ_DEPTH + 2;
dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
dev->q_depth);
}
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
nvme_map_cmb(dev);
pci_save_state(pdev);
result = nvme_pci_configure_admin_queue(dev); if (result) goto free_irq; return result;
mutex_lock(&dev->shutdown_lock);
dead = nvme_pci_ctrl_is_dead(dev); if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) { if (pci_is_enabled(pdev))
nvme_start_freeze(&dev->ctrl); /* * Give the controller a chance to complete all entered requests * if doing a safe shutdown.
*/ if (!dead && shutdown)
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
}
nvme_quiesce_io_queues(&dev->ctrl);
if (!dead && dev->ctrl.queue_count > 0) {
nvme_delete_io_queues(dev);
nvme_disable_ctrl(&dev->ctrl, shutdown);
nvme_poll_irqdisable(&dev->queues[0]);
}
nvme_suspend_io_queues(dev);
nvme_suspend_queue(dev, 0);
pci_free_irq_vectors(pdev); if (pci_is_enabled(pdev))
pci_disable_device(pdev);
nvme_reap_pending_cqes(dev);
/* * The driver will not be starting up queues again if shutting down so * must flush all entered requests to their failed completion to avoid * deadlocking blk-mq hot-cpu notifier.
*/ if (shutdown) {
nvme_unquiesce_io_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
nvme_unquiesce_admin_queue(&dev->ctrl);
}
mutex_unlock(&dev->shutdown_lock);
}
if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
result = -ENODEV; goto out;
}
/* * If we're called to reset a live controller first shut it down before * moving on.
*/ if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
nvme_sync_queues(&dev->ctrl);
mutex_lock(&dev->shutdown_lock);
result = nvme_pci_enable(dev); if (result) goto out_unlock;
nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
/* * Introduce CONNECTING state from nvme-fc/rdma transports to mark the * initializing procedure here.
*/ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device, "failed to mark controller CONNECTING\n");
result = -EBUSY; goto out;
}
result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); if (result) goto out;
if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; else
dev->ctrl.max_integrity_segments = 1;
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev); if (result < 0) goto out;
nvme_update_attrs(dev);
result = nvme_setup_io_queues(dev); if (result) goto out;
/* * Freeze and update the number of I/O queues as those might have * changed. If there are no I/O queues left after this reset, keep the * controller around but remove all namespaces.
*/ if (dev->online_queues > 1) {
nvme_dbbuf_set(dev);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl); if (!nvme_pci_update_nr_queues(dev)) goto out;
nvme_unfreeze(&dev->ctrl);
} else {
dev_warn(dev->ctrl.device, "IO queues lost\n");
nvme_mark_namespaces_dead(&dev->ctrl);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_free_tagset(dev);
}
/* * If only admin queue live, keep it to do further investigation or * recovery.
*/ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device, "failed to mark controller live state\n");
result = -ENODEV; goto out;
}
nvme_start_ctrl(&dev->ctrl); return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out: /* * Set state to deleting now to avoid blocking nvme_wait_reset(), which * may be holding this pci_dev's device lock.
*/
dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
result);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_dev_disable(dev, true);
nvme_sync_queues(&dev->ctrl);
nvme_mark_namespaces_dead(&dev->ctrl);
nvme_unquiesce_io_queues(&dev->ctrl);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}
staticunsignedlong check_vendor_combination_bug(struct pci_dev *pdev)
{ if (pdev->vendor == 0x144d && pdev->device == 0xa802) { /* * Several Samsung devices seem to drop off the PCIe bus * randomly when APST is on and uses the deepest sleep state. * This has been observed on a Samsung "SM951 NVMe SAMSUNG * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD * 950 PRO 256GB", but it seems to be restricted to two Dell * laptops.
*/ if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
(dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
dmi_match(DMI_PRODUCT_NAME, "Precision 5510"))) return NVME_QUIRK_NO_DEEPEST_PS;
} elseif (pdev->vendor == 0x144d && pdev->device == 0xa804) { /* * Samsung SSD 960 EVO drops off the PCIe bus after system * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as * within few minutes after bootup on a Coffee Lake board - * ASUS PRIME Z370-A
*/ if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
(dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) return NVME_QUIRK_NO_APST;
} elseif ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
pdev->device == 0xa808 || pdev->device == 0xa809)) ||
(pdev->vendor == 0x1e0f && pdev->device == 0x0001)) { /* * Forcing to use host managed nvme power settings for * lowest idle power with quick resume latency on * Samsung and Toshiba SSDs based on suspend behavior * on Coffee Lake board for LENOVO C640
*/ if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
dmi_match(DMI_BOARD_NAME, "LNVNB161216")) return NVME_QUIRK_SIMPLE_SUSPEND;
} elseif (pdev->vendor == 0x2646 && (pdev->device == 0x2263 ||
pdev->device == 0x500f)) { /* * Exclude some Kingston NV1 and A2000 devices from * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a * lot of energy with s2idle sleep on some TUXEDO platforms.
*/ if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") ||
dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1")) return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
} elseif (pdev->vendor == 0x144d && pdev->device == 0xa80d) { /* * Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND * because of high power consumption (> 2 Watt) in s2idle * sleep. Only some boards with Intel CPU are affected. * (Note for testing: Samsung 990 Evo Plus has same PCI ID)
*/ if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") ||
dmi_match(DMI_BOARD_NAME, "GMxPXxx") ||
dmi_match(DMI_BOARD_NAME, "GXxMRXx") ||
dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
dmi_match(DMI_BOARD_NAME, "PH4PG31") ||
dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") ||
dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
}
/* * NVMe SSD drops off the PCIe bus after system idle * for 10 hours on a Lenovo N60z board.
*/ if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) return NVME_QUIRK_NO_APST;
return 0;
}
/*
 * nvme_pci_alloc_dev() - allocate a struct nvme_dev for @pdev and begin
 * controller initialisation.
 *
 * @pdev: PCI device the controller structure is created for.
 * @id:   matching PCI id entry; its driver_data seeds the quirk mask.
 *
 * The quirk mask is extended with whatever check_vendor_combination_bug()
 * returns for @pdev. NVME_QUIRK_SIMPLE_SUSPEND is then added when all three
 * hold: noacpi is not set, NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND is not set, and
 * acpi_storage_d3() is true for the device.
 *
 * Returns ERR_PTR(-ENOMEM) when the allocation fails.
 *
 * NOTE(review): `staticstruct`, `conststruct` and `unsignedlong` look like
 * keywords fused by text extraction (presumably `static struct`,
 * `const struct`, `unsigned long`) - confirm against the original file.
 * NOTE(review): this excerpt stops right after the nvme_init_ctrl() call; the
 * out_put_device label, the success return and the closing brace are not
 * visible here.
 */
staticstruct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, conststruct pci_device_id *id)
{ unsignedlong quirks = id->driver_data; int node = dev_to_node(&pdev->dev); struct nvme_dev *dev; int ret = -ENOMEM;
/*
 * Zeroed allocation on the device's node, sized by struct_size() for
 * nr_node_ids trailing descriptor_pools entries.
 */
dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
mutex_init(&dev->shutdown_lock);
quirks |= check_vendor_combination_bug(pdev); if (!noacpi &&
!(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) &&
acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on * platforms that support kernel managed suspend.
*/
dev_info(&pdev->dev, "platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
/* Hand the final quirk mask to the core controller initialisation. */
ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks); if (ret) goto out_put_device;
/*
 * nvme_probe() - bring up an NVMe controller for @pdev.
 *
 * @pdev: PCI device being probed.
 * @id:   matching PCI id entry, forwarded to nvme_pci_alloc_dev().
 *
 * Visible sequence: allocate the nvme_dev, add the controller, map the
 * device, allocate the iod mempool, enable the PCI function, allocate the
 * admin tag set, move the controller to NVME_CTRL_CONNECTING, finish
 * controller init, pick max_integrity_segments, allocate dbbuf memory, set up
 * host memory, update attributes, set up the I/O queues and finally move the
 * controller to NVME_CTRL_LIVE.
 *
 * Returns PTR_ERR() of the allocation result when nvme_pci_alloc_dev() fails.
 * Every later failure jumps to an out_* label.
 *
 * NOTE(review): none of the out_* labels (out_put_ctrl, out_uninit_ctrl,
 * out_dev_unmap, out_release_iod_mempool, out_disable) nor a success return
 * are visible in this excerpt.
 * NOTE(review): the trailing comment about "waiting for the reset state" and
 * the nvme_disable_prepare_reset()/nvme_sync_queues() calls before the closing
 * brace look like they belong to a different function that was merged in by
 * text extraction - confirm against the original file.
 * NOTE(review): `staticint` and `conststruct` look like fused keywords.
 */
staticint nvme_probe(struct pci_dev *pdev, conststruct pci_device_id *id)
{ struct nvme_dev *dev; int result = -ENOMEM;
dev = nvme_pci_alloc_dev(pdev, id); if (IS_ERR(dev)) return PTR_ERR(dev);
result = nvme_add_ctrl(&dev->ctrl); if (result) goto out_put_ctrl;
result = nvme_dev_map(dev); if (result) goto out_uninit_ctrl;
result = nvme_pci_alloc_iod_mempool(dev); if (result) goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
result = nvme_pci_enable(dev); if (result) goto out_release_iod_mempool;
result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
&nvme_mq_admin_ops, sizeof(struct nvme_iod)); if (result) goto out_disable;
/* * Mark the controller as connecting before sending admin commands to * allow the timeout handler to do the right thing.
*/ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device, "failed to mark controller CONNECTING\n");
result = -EBUSY; goto out_disable;
}
result = nvme_init_ctrl_finish(&dev->ctrl, false); if (result) goto out_disable;
/*
 * NVME_MAX_META_SEGS integrity segments when the controller supports
 * metadata SGLs, otherwise a single segment.
 */
if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; else
dev->ctrl.max_integrity_segments = 1;
/* Return value is not checked here. */
nvme_dbbuf_dma_alloc(dev);
/* Only a negative result is treated as a failure for host memory setup. */
result = nvme_setup_host_mem(dev); if (result < 0) goto out_disable;
nvme_update_attrs(dev);
result = nvme_setup_io_queues(dev); if (result) goto out_disable;
/* A missing I/O tag set is only warned about; probing continues. */
if (!dev->ctrl.tagset)
dev_warn(dev->ctrl.device, "IO queues not created\n");
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device, "failed to mark controller live state\n");
result = -ENODEV; goto out_disable;
}
/* * We don't need to check the return value from waiting for the reset * state as pci_dev device lock is held, making it impossible to race * with ->remove().
*/
nvme_disable_prepare_reset(dev, false);
nvme_sync_queues(&dev->ctrl);
}
/*
 * NOTE(review): the header below declares nvme_remove() returning void, but
 * the body uses `ctrl`, `ndev` and `ret`, none of which are declared in the
 * visible text, and it returns values. The body reads like a suspend path
 * that text extraction merged under this header - confirm against the
 * original file before relying on this block.
 *
 * What the visible body does:
 *  - Returns nvme_disable_prepare_reset(ndev, true) straight away when
 *    pm_suspend_via_firmware() is true, ctrl->npss is zero, ASPM is not
 *    enabled for the device, or NVME_QUIRK_SIMPLE_SUSPEND is set.
 *  - Skips to the unfreeze label if the controller is not NVME_CTRL_LIVE.
 *  - If ndev->hmb is set, calls nvme_set_host_mem(ndev, 0).
 *  - Stores the current power state in ndev->last_ps, saves PCI state, then
 *    requests power state ctrl->npss.
 *  - A negative result from any of these steps goes straight to unfreeze.
 *  - A positive result from nvme_set_power_state() drops the saved PCI state,
 *    calls nvme_disable_prepare_reset(ndev, true) and clears ctrl->npss.
 *  - The unfreeze label calls nvme_unfreeze(ctrl) and returns ret.
 */
/* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in * order to proceed.
*/ staticvoid nvme_remove(struct pci_dev *pdev)
{ struct nvme_dev *dev = pci_get_drvdata(pdev);
/* * The platform does not remove power for a kernel managed suspend so * use host managed nvme power settings for lowest idle power if * possible. This should have quicker resume latency than a full device * shutdown. But if the firmware is involved after the suspend or the * device does not support any non-default power states, shut down the * device fully. * * If ASPM is not enabled for the device, shut down the device and allow * the PCI bus layer to put it into D3 in order to take the PCIe link * down, so as to allow the platform to achieve its minimum low-power * state (which may not be possible if the link is up).
*/ if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) return nvme_disable_prepare_reset(ndev, true);
if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) goto unfreeze;
/* * Host memory access may not be successful in a system suspend state, * but the specification allows the controller to access memory in a * non-operational power state.
*/ if (ndev->hmb) {
ret = nvme_set_host_mem(ndev, 0); if (ret < 0) goto unfreeze;
}
ret = nvme_get_power_state(ctrl, &ndev->last_ps); if (ret < 0) goto unfreeze;
/* * A saved state prevents pci pm from generically controlling the * device's power. If we're using protocol specific settings, we don't * want pci interfering.
*/
pci_save_state(pdev);
ret = nvme_set_power_state(ctrl, ctrl->npss); if (ret < 0) goto unfreeze;
/* Only a positive ret reaches this test; negatives left via the goto above. */
if (ret) { /* discard the saved state */
pci_load_saved_state(pdev, NULL);
/* * Clearing npss forces a controller reset on resume. The * correct value will be rediscovered then.
*/
ret = nvme_disable_prepare_reset(ndev, true);
ctrl->npss = 0;
}
unfreeze:
nvme_unfreeze(ctrl); return ret;
}
/*
 * NOTE(review): the function header for this body is not visible in this
 * excerpt; `state` and `dev` are declared outside the visible text.
 * Presumably this is the tail of a PCI error-detection callback - confirm
 * against the original file.
 *
 * Result for each channel state handled below:
 *  - pci_channel_io_normal:       PCI_ERS_RESULT_CAN_RECOVER.
 *  - pci_channel_io_frozen:       warn, then try to enter NVME_CTRL_RESETTING.
 *                                 If that fails, nvme_dev_disable(dev, true)
 *                                 and PCI_ERS_RESULT_DISCONNECT; otherwise
 *                                 nvme_dev_disable(dev, false) and
 *                                 PCI_ERS_RESULT_NEED_RESET.
 *  - pci_channel_io_perm_failure: warn and PCI_ERS_RESULT_DISCONNECT.
 *  - any other value:             PCI_ERS_RESULT_NEED_RESET.
 */
/* * A frozen channel requires a reset. When detected, this method will * shutdown the controller to quiesce. The controller will be restarted * after the slot reset through driver's slot_reset callback.
*/ switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; case pci_channel_io_frozen:
dev_warn(dev->ctrl.device, "frozen state error detected, reset controller\n"); if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
nvme_dev_disable(dev, true); return PCI_ERS_RESULT_DISCONNECT;
}
nvme_dev_disable(dev, false); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure:
dev_warn(dev->ctrl.device, "failure state error detected, request disconnect\n"); return PCI_ERS_RESULT_DISCONNECT;
} return PCI_ERS_RESULT_NEED_RESET;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.