// SPDX-License-Identifier: GPL-2.0
/*
 * Apple ANS NVM Express device driver
 * Copyright The Asahi Linux Contributors
 *
 * Based on the pci.c NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 * and on the rdma.c NVMe over Fabrics RDMA host code.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
/*
 * This controller is a bit weird in the way command tags work: Both the
 * admin and the IO queue share the same tag space. Additionally, tags
 * cannot be higher than 0x40 which effectively limits the combined
 * queue depth to 0x40. Instead of wasting half of that on the admin queue
 * which gets much less traffic we instead reduce its size here.
 * The controller also doesn't support async events such that no space must
 * be reserved for NVME_NR_AEN_COMMANDS.
 */
#define APPLE_NVME_AQ_DEPTH 2
#define APPLE_NVME_AQ_MQ_TAG_DEPTH (APPLE_NVME_AQ_DEPTH - 1)
/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ 4096
#define NVME_MAX_SEGS 127
/* * This controller comes with an embedded IOMMU known as NVMMU. * The NVMMU is pointed to an array of TCBs indexed by the command tag. * Each command must be configured inside this structure before it's allowed * to execute, including commands that don't require DMA transfers. * * An exception to this are Apple's vendor-specific commands (opcode 0xD8 on the * admin queue): Those commands must still be added to the NVMMU but the DMA * buffers cannot be represented as PRPs and must instead be allowed using SART. * * Programming the PRPs to the same values as those in the submission queue * looks rather silly at first. This hardware is however designed for a kernel * that runs the NVMMU code in a higher exception level than the NVMe driver. * In that setting the NVMe driver first programs the submission queue entry * and then executes a hypercall to the code that is allowed to program the * NVMMU. The NVMMU driver then creates a shadow copy of the PRPs while * verifying that they don't point to kernel text, data, pagetables, or similar * protected areas before programming the TCB to point to this shadow copy. * Since Linux doesn't do any of that we may as well just point both the queue * and the TCB PRP pointer to the same memory.
*/ struct apple_nvmmu_tcb {
u8 opcode;
/* * The Apple NVMe controller only supports a single admin and a single IO queue * which are both limited to 64 entries and share a single interrupt. * * The completion queue works as usual. The submission "queue" instead is * an array indexed by the command tag on this hardware. Commands must also be * present in the NVMMU's tcb array. They are triggered by writing their tag to * a MMIO register.
*/ struct apple_nvme_queue { struct nvme_command *sqes; struct nvme_completion *cqes; struct apple_nvmmu_tcb *tcbs;
/*
 * The apple_nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP chunk allocations in addition
 * to the actual struct scatterlist.
 */
struct apple_nvme_iod {
	struct nvme_request req;
	struct nvme_command cmd;
	struct apple_nvme_queue *q;
	int npages; /* In the PRP list. 0 means small pool in use */
	int nents; /* Used in scatterlist */
	dma_addr_t first_dma;
	unsigned int dma_len; /* length of single DMA segment mapping */
	struct scatterlist *sg;
};
if (nvme_is_write(cmd))
tcb->dma_flags = APPLE_ANS_TCB_DMA_TO_DEVICE; else
tcb->dma_flags = APPLE_ANS_TCB_DMA_FROM_DEVICE;
memcpy(&q->sqes[tag], cmd, sizeof(*cmd));
/*
 * This lock here doesn't make much sense at first glance but
 * removing it will result in occasional missed completion
 * interrupts even though the commands still appear on the CQ.
 * It's unclear why this happens but our best guess is that
 * there is a bug in the firmware triggered when a new command
 * is issued while we're inside the irq handler between the
 * NVMMU invalidation (and making the tag available again)
 * and the final CQ update.
 */
spin_lock_irq(&anv->lock);
writel(tag, q->sq_db);
spin_unlock_irq(&anv->lock);
}
/* * From pci.c: * Will slightly overestimate the number of pages needed. This is OK * as it only leads to a small amount of wasted memory for the lifetime of * the I/O.
*/ staticinline size_t apple_nvme_iod_alloc_size(void)
{ constunsignedint nprps = DIV_ROUND_UP(
NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE); constint npages = DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); const size_t alloc_size = sizeof(__le64 *) * npages + sizeof(struct scatterlist) * NVME_MAX_SEGS;
/* * load-load control dependency between phase and the rest of * the cqe requires a full read memory barrier
*/
dma_rmb();
apple_nvme_handle_cqe(q, iob, q->cq_head);
apple_nvme_update_cq_head(q);
}
spin_lock_irqsave(&anv->lock, flags); if (apple_nvme_handle_cq(&anv->ioq, false))
handled = true; if (apple_nvme_handle_cq(&anv->adminq, false))
handled = true;
spin_unlock_irqrestore(&anv->lock, flags);
if (handled) return IRQ_HANDLED; return IRQ_NONE;
}
staticint apple_nvme_create_cq(struct apple_nvme *anv)
{ struct nvme_command c = {};
/* * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request.
*/
c.create_cq.opcode = nvme_admin_create_cq;
c.create_cq.prp1 = cpu_to_le64(anv->ioq.cq_dma_addr);
c.create_cq.cqid = cpu_to_le16(1);
c.create_cq.qsize = cpu_to_le16(APPLE_ANS_MAX_QUEUE_DEPTH - 1);
c.create_cq.cq_flags = cpu_to_le16(NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED);
c.create_cq.irq_vector = cpu_to_le16(0);
staticint apple_nvme_create_sq(struct apple_nvme *anv)
{ struct nvme_command c = {};
/* * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request.
*/
c.create_sq.opcode = nvme_admin_create_sq;
c.create_sq.prp1 = cpu_to_le64(anv->ioq.sq_dma_addr);
c.create_sq.sqid = cpu_to_le16(1);
c.create_sq.qsize = cpu_to_le16(APPLE_ANS_MAX_QUEUE_DEPTH - 1);
c.create_sq.sq_flags = cpu_to_le16(NVME_QUEUE_PHYS_CONTIG);
c.create_sq.cqid = cpu_to_le16(1);
/* * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue.
*/ if (unlikely(!READ_ONCE(q->enabled))) return BLK_STS_IOERR;
if (!nvme_check_ready(&anv->ctrl, req, true)) return nvme_fail_nonready_command(&anv->ctrl, req);
ret = nvme_setup_cmd(ns, req); if (ret) return ret;
if (blk_rq_nr_phys_segments(req)) {
ret = apple_nvme_map_data(anv, req, cmnd); if (ret) goto out_free_cmd;
}
if (apple_rtkit_is_crashed(anv->rtk))
dead = true; if (!(csts & NVME_CSTS_RDY))
dead = true; if (csts & NVME_CSTS_CFS)
dead = true;
if (state == NVME_CTRL_LIVE ||
state == NVME_CTRL_RESETTING) {
freeze = true;
nvme_start_freeze(&anv->ctrl);
}
/* * Give the controller a chance to complete all entered requests if * doing a safe shutdown.
*/ if (!dead && shutdown && freeze)
nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT);
nvme_quiesce_io_queues(&anv->ctrl);
if (!dead) { if (READ_ONCE(anv->ioq.enabled)) {
apple_nvme_remove_sq(anv);
apple_nvme_remove_cq(anv);
}
/* * Always disable the NVMe controller after shutdown. * We need to do this to bring it back up later anyway, and we * can't do it while the firmware is not running (e.g. in the * resume reset path before RTKit is initialized), so for Apple * controllers it makes sense to unconditionally do it here. * Additionally, this sequence of events is reliable, while * others (like disabling after bringing back the firmware on * resume) seem to run into trouble under some circumstances. * * Both U-Boot and m1n1 also use this convention (i.e. an ANS * NVMe controller is handed off with firmware shut down, in an * NVMe disabled state, after a clean shutdown).
*/ if (shutdown)
nvme_disable_ctrl(&anv->ctrl, shutdown);
nvme_disable_ctrl(&anv->ctrl, false);
}
WRITE_ONCE(anv->ioq.enabled, false);
WRITE_ONCE(anv->adminq.enabled, false);
mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */
nvme_quiesce_admin_queue(&anv->ctrl);
/* last chance to complete any requests before nvme_cancel_request */
spin_lock_irqsave(&anv->lock, flags);
apple_nvme_handle_cq(&anv->ioq, true);
apple_nvme_handle_cq(&anv->adminq, true);
spin_unlock_irqrestore(&anv->lock, flags);
/* * The driver will not be starting up queues again if shutting down so * must flush all entered requests to their failed completion to avoid * deadlocking blk-mq hot-cpu notifier.
*/ if (shutdown) {
nvme_unquiesce_io_queues(&anv->ctrl);
nvme_unquiesce_admin_queue(&anv->ctrl);
}
}
if (nvme_ctrl_state(&anv->ctrl) != NVME_CTRL_LIVE) { /* * From rdma.c: * If we are resetting, connecting or deleting we should * complete immediately because we may block controller * teardown or setup sequence * - ctrl disable/shutdown fabrics requests * - connect requests * - initialization admin requests * - I/O requests that entered after unquiescing and * the controller stopped responding * * All other requests should be cancelled by the error * recovery work, so it's fine that we fail it here.
*/
dev_warn(anv->dev, "I/O %d(aq:%d) timeout while not in live state\n",
req->tag, q->is_adminq); if (blk_mq_request_started(req) &&
!blk_mq_request_completed(req)) {
nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
blk_mq_complete_request(req);
} return BLK_EH_DONE;
}
/* check if we just missed an interrupt if we're still alive */ if (!apple_rtkit_is_crashed(anv->rtk) && !(csts & NVME_CSTS_CFS)) {
spin_lock_irqsave(&anv->lock, flags);
apple_nvme_handle_cq(q, false);
spin_unlock_irqrestore(&anv->lock, flags); if (blk_mq_request_completed(req)) {
dev_warn(anv->dev, "I/O %d(aq:%d) timeout: completion polled\n",
req->tag, q->is_adminq); return BLK_EH_DONE;
}
}
/* * aborting commands isn't supported which leaves a full reset as our * only option here
*/
dev_warn(anv->dev, "I/O %d(aq:%d) timeout: resetting controller\n",
req->tag, q->is_adminq);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
apple_nvme_disable(anv, false);
nvme_reset_ctrl(&anv->ctrl); return BLK_EH_DONE;
}
if (state != NVME_CTRL_RESETTING) {
dev_warn(anv->dev, "ctrl state %d is not RESETTING\n", state);
ret = -ENODEV; goto out;
}
/* there's unfortunately no known way to recover if RTKit crashed :( */ if (apple_rtkit_is_crashed(anv->rtk)) {
dev_err(anv->dev, "RTKit has crashed without any way to recover.");
ret = -EIO; goto out;
}
/* RTKit must be shut down cleanly for the (soft)-reset to work */ if (apple_rtkit_is_running(anv->rtk)) { /* reset the controller if it is enabled */ if (anv->ctrl.ctrl_config & NVME_CC_ENABLE)
apple_nvme_disable(anv, false);
dev_dbg(anv->dev, "Trying to shut down RTKit before reset.");
ret = apple_rtkit_shutdown(anv->rtk); if (ret) goto out;
/* * Only do the soft-reset if the CPU is not running, which means either we * or the previous stage shut it down cleanly.
*/ if (!(readl(anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL) &
APPLE_ANS_COPROC_CPU_CONTROL_RUN)) {
ret = reset_control_assert(anv->reset); if (ret) goto out;
ret = apple_rtkit_reinit(anv->rtk); if (ret) goto out;
ret = reset_control_deassert(anv->reset); if (ret) goto out;
ret = apple_rtkit_boot(anv->rtk);
} else {
ret = apple_rtkit_wake(anv->rtk);
}
if (ret) {
dev_err(anv->dev, "ANS did not boot"); goto out;
}
ret = readl_poll_timeout(anv->mmio_nvme + APPLE_ANS_BOOT_STATUS,
boot_status,
boot_status == APPLE_ANS_BOOT_STATUS_OK,
USEC_PER_MSEC, APPLE_ANS_BOOT_TIMEOUT); if (ret) {
dev_err(anv->dev, "ANS did not initialize"); goto out;
}
dev_dbg(anv->dev, "ANS booted successfully.");
/* * Limit the max command size to prevent iod->sg allocations going * over a single page.
*/
anv->ctrl.max_hw_sectors = min_t(u32, NVME_MAX_KB_SZ << 1,
dma_max_mapping_size(anv->dev) >> 9);
anv->ctrl.max_segments = NVME_MAX_SEGS;
dma_set_max_seg_size(anv->dev, 0xffffffff);
/*
 * Enable NVMMU and linear submission queues.
 * While we could keep those disabled and pretend this is a slightly
 * more common NVMe controller we'd still need some quirks (e.g.
 * sq entries will be 128 bytes) and Apple might drop support for
 * that mode in the future.
 */
writel(APPLE_ANS_LINEAR_SQ_EN,
anv->mmio_nvme + APPLE_ANS_LINEAR_SQ_CTRL);
/* Allow as many pending commands as possible for both queues */
writel(APPLE_ANS_MAX_QUEUE_DEPTH | (APPLE_ANS_MAX_QUEUE_DEPTH << 16),
anv->mmio_nvme + APPLE_ANS_MAX_PEND_CMDS_CTRL);
/* Setup the NVMMU for the maximum admin and IO queue depth */
writel(APPLE_ANS_MAX_QUEUE_DEPTH - 1,
anv->mmio_nvme + APPLE_NVMMU_NUM_TCBS);
/* * This is probably a chicken bit: without it all commands where any PRP * is set to zero (including those that don't use that field) fail and * the co-processor complains about "completed with err BAD_CMD-" or * a "NULL_PRP_PTR_ERR" in the syslog
*/
writel(readl(anv->mmio_nvme + APPLE_ANS_UNKNOWN_CTRL) &
~APPLE_ANS_PRP_NULL_CHECK,
anv->mmio_nvme + APPLE_ANS_UNKNOWN_CTRL);
if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(anv->ctrl.device, "failed to mark controller CONNECTING\n");
ret = -ENODEV; goto out;
}
ret = nvme_init_ctrl_finish(&anv->ctrl, false); if (ret) goto out;
dev_dbg(anv->dev, "Creating IOCQ");
ret = apple_nvme_create_cq(anv); if (ret) goto out;
dev_dbg(anv->dev, "Creating IOSQ");
ret = apple_nvme_create_sq(anv); if (ret) goto out_remove_cq;
apple_nvme_init_queue(&anv->ioq);
nr_io_queues = 1;
ret = nvme_set_queue_count(&anv->ctrl, &nr_io_queues); if (ret) goto out_remove_sq; if (nr_io_queues != 1) {
ret = -ENXIO; goto out_remove_sq;
}
if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_LIVE)) {
dev_warn(anv->ctrl.device, "failed to mark controller live state\n");
ret = -ENODEV; goto out_remove_sq;
}
nvme_start_ctrl(&anv->ctrl);
dev_dbg(anv->dev, "ANS boot and NVMe init completed."); return;
ret = blk_mq_alloc_tag_set(&anv->admin_tagset); if (ret) return ret;
ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
&anv->admin_tagset); if (ret) return ret;
anv->tagset.ops = &apple_nvme_mq_ops;
anv->tagset.nr_hw_queues = 1;
anv->tagset.nr_maps = 1; /* * Tags are used as an index to the NVMMU and must be unique across * both queues. The admin queue gets the first APPLE_NVME_AQ_DEPTH which * must be marked as reserved in the IO queue.
*/
anv->tagset.reserved_tags = APPLE_NVME_AQ_DEPTH;
anv->tagset.queue_depth = APPLE_ANS_MAX_QUEUE_DEPTH - 1;
anv->tagset.timeout = NVME_IO_TIMEOUT;
anv->tagset.numa_node = NUMA_NO_NODE;
anv->tagset.cmd_size = sizeof(struct apple_nvme_iod);
anv->tagset.driver_data = &anv->ioq;
ret = blk_mq_alloc_tag_set(&anv->tagset); if (ret) return ret;
ret = devm_add_action_or_reset(anv->dev, devm_apple_nvme_put_tag_set,
&anv->tagset); if (ret) return ret;
/* * We need the maximum queue depth here because the NVMMU only has a * single depth configuration shared between both queues.
*/
q->tcbs = dmam_alloc_coherent(anv->dev,
APPLE_ANS_MAX_QUEUE_DEPTH * sizeof(struct apple_nvmmu_tcb),
&q->tcb_dma_addr, GFP_KERNEL); if (!q->tcbs) return -ENOMEM;
/* * initialize phase to make sure the allocated and empty memory * doesn't look like a full cq already.
*/
q->cq_phase = 1; return 0;
}
/*
 * Tear down the power-domain attachments recorded in @anv.
 *
 * Walks the pd_link/pd_dev arrays from the last index down to index 0,
 * deleting every device link that is set and detaching every power-domain
 * device whose pointer is neither NULL nor an error value.
 *
 * Does nothing when pd_count is 1 or less.
 * NOTE(review): presumably the single-domain case is handled outside this
 * driver and no per-domain devices exist then -- confirm against the
 * attach path.
 */
static void apple_nvme_detach_genpd(struct apple_nvme *anv)
{
	int i;

	if (anv->pd_count <= 1)
		return;

	/* Release in reverse order of the array indices. */
	for (i = anv->pd_count - 1; i >= 0; i--) {
		if (anv->pd_link[i])
			device_link_del(anv->pd_link[i]);
		if (!IS_ERR_OR_NULL(anv->pd_dev[i]))
			dev_pm_domain_detach(anv->pd_dev[i], true);
	}
}
anv->pd_dev = devm_kcalloc(dev, anv->pd_count, sizeof(*anv->pd_dev),
GFP_KERNEL); if (!anv->pd_dev) return -ENOMEM;
anv->pd_link = devm_kcalloc(dev, anv->pd_count, sizeof(*anv->pd_link),
GFP_KERNEL); if (!anv->pd_link) return -ENOMEM;
for (i = 0; i < anv->pd_count; i++) {
anv->pd_dev[i] = dev_pm_domain_attach_by_id(dev, i); if (IS_ERR(anv->pd_dev[i])) {
apple_nvme_detach_genpd(anv); return PTR_ERR(anv->pd_dev[i]);
}
ret = apple_nvme_attach_genpd(anv); if (ret < 0) {
dev_err_probe(dev, ret, "Failed to attach power domains"); goto put_dev;
} if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
ret = -ENXIO; goto put_dev;
}
anv->irq = platform_get_irq(pdev, 0); if (anv->irq < 0) {
ret = anv->irq; goto put_dev;
} if (!anv->irq) {
ret = -ENXIO; goto put_dev;
}
anv->mmio_coproc = devm_platform_ioremap_resource_byname(pdev, "ans"); if (IS_ERR(anv->mmio_coproc)) {
ret = PTR_ERR(anv->mmio_coproc); goto put_dev;
}
anv->mmio_nvme = devm_platform_ioremap_resource_byname(pdev, "nvme"); if (IS_ERR(anv->mmio_nvme)) {
ret = PTR_ERR(anv->mmio_nvme); goto put_dev;
}
anv->sart = devm_apple_sart_get(dev); if (IS_ERR(anv->sart)) {
ret = dev_err_probe(dev, PTR_ERR(anv->sart), "Failed to initialize SART"); goto put_dev;
}
anv->reset = devm_reset_control_array_get_exclusive(anv->dev); if (IS_ERR(anv->reset)) {
ret = dev_err_probe(dev, PTR_ERR(anv->reset), "Failed to get reset control"); goto put_dev;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.