struct nvme_fc_fcp_op { struct nvme_request nreq; /* * nvme/host/core.c * requires this to be * the 1st element in the * private structure * associated with the * request.
*/ struct nvmefc_fcp_req fcp_req;
/* * These items are short-term. They will eventually be moved into * a generic FC class. See comments in module init.
*/ staticstruct device *fc_udev_device;
/** * nvme_fc_register_localport - transport entry point called by an * LLDD to register the existence of a NVME * host FC port. * @pinfo: pointer to information about the port to be registered * @template: LLDD entrypoints and operational parameters for the port * @dev: physical hardware device node port corresponds to. Will be * used for DMA mappings * @portptr: pointer to a local port pointer. Upon success, the routine * will allocate a nvme_fc_local_port structure and place its * address in the local port pointer. Upon failure, local port * pointer will be set to 0. * * Returns: * a completion status. Must be 0 upon success; a negative errno * (ex: -ENXIO) upon failure.
*/ int
nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, struct nvme_fc_port_template *template, struct device *dev, struct nvme_fc_local_port **portptr)
{ struct nvme_fc_lport *newrec; unsignedlong flags; int ret, idx;
/* * look to see if there is already a localport that had been * deregistered and in the process of waiting for all the * references to fully be removed. If the references haven't * expired, we can simply re-enable the localport. Remoteports * and controller reconnections should resume naturally.
*/
newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev);
/* found an lport, but something about its state is bad */ if (IS_ERR(newrec)) {
ret = PTR_ERR(newrec); goto out_reghost_failed;
/* found existing lport, which was resumed */
} elseif (newrec) {
*portptr = &newrec->localport; return 0;
}
/* nothing found - allocate a new localport struct */
newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz),
GFP_KERNEL); if (!newrec) {
ret = -ENOMEM; goto out_reghost_failed;
}
idx = ida_alloc(&nvme_fc_local_port_cnt, GFP_KERNEL); if (idx < 0) {
ret = -ENOSPC; goto out_fail_kfree;
}
if (!get_device(dev) && dev) {
ret = -ENODEV; goto out_ida_put;
}
/** * nvme_fc_unregister_localport - transport entry point called by an * LLDD to deregister/remove a previously * registered NVME host FC port. * @portptr: pointer to the (registered) local port that is to be deregistered. * * Returns: * a completion status. Must be 0 upon success; a negative errno * (ex: -ENXIO) upon failure.
*/ int
nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr)
{ struct nvme_fc_lport *lport = localport_to_lport(portptr); unsignedlong flags;
/* * TRADDR strings, per FC-NVME are fixed format: * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters * udev event will only differ by prefix of what field is * being specified: * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters * 19 + 43 + null_fudge = 64 characters
*/ #define FCNVME_TRADDR_LENGTH 64
case NVME_CTRL_RESETTING: /* * Controller is already in the process of terminating the * association. No need to do anything further. The reconnect * step will naturally occur after the reset completes.
*/ break;
default: /* no action to take - let it delete */ break;
}
}
if (!nvme_fc_rport_get(rport)) {
rport = ERR_PTR(-ENOLCK); goto out_done;
}
spin_unlock_irqrestore(&nvme_fc_lock, flags);
spin_lock_irqsave(&rport->lock, flags);
/* has it been unregistered */ if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { /* means lldd called us twice */
spin_unlock_irqrestore(&rport->lock, flags);
nvme_fc_rport_put(rport); return ERR_PTR(-ESTALE);
}
/* * kick off a reconnect attempt on all associations to the * remote port. A successful reconnect will resume i/o.
*/
list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
nvme_fc_resume_controller(ctrl);
/** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME * subsystem FC port on its fabric. * @localport: pointer to the (registered) local port that the remote * subsystem port is connected to. * @pinfo: pointer to information about the port to be registered * @portptr: pointer to a remote port pointer. Upon success, the routine * will allocate a nvme_fc_remote_port structure and place its * address in the remote port pointer. Upon failure, remote port * pointer will be set to 0. * * Returns: * a completion status. Must be 0 upon success; a negative errno * (ex: -ENXIO) upon failure.
*/ int
nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, struct nvme_fc_port_info *pinfo, struct nvme_fc_remote_port **portptr)
{ struct nvme_fc_lport *lport = localport_to_lport(localport); struct nvme_fc_rport *newrec; unsignedlong flags; int ret, idx;
if (!nvme_fc_lport_get(lport)) {
ret = -ESHUTDOWN; goto out_reghost_failed;
}
/* * look to see if there is already a remoteport that is waiting * for a reconnect (within dev_loss_tmo) with the same WWN's. * If so, transition to it and reconnect.
*/
newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo);
/* found an rport, but something about its state is bad */ if (IS_ERR(newrec)) {
ret = PTR_ERR(newrec); goto out_lport_put;
/* found existing rport, which was resumed */
} elseif (newrec) {
nvme_fc_lport_put(lport);
__nvme_fc_set_dev_loss_tmo(newrec, pinfo);
nvme_fc_signal_discovery_scan(lport, newrec);
*portptr = &newrec->remoteport; return 0;
}
/* nothing found - allocate a new remoteport struct */
newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz),
GFP_KERNEL); if (!newrec) {
ret = -ENOMEM; goto out_lport_put;
}
idx = ida_alloc(&lport->endp_cnt, GFP_KERNEL); if (idx < 0) {
ret = -ENOSPC; goto out_kfree_rport;
}
/** * nvme_fc_unregister_remoteport - transport entry point called by an * LLDD to deregister/remove a previously * registered NVME subsystem FC port. * @portptr: pointer to the (registered) remote port that is to be * deregistered. * * Returns: * a completion status. Must be 0 upon success; a negative errno * (ex: -ENXIO) upon failure.
*/ int
nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
{ struct nvme_fc_rport *rport = remoteport_to_rport(portptr); struct nvme_fc_ctrl *ctrl; unsignedlong flags;
list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { /* if dev_loss_tmo==0, dev loss is immediate */ if (!portptr->dev_loss_tmo) {
dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: controller connectivity lost.\n",
ctrl->cnum);
nvme_delete_ctrl(&ctrl->ctrl);
} else
nvme_fc_ctrl_connectivity_loss(ctrl);
}
spin_unlock_irqrestore(&rport->lock, flags);
nvme_fc_abort_lsops(rport);
if (atomic_read(&rport->act_ctrl_cnt) == 0)
rport->lport->ops->remoteport_delete(portptr);
/* * release the reference, which will allow, if all controllers * go away, which should only occur after dev_loss_tmo occurs, * for the rport to be torn down.
*/
nvme_fc_rport_put(rport);
/** * nvme_fc_rescan_remoteport - transport entry point called by an * LLDD to request a nvme device rescan. * @remoteport: pointer to the (registered) remote port that is to be * rescanned. * * Returns: N/A
*/ void
nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport)
{ struct nvme_fc_rport *rport = remoteport_to_rport(remoteport);
/* * The fcloop device passes in a NULL device pointer. Real LLD's will * pass in a valid device pointer. If NULL is passed to the dma mapping * routines, depending on the platform, it may or may not succeed, and * may crash. * * As such: * Wrap all the dma routines and check the dev pointer. * * For simple mappings (those that return just a dma address), we'll noop * them, returning a dma address of 0. * * On more complex mappings (dma_map_sg), a pseudo routine fills * in the scatter list, setting all dma addresses to 0.
*/
ret = __nvme_fc_send_ls_req(rport, lsop, nvme_fc_send_ls_req_done);
if (!ret) { /* * No timeout/not interruptible as we need the struct * to exist until the lldd calls us back. Thus mandate * wait until driver calls back. lldd responsible for * the timeout action
*/
wait_for_completion(&lsop->ls_done);
__nvme_fc_finish_ls_req(lsop);
ret = lsop->ls_error;
}
if (ret) return ret;
/* ACC or RJT payload ? */ if (rjt->w0.ls_cmd == FCNVME_LS_RJT) return -ENXIO;
return 0;
}
/*
 * Hand an LS request to __nvme_fc_send_ls_req() without waiting for the
 * LS to complete.
 *
 * @rport: remote port the LS request is addressed to
 * @lsop:  LS request operation to transmit
 * @done:  completion callback passed through to __nvme_fc_send_ls_req()
 *
 * Returns: the status returned by __nvme_fc_send_ls_req(); 0 means the
 * request was accepted for transmission.
 *
 * @lsop is not freed here: once the request has been submitted it must
 * stay valid until @done is invoked.
 * NOTE(review): presumably @done releases @lsop on completion and the
 * caller releases it when a non-zero status is returned - confirm against
 * the callers.
 */
static int
nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport,
		struct nvmefc_ls_req_op *lsop,
		void (*done)(struct nvmefc_ls_req *req, int status))
{
	/* don't wait for completion */

	return __nvme_fc_send_ls_req(rport, lsop, done);
}
/*
 * This routine sends a FC-NVME LS to disconnect (aka terminate)
 * the FC-NVME Association. Terminating the association also
 * terminates the FC-NVME connections (per queue, both admin and io
 * queues) that are part of the association. E.g. things are torn
 * down, and the related FC-NVME Association ID and Connection IDs
 * become invalid.
 *
 * The behavior of the fc-nvme initiator is such that its
 * understanding of the association and connections will implicitly
 * be torn down. The action is implicit as it may be due to a loss of
 * connectivity with the fc-nvme target, so you may never get a
 * response even if you tried. As such, the action of this routine
 * is to asynchronously send the LS, ignore any results of the LS, and
 * continue on with terminating the association. If the fc-nvme target
 * is present and receives the LS, it too can tear down.
 *
 * NOTE(review): the statements visible in the body below transmit an LS
 * *response* through lport->ops->xmt_ls_rsp() and use lport, rport and
 * w0, none of which are declared in this function, while discon_rqst,
 * discon_acc and lsreq are declared but never used. That does not match
 * the description above - confirm the body against the complete source.
 */ staticvoid
nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
{ struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst; struct fcnvme_ls_disconnect_assoc_acc *discon_acc; struct nvmefc_ls_req_op *lsop; struct nvmefc_ls_req *lsreq; int ret;
/* ask the LLDD to transmit lsop->lsrsp; a non-zero status is treated as a rejection */
ret = lport->ops->xmt_ls_rsp(&lport->localport, &rport->remoteport,
lsop->lsrsp); if (ret) {
dev_warn(lport->dev, "LLDD rejected LS RSP xmt: LS %d status %d\n",
w0->ls_cmd, ret);
/* nothing was transmitted: pass lsop to nvme_fc_xmt_ls_rsp_free() and stop */
nvme_fc_xmt_ls_rsp_free(lsop); return;
}
}
list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { if (!nvme_fc_ctrl_get(ctrl)) continue;
spin_lock(&ctrl->lock); if (association_id == ctrl->association_id) {
oldls = ctrl->rcv_disconn;
ctrl->rcv_disconn = lsop;
ret = ctrl;
}
spin_unlock(&ctrl->lock); if (ret) /* leave the ctrl get reference */ break;
nvme_fc_ctrl_put(ctrl);
}
spin_unlock_irqrestore(&rport->lock, flags);
/* transmit a response for anything that was pending */ if (oldls) {
dev_info(rport->lport->dev, "NVME-FC{%d}: Multiple Disconnect Association " "LS's received\n", ctrl->cnum); /* overwrite good response with bogus failure */
oldls->lsrsp->rsplen = nvme_fc_format_rjt(oldls->rspbuf, sizeof(*oldls->rspbuf),
rqst->w0.ls_cmd,
FCNVME_RJT_RC_UNAB,
FCNVME_RJT_EXP_NONE, 0);
nvme_fc_xmt_ls_rsp(oldls);
}
return ret;
}
/* * returns true to mean LS handled and ls_rsp can be sent * returns false to defer ls_rsp xmt (will be done as part of * association termination)
*/ staticbool
nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
{ struct nvme_fc_rport *rport = lsop->rport; struct fcnvme_ls_disconnect_assoc_rqst *rqst =
&lsop->rqstbuf->rq_dis_assoc; struct fcnvme_ls_disconnect_assoc_acc *acc =
&lsop->rspbuf->rsp_dis_assoc; struct nvme_fc_ctrl *ctrl = NULL; int ret = 0;
memset(acc, 0, sizeof(*acc));
ret = nvmefc_vldt_lsreq_discon_assoc(lsop->rqstdatalen, rqst); if (!ret) { /* match an active association */
ctrl = nvme_fc_match_disconn_ls(rport, lsop); if (!ctrl)
ret = VERR_NO_ASSOC;
}
/* * the transmit of the response will occur after the exchanges * for the association have been ABTS'd by * nvme_fc_delete_association().
*/
/* fail the association */
nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
/* release the reference taken by nvme_fc_match_disconn_ls() */
nvme_fc_ctrl_put(ctrl);
returnfalse;
}
/* * Actual Processing routine for received FC-NVME LS Requests from the LLD * returns true if a response should be sent afterward, false if rsp will * be sent asynchronously.
*/ staticbool
nvme_fc_handle_ls_rqst(struct nvmefc_ls_rcv_op *lsop)
{ struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0; bool ret = true;
lsop->lsrsp->nvme_fc_private = lsop;
lsop->lsrsp->rspbuf = lsop->rspbuf;
lsop->lsrsp->rspdma = lsop->rspdma;
lsop->lsrsp->done = nvme_fc_xmt_ls_rsp_done; /* Be preventative. handlers will later set to valid length */
lsop->lsrsp->rsplen = 0;
/* * handlers: * parse request input, execute the request, and format the * LS response
*/ switch (w0->ls_cmd) { case FCNVME_LS_DISCONNECT_ASSOC:
ret = nvme_fc_ls_disconnect_assoc(lsop); break; case FCNVME_LS_DISCONNECT_CONN:
lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, sizeof(*lsop->rspbuf), w0->ls_cmd,
FCNVME_RJT_RC_UNSUP, FCNVME_RJT_EXP_NONE, 0); break; case FCNVME_LS_CREATE_ASSOCIATION: case FCNVME_LS_CREATE_CONNECTION:
lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, sizeof(*lsop->rspbuf), w0->ls_cmd,
FCNVME_RJT_RC_LOGIC, FCNVME_RJT_EXP_NONE, 0); break; default:
lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, sizeof(*lsop->rspbuf), w0->ls_cmd,
FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); break;
}
/** * nvme_fc_rcv_ls_req - transport entry point called by an LLDD * upon the reception of a NVME LS request. * * The nvme-fc layer will copy payload to an internal structure for * processing. As such, upon completion of the routine, the LLDD may * immediately free/reuse the LS request buffer passed in the call. * * If this routine returns error, the LLDD should abort the exchange. * * @portptr: pointer to the (registered) remote port that the LS * was received from. The remoteport is associated with * a specific localport. * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be * used to reference the exchange corresponding to the LS * when issuing an ls response. * @lsreqbuf: pointer to the buffer containing the LS Request * @lsreqbuf_len: length, in bytes, of the received LS request
*/ int
nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, struct nvmefc_ls_rsp *lsrsp, void *lsreqbuf, u32 lsreqbuf_len)
{ struct nvme_fc_rport *rport = remoteport_to_rport(portptr); struct nvme_fc_lport *lport = rport->lport; struct fcnvme_ls_rqst_w0 *w0 = (struct fcnvme_ls_rqst_w0 *)lsreqbuf; struct nvmefc_ls_rcv_op *lsop; unsignedlong flags; int ret;
nvme_fc_rport_get(rport);
/* validate there's a routine to transmit a response */ if (!lport->ops->xmt_ls_rsp) {
dev_info(lport->dev, "RCV %s LS failed: no LLDD xmt_ls_rsp\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
ret = -EINVAL; goto out_put;
}
if (lsreqbuf_len > sizeof(union nvmefc_ls_requests)) {
dev_info(lport->dev, "RCV %s LS failed: payload too large\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
ret = -E2BIG; goto out_put;
}
lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); if (!lsop) {
nvme_fc_rcv_ls_req_err_msg(lport, w0);
ret = -ENOMEM; goto out_put;
}
/* * nvme_fc_io_getuuid - Routine called to get the appid field * associated with request by the lldd * @req:IO request from nvme fc to driver * Returns: UUID if there is an appid associated with VM or * NULL if the user/libvirt has not set the appid to VM
*/ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
{ struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req); struct request *rq = op->rq;
/* * WARNING: * The current linux implementation of a nvme controller * allocates a single tag set for all io queues and sizes * the io queues to fully hold all possible tags. Thus, the * implementation does not reference or care about the sqhd * value as it never needs to use the sqhd/sqtail pointers * for submission pacing. * * This affects the FC-NVME implementation in two ways: * 1) As the value doesn't matter, we don't need to waste * cycles extracting it from ERSPs and stamping it in the * cases where the transport fabricates CQEs on successful * completions. * 2) The FC-NVME implementation requires that delivery of * ERSP completions are to go back to the nvme layer in order * relative to the rsn, such that the sqhd value will always * be "in order" for the nvme layer. As the nvme layer in * linux doesn't care about sqhd, there's no need to return * them in order. * * Additionally: * As the core nvme layer in linux currently does not look at * every field in the cqe - in cases where the FC transport must * fabricate a CQE, the following fields will not be set as they * are not referenced: * cqe.sqid, cqe.sqhd, cqe.command_id * * Failure or error of an individual i/o, in a transport * detected fashion unrelated to the nvme completion status, * potentially cause the initiator and target sides to get out * of sync on SQ head/tail (aka outstanding io count allowed). * Per FC-NVME spec, failure of an individual command requires * the connection to be terminated, which in turn requires the * association to be terminated.
*/
if (opstate == FCPOP_STATE_ABORTED)
status = cpu_to_le16(NVME_SC_HOST_ABORTED_CMD << 1); elseif (freq->status) {
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
dev_info(ctrl->ctrl.device, "NVME-FC{%d}: io failed due to lldd error %d\n",
ctrl->cnum, freq->status);
}
/* * For the linux implementation, if we have an unsuccessful * status, the blk-mq layer can typically be called with the * non-zero status and the content of the cqe isn't important.
*/ if (status) goto done;
/* * command completed successfully relative to the wire * protocol. However, validate anything received and * extract the status and result from the cqe (create it * where necessary).
*/
switch (freq->rcv_rsplen) {
case 0: case NVME_FC_SIZEOF_ZEROS_RSP: /* * No response payload or 12 bytes of payload (which * should all be zeros) are considered successful and * no payload in the CQE by the transport.
*/ if (freq->transferred_length !=
be32_to_cpu(op->cmd_iu.data_len)) {
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
dev_info(ctrl->ctrl.device, "NVME-FC{%d}: io failed due to bad transfer " "length: %d vs expected %d\n",
ctrl->cnum, freq->transferred_length,
be32_to_cpu(op->cmd_iu.data_len)); goto done;
}
result.u64 = 0; break;
casesizeof(struct nvme_fc_ersp_iu): /* * The ERSP IU contains a full completion with CQE. * Validate ERSP IU and look at cqe.
*/ if (unlikely(be16_to_cpu(op->rsp_iu.iu_len) !=
(freq->rcv_rsplen / 4) ||
be32_to_cpu(op->rsp_iu.xfrd_len) !=
freq->transferred_length ||
op->rsp_iu.ersp_result ||
sqe->common.command_id != cqe->command_id)) {
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
dev_info(ctrl->ctrl.device, "NVME-FC{%d}: io failed due to bad NVMe_ERSP: " "iu len %d, xfr len %d vs %d, status code " "%d, cmdid %d vs %d\n",
ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len),
be32_to_cpu(op->rsp_iu.xfrd_len),
freq->transferred_length,
op->rsp_iu.ersp_result,
sqe->common.command_id,
cqe->command_id); goto done;
}
result = cqe->result;
status = cqe->status; break;
default:
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
dev_info(ctrl->ctrl.device, "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu " "len %d\n",
ctrl->cnum, freq->rcv_rsplen); goto done;
}
aen_op = ctrl->aen_ops; for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (ctrl->lport->ops->fcprqst_priv_sz) { private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
GFP_KERNEL); if (!private) return -ENOMEM;
}
/* * Considered whether we should allocate buffers for all SQEs * and CQEs and dma map them - mapping their respective entries * into the request structures (kernel vm addr and dma address) * thus the driver could use the buffers/mappings directly. * It only makes sense if the LLDD would use them for its * messaging api. It's very unlikely most adapter api's would use * a native NVME sqe/cqe. More reasonable if FC-NVME IU payload * structures were used instead.
*/
}
/*
 * This routine terminates a queue at the transport level.
 * The transport has already ensured that all outstanding ios on
 * the queue have been terminated.
 * The transport will send a Disconnect LS request to terminate
 * the queue's connection. Termination of the admin queue will also
 * terminate the association at the target.
 *
 * NOTE(review): the loop at the end of the body uses `i` and `ctrl`,
 * which are not declared in this function, and steps `queue` backwards
 * while calling __nvme_fc_delete_hw_queue() for each index down to 1.
 * That does not fit the teardown of the single queue passed in - confirm
 * the body against the complete source.
 */ staticvoid
nvme_fc_free_queue(struct nvme_fc_queue *queue)
{
/* nothing to do unless NVME_FC_Q_CONNECTED was set; the bit is cleared by the test */
if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags)) return;
clear_bit(NVME_FC_Q_LIVE, &queue->flags); /*
 * Current implementation never disconnects a single queue.
 * It always terminates a whole association. So there is never
 * a disconnect(queue) LS sent to the target.
 */
for (i = ctrl->ctrl.queue_count - 1; i >= 1; i--, queue--)
__nvme_fc_delete_hw_queue(ctrl, queue, i);
}
/*
 * Create the hardware queue for every io queue (indices 1 up to, but not
 * including, ctrl->ctrl.queue_count).
 *
 * @ctrl:  controller whose io queues are being created
 * @qsize: queue size passed through to __nvme_fc_create_hw_queue()
 *
 * Returns 0 once all queues were created. If a creation fails, the hw
 * queues from the failing index back down to index 1 are deleted and the
 * failing status is returned.
 */
static int
nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
{
	int qidx, rc;

	for (qidx = 1; qidx < ctrl->ctrl.queue_count; qidx++) {
		rc = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[qidx],
					       qidx, qsize);
		if (!rc)
			continue;

		/* unwind, starting with the index that just failed */
		while (qidx > 0) {
			__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[qidx],
						  qidx);
			qidx--;
		}
		return rc;
	}

	return 0;
}
staticint
nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
{ int i, ret = 0;
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize,
(qsize / 5)); if (ret) break;
ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) break;
/*
 * The nvme core layer is finished with the controller, so it can now be
 * freed. Called after the last nvme_put_ctrl() call.
 *
 * Drops a reference on the FC controller that embeds @nctrl via
 * nvme_fc_ctrl_put().
 */
static void
nvme_fc_free_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_fc_ctrl *fc_ctrl = to_fc_ctrl(nctrl);

	/* sanity check: @nctrl must be the nvme_ctrl embedded in fc_ctrl */
	WARN_ON(&fc_ctrl->ctrl != nctrl);

	nvme_fc_ctrl_put(fc_ctrl);
}
/* * This routine is used by the transport when it needs to find active * io on a queue that is to be terminated. The transport uses * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke * this routine to kill them on a 1 by 1 basis. * * As FC allocates FC exchange for each io, the transport must contact * the LLDD to terminate the exchange, thus releasing the FC exchange. * After terminating the exchange the LLDD will call the transport's * normal io done path for the request, but it will have an aborted * status. The done path will return the io request back to the block * layer with an error status.
*/ staticbool nvme_fc_terminate_exchange(struct request *req, void *data)
{ struct nvme_ctrl *nctrl = data; struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
/*
 * Abort every command outstanding on the association.
 *
 * Typically called by the delete_association routine. It is also called
 * when an error occurs during reconnect: there, the failing command is
 * most likely one that initializes the controller (including fabric
 * Connect commands on io queues) and has timed out or failed, so the io
 * must be killed for the connect thread to see the error.
 *
 * @ctrl:         controller whose ios are aborted
 * @start_queues: when true, the io and admin queues are unquiesced again
 *                after their outstanding requests have been terminated
 */
static void
__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
{
	int qidx;

	/*
	 * Once io is being aborted the queues are no longer good: mark
	 * the io queues (if any) and then the admin queue as not live.
	 */
	for (qidx = 1; qidx < ctrl->ctrl.queue_count; qidx++)
		clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[qidx].flags);
	clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags);

	/*
	 * If io queues are present, stop them and terminate all outstanding
	 * ios on them. FC allocates an FC exchange for each io, so the
	 * transport must contact the LLDD to terminate the exchange and
	 * thereby release it. blk_mq_tagset_busy_iter() reports which ios
	 * are busy and invokes a transport routine to kill each one with
	 * the LLDD. After terminating the exchange the LLDD calls the
	 * transport's normal io done path, but with an aborted status. The
	 * done path returns the io requests to the block layer as part of
	 * normal completions (but with error status).
	 */
	if (ctrl->ctrl.queue_count > 1) {
		nvme_quiesce_io_queues(&ctrl->ctrl);
		nvme_sync_io_queues(&ctrl->ctrl);
		blk_mq_tagset_busy_iter(&ctrl->tag_set,
					nvme_fc_terminate_exchange,
					&ctrl->ctrl);
		blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
		if (start_queues)
			nvme_unquiesce_io_queues(&ctrl->ctrl);
	}

	/*
	 * Other transports, which don't have link-level contexts bound
	 * to sqe's, would try to gracefully shut down the controller by
	 * writing the registers for shutdown and polling (call
	 * nvme_disable_ctrl()). Given a bunch of i/o was potentially
	 * just aborted and we will wait on those contexts, and given
	 * there was no indication of how live the controller is on the
	 * link, don't send more io to create more contexts for the
	 * shutdown. Let the controller fail via keepalive failure if
	 * it's still present.
	 */

	/* Clean up the admin queue, following the same steps as above. */
	nvme_quiesce_admin_queue(&ctrl->ctrl);
	blk_sync_queue(ctrl->ctrl.admin_q);
	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
				nvme_fc_terminate_exchange, &ctrl->ctrl);
	blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
	if (start_queues)
		nvme_unquiesce_admin_queue(&ctrl->ctrl);
}
/* * if an error (io timeout, etc) while (re)connecting, the remote * port requested terminating of the association (disconnect_ls) * or an error (timeout or abort) occurred on an io while creating * the controller. Abort any ios on the association and let the * create_association error path resolve things.
*/ if (state == NVME_CTRL_CONNECTING) {
__nvme_fc_abort_outstanding_ios(ctrl, true);
dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport error during (re)connect\n",
ctrl->cnum); return;
}
/* Otherwise, only proceed if in LIVE state - e.g. on first error */ if (state != NVME_CTRL_LIVE) return;
dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport association event: %s\n",
ctrl->cnum, errmsg);
dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
/* * Attempt to abort the offending command. Command completion * will detect the aborted io and will fail the connection.
*/
dev_info(ctrl->ctrl.device, "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: " "x%08x/x%08x\n",
ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
nvme_fabrics_opcode_str(qnum, sqe),
sqe->common.cdw10, sqe->common.cdw11); if (__nvme_fc_abort_op(ctrl, op))
nvme_fc_error_recovery(ctrl, "io timeout abort failed");
/* * the io abort has been initiated. Have the reset timer * restarted and the abort completion will complete the io * shortly. Avoids a synchronous wait while the abort finishes.
*/ return BLK_EH_RESET_TIMER;
}
/* * In FC, the queue is a logical thing. At transport connect, the target * creates its "queue" and returns a handle that is to be given to the * target whenever it posts something to the corresponding SQ. When an * SQE is sent on a SQ, FC effectively considers the SQE, or rather the * command contained within the SQE, an io, and assigns a FC exchange * to it. The SQE and the associated SQ handle are sent in the initial * CMD IU sent on the exchange. All transfers relative to the io occur * as part of the exchange. The CQE is the last thing for the io, * which is transferred (explicitly or implicitly) with the RSP IU * sent on the exchange. After the CQE is received, the FC exchange is * terminated and the Exchange may be used on a different io. * * The transport to LLDD api has the transport making a request for a * new fcp io request to the LLDD. The LLDD then allocates a FC exchange * resource and transfers the command. The LLDD will then process all * steps to complete the io. Upon completion, the transport done routine * is called. * * So - while the operation is outstanding to the LLDD, there is a link * level FC exchange resource that is also outstanding. This must be * considered in all cleanup operations.
*/ static blk_status_t
nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op, u32 data_len, enum nvmefc_fcp_datadir io_dir)
{ struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; struct nvme_command *sqe = &cmdiu->sqe; int ret, opstate;
/* * before attempting to send the io, check to see if we believe * the target device is present
*/ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) return BLK_STS_RESOURCE;
if (!nvme_fc_ctrl_get(ctrl)) return BLK_STS_IOERR;
/* format the FC-NVME CMD IU and fcp_req */
cmdiu->connection_id = cpu_to_be64(queue->connection_id);
cmdiu->data_len = cpu_to_be32(data_len); switch (io_dir) { case NVMEFC_FCP_WRITE:
cmdiu->flags = FCNVME_CMD_FLAGS_WRITE; break; case NVMEFC_FCP_READ:
cmdiu->flags = FCNVME_CMD_FLAGS_READ; break; case NVMEFC_FCP_NODATA:
cmdiu->flags = 0; break;
}
op->fcp_req.payload_length = data_len;
op->fcp_req.io_dir = io_dir;
op->fcp_req.transferred_length = 0;
op->fcp_req.rcv_rsplen = 0;
op->fcp_req.status = NVME_SC_SUCCESS;
op->fcp_req.sqid = cpu_to_le16(queue->qnum);
/* * validate per fabric rules, set fields mandated by fabric spec * as well as those by FC-NVME spec.
*/
WARN_ON_ONCE(sqe->common.metadata);
sqe->common.flags |= NVME_CMD_SGL_METABUF;
/* * format SQE DPTR field per FC-NVME rules: * type=0x5 Transport SGL Data Block Descriptor * subtype=0xA Transport-specific value * address=0 * length=length of the data series
*/
sqe->rw.dptr.sgl.type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
NVME_SGL_FMT_TRANSPORT_A;
sqe->rw.dptr.sgl.length = cpu_to_le32(data_len);
sqe->rw.dptr.sgl.addr = 0;
if (!(op->flags & FCOP_FLAGS_AEN)) {
ret = nvme_fc_map_data(ctrl, op->rq, op); if (ret < 0) {
nvme_cleanup_cmd(op->rq);
nvme_fc_ctrl_put(ctrl); if (ret == -ENOMEM || ret == -EAGAIN) return BLK_STS_RESOURCE; return BLK_STS_IOERR;
}
}
if (!(op->flags & FCOP_FLAGS_AEN))
nvme_start_request(op->rq);
cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
&ctrl->rport->remoteport,
queue->lldd_handle, &op->fcp_req);
if (ret) { /* * If the lld fails to send the command is there an issue with * the csn value? If the command that fails is the Connect, * no - as the connection won't be live. If it is a command * post-connect, it's possible a gap in csn may be created. * Does this matter? As Linux initiators don't send fused * commands, no. The gap would exist, but as there's nothing * that depends on csn order to be delivered on the target * side, it shouldn't hurt. It would be difficult for a * target to even detect the csn gap as it has no idea when the * cmd with the csn was supposed to arrive.
*/
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
if (!(op->flags & FCOP_FLAGS_AEN)) {
nvme_fc_unmap_data(ctrl, op->rq, op);
nvme_cleanup_cmd(op->rq);
}
nvme_fc_ctrl_put(ctrl);
if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
ret != -EBUSY) return BLK_STS_IOERR;
ret = nvme_setup_cmd(ns, rq); if (ret) return ret;
/* * nvme core doesn't quite treat the rq opaquely. Commands such * as WRITE ZEROES will return a non-zero rq payload_bytes yet * there is no actual payload to be transferred. * To get it right, key data transmission on there being 1 or * more physical segments in the sg list. If there are no * physical segments, there is no payload.
*/ if (blk_rq_nr_phys_segments(rq)) {
data_len = blk_rq_payload_bytes(rq);
io_dir = ((rq_data_dir(rq) == WRITE) ?
NVMEFC_FCP_WRITE : NVMEFC_FCP_READ);
} else {
data_len = 0;
io_dir = NVMEFC_FCP_NODATA;
}
/* clearing of ctrl->flags ASSOC_ACTIVE bit is in association delete */
cnt = atomic_dec_return(&rport->act_ctrl_cnt); if (cnt == 0) { if (rport->remoteport.port_state == FC_OBJSTATE_DELETED)
lport->ops->remoteport_delete(&rport->remoteport);
nvme_fc_rport_inactive_on_lport(rport);
}
return 0;
}
/* * This routine restarts the controller on the host side, and * on the link side, recreates the controller association.
 *
 * @ctrl: FC controller whose association is being (re)created.
 *
 * Steps visible in this listing, in order:
 *  - finish core controller initialization (nvme_init_ctrl_finish);
 *  - fail with -EIO if the ASSOC_FAILED flag is set;
 *  - sanity-check the controller: icdoff must be 0 and SGLs must be
 *    supported, otherwise fail with
 *    NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
 *  - clamp opts->queue_size to ctrl.maxcmd and update ctrl.sqsize;
 *  - initialize the AEN ops, then create or recreate the io queues;
 *  - change the controller state to NVME_CTRL_LIVE.
 *
 * NOTE(review): the cleanup labels targeted by the gotos
 * (out_disconnect_admin_queue, out_stop_keep_alive, out_term_aen_ops) and
 * the end of the function are not present in this listing, and `disls` /
 * `flags` are declared but unused in the visible part. The listing looks
 * truncated - confirm against the complete source before relying on this
 * summary.
*/ staticint
nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
{ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; struct nvmefc_ls_rcv_op *disls = NULL; unsignedlong flags; int ret;
/* Finish core init; ASSOC_FAILED being set afterwards is reported as -EIO. */
ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret) goto out_disconnect_admin_queue; if (test_bit(ASSOC_FAILED, &ctrl->flags)) {
ret = -EIO; goto out_stop_keep_alive;
} /* sanity checks */
/* FC-NVME does not have other data in the capsule */ if (ctrl->ctrl.icdoff) {
dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
ctrl->ctrl.icdoff);
ret = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out_stop_keep_alive;
}
/* FC-NVME supports normal SGL Data Block Descriptors */ if (!nvme_ctrl_sgl_supported(&ctrl->ctrl)) {
dev_err(ctrl->ctrl.device, "Mandatory sgls are not supported!\n");
ret = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out_stop_keep_alive;
}
/* Cap the requested queue size at maxcmd; sqsize is kept at queue_size - 1. */
if (opts->queue_size > ctrl->ctrl.maxcmd) { /* warn if maxcmd is lower than queue_size */
dev_warn(ctrl->ctrl.device, "queue_size %zu > ctrl maxcmd %u, reducing " "to maxcmd\n",
opts->queue_size, ctrl->ctrl.maxcmd);
opts->queue_size = ctrl->ctrl.maxcmd;
ctrl->ctrl.sqsize = opts->queue_size - 1;
}
ret = nvme_fc_init_aen_ops(ctrl); if (ret) goto out_term_aen_ops;
/* * Create the io queues
 *
 * Only done when queue_count > 1. If ioq_live is not set the io queues
 * are created, otherwise they are recreated. A successful result is
 * still turned into -EIO when ASSOC_FAILED is set.
*/
if (ctrl->ctrl.queue_count > 1) { if (!ctrl->ioq_live)
ret = nvme_fc_create_io_queues(ctrl); else
ret = nvme_fc_recreate_io_queues(ctrl);
} if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags))
ret = -EIO; if (ret) goto out_term_aen_ops;
/* A refused transition to NVME_CTRL_LIVE is reported as -EIO. */
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) {
ret = -EIO; goto out_term_aen_ops;
}
/* * This routine stops operation of the controller on the host side. * On the host os stack side: Admin and IO queues are stopped, * outstanding ios on them terminated via FC ABTS. * On the link side: the association is terminated.
 *
 * @ctrl: FC controller whose association is being torn down.
 *
 * Returns nothing. If ASSOC_ACTIVE is not set on entry the routine
 * returns immediately without touching anything else.
 *
 * NOTE(review): part of the body appears to be missing from this listing
 * (the stopping of the admin/io queues described above is not visible),
 * and the final statements look like they belong to a different routine -
 * see the note further down. Confirm against the complete source.
*/ staticvoid
nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
{ struct nvmefc_ls_rcv_op *disls = NULL; unsignedlong flags;
/* Proceed only if ASSOC_ACTIVE was set; the bit is cleared by the test itself. */
if (!test_and_clear_bit(ASSOC_ACTIVE, &ctrl->flags)) return;
/* kill the aens as they are a separate path */
nvme_fc_abort_aen_ops(ctrl);
/* wait for all io that had to be aborted */
/* The wait runs with ctrl->lock held and ends once ctrl->iocnt reaches 0; FCCTRL_TERMIO is cleared before the lock is dropped. */
spin_lock_irq(&ctrl->lock);
wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
clear_bit(FCCTRL_TERMIO, &ctrl->flags);
spin_unlock_irq(&ctrl->lock);
nvme_fc_term_aen_ops(ctrl);
/* * send a Disconnect(association) LS to fc-nvme target * Note: could have been sent at top of process, but * cleaner on link traffic if after the aborts complete. * Note: if association doesn't exist, association_id will be 0
*/ if (ctrl->association_id)
nvme_fc_xmt_disconnect_assoc(ctrl);
/* Under ctrl->lock: zero the association id and take over any pending rcv_disconn so it can be answered after the lock is released. */
spin_lock_irqsave(&ctrl->lock, flags);
ctrl->association_id = 0;
disls = ctrl->rcv_disconn;
ctrl->rcv_disconn = NULL;
spin_unlock_irqrestore(&ctrl->lock, flags); if (disls) /* * if a Disconnect Request was waiting for a response, send * now that all ABTS's have been issued (and are complete).
*/
nvme_fc_xmt_ls_rsp(disls);
/* The io queues are deleted and freed only when a tagset exists. */
if (ctrl->ctrl.tagset) {
nvme_fc_delete_hw_io_queues(ctrl);
nvme_fc_free_io_queues(ctrl);
}
/* NOTE(review): the statements below cancel ioerr_work / connect_work and then call nvme_fc_delete_association() itself. Inside this function that would be a recursive call, so this looks like the tail of a separate caller routine merged in by a truncated listing - verify against the complete source. */
cancel_work_sync(&ctrl->ioerr_work);
cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block * waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
}
/* * Fails a controller request if it matches an existing controller * (association) with the same tuple: * <Host NQN, Host ID, local FC port, remote FC port, SUBSYS NQN> * * The ports don't need to be compared as they are intrinsically * already matched by the port pointers supplied.
 *
 * @rport: remote port whose ctrl_list is searched.
 * @opts:  connect options compared against each controller via
 *         nvmf_ctlr_matches_baseopts().
 *
 * The search runs with rport->lock held (irqsave) and stops at the first
 * controller that matches; `found` carries the result.
 *
 * NOTE(review): the return statement and closing brace of this routine
 * are not visible. Everything after the spin_unlock_irqrestore() below
 * uses identifiers that are never declared here (ret, idx, ctrl_loss_tmo,
 * dev, lport, sstr, token64, val) and even calls this routine by name, so
 * it appears to be fragments of other functions spliced in by a truncated
 * listing. Confirm against the complete source.
*/ staticbool
nvme_fc_existing_controller(struct nvme_fc_rport *rport, struct nvmf_ctrl_options *opts)
{ struct nvme_fc_ctrl *ctrl; unsignedlong flags; bool found = false;
/* Walk the rport's controllers under its lock; break on the first match. */
spin_lock_irqsave(&rport->lock, flags);
list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); if (found) break;
}
spin_unlock_irqrestore(&rport->lock, flags);
/* NOTE(review): from here on the code looks like part of a controller allocation routine (see the header note). */
/* The remote port must have the NVME discovery or NVME target role, else -EBADR. */
if (!(rport->remoteport.port_role &
(FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) {
ret = -EBADR; goto out_fail;
}
/* Unless duplicate_connect is set, an already existing matching controller yields -EALREADY. */
if (!opts->duplicate_connect &&
nvme_fc_existing_controller(rport, opts)) {
ret = -EALREADY; goto out_fail;
}
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) {
ret = -ENOMEM; goto out_fail;
}
/* Any ida_alloc() failure is reported as -ENOSPC. */
idx = ida_alloc(&nvme_fc_ctrl_cnt, GFP_KERNEL); if (idx < 0) {
ret = -ENOSPC; goto out_free_ctrl;
}
/* * if ctrl_loss_tmo is being enforced and the default reconnect delay * is being used, change to a shorter reconnect delay for FC.
 *
 * The product max_reconnects * reconnect_delay is computed first, then
 * max_reconnects is re-derived from it (rounded up) for the new delay.
*/ if (opts->max_reconnects != -1 &&
opts->reconnect_delay == NVMF_DEF_RECONNECT_DELAY &&
opts->reconnect_delay > NVME_FC_DEFAULT_RECONNECT_TMO) {
ctrl_loss_tmo = opts->max_reconnects * opts->reconnect_delay;
opts->reconnect_delay = NVME_FC_DEFAULT_RECONNECT_TMO;
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
opts->reconnect_delay);
}
/* ret is preset to -ENOMEM so the bare goto after the allocation below reports it. */
ret = -ENOMEM;
ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(struct nvme_fc_queue), GFP_KERNEL); if (!ctrl->queues) goto out_free_ida;
nvme_fc_init_queue(ctrl, 0);
/* * Would have been nice to init io queues tag set as well. * However, we require interaction from the controller * for max io queue count before we can do so. * Defer this to the connect path.
*/
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); if (ret) goto out_free_queues; if (lport->dev)
ctrl->ctrl.numa_node = dev_to_node(lport->dev);
/* as we're past the point where we transition to the ref * counting teardown path, if we return a bad pointer here, * the calling routine, thinking it's prior to the * transition, will do an rport put. Since the teardown * path also does a rport put, we do an extra get here to * so proper order/teardown happens.
*/
nvme_fc_rport_get(rport);
/* NOTE(review): the remaining statements look like the end of a u64 token parsing helper: a non-zero match_u64() result returns -EINVAL, otherwise the parsed value is stored through val and 0 is returned. */
if (match_u64(sstr, &token64)) return -EINVAL;
*val = token64;
return 0;
}
/* * This routine validates and extracts the WWN's from the TRADDR string. * As kernel parsers need the 0x to determine number base, universally * build string to parse with 0x prefix before parsing name strings.
 *
 * @traddr: destination structure for the parsed names.
 * @buf:    TRADDR string to parse.
 * @blen:   length of @buf.
 *
 * NOTE(review): only the local declarations of this routine are visible.
 * The locking and list-walking code that follows uses identifiers that
 * are not declared here (flags, lport, rport, failcnt, local_disc_list)
 * and jumps to a label that is not shown (process_local_list); it appears
 * to be part of a discovery-scan routine spliced in by a truncated
 * listing. Confirm against the complete source.
*/ staticint
nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen)
/* name is sized as 2 + NVME_FC_TRADDR_HEXNAMELEN + 1 bytes - presumably the "0x" prefix, the hex name and a terminating NUL; wwn spans name[0] up to the last byte of name. */
{ char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1];
substring_t wwn = { name, &name[sizeof(name)-1] }; int nnoffset, pnoffset;
/* NOTE(review): discovery-scan fragment. Under nvme_fc_lock, every rport of every lport gets an lport and an rport reference and is queued on local_disc_list if its disc_list is empty. When the rport reference cannot be taken, the lport reference is dropped and the scan restarts, giving up with an error message once failcnt has reached DISCOVERY_MAX_FAIL. */
spin_lock_irqsave(&nvme_fc_lock, flags);
restart:
list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
list_for_each_entry(rport, &lport->endp_list, endp_list) { if (!nvme_fc_lport_get(lport)) continue; if (!nvme_fc_rport_get(rport)) { /* * This is a temporary condition. Upon restart * this rport will be gone from the list. * * Revert the lport put and retry. Anything * added to the list already will be skipped (as * they are no longer list_empty). Loops should * resume at rports that were not yet seen.
*/
nvme_fc_lport_put(lport);
if (failcnt++ < DISCOVERY_MAX_FAIL) goto restart;
pr_err("nvme_discovery: too many reference " "failures\n"); goto process_local_list;
} if (list_empty(&rport->disc_list))
list_add_tail(&rport->disc_list,
&local_disc_list);
}
}
#ifdef CONFIG_BLK_CGROUP_FC_APPID /* Parse the cgroup id from a buf and return the length of cgrpid */ staticint fc_parse_cgrpid(constchar *buf, u64 *id)
{ char cgrp_id[16+1]; int cgrpid_len, j;
/* NOTE(review): only the declarations are visible; cgrp_id holds 16 characters plus one extra byte (presumably the NUL terminator). The parsing logic and the return are missing from this listing - confirm against the complete source. */
/* * Parse and update the appid in the blkcg associated with the cgroupid.
 *
 * @dev:   device the attribute belongs to (unused in the visible part).
 * @attr:  attribute being written (unused in the visible part).
 * @buf:   user-supplied text.
 * @count: number of bytes in @buf.
 *
 * Visible behavior: one trailing newline is not counted, and the input is
 * rejected with -EINVAL when it is longer than 16+1+FC_APPID_LEN bytes or
 * contains no ':'.
 *
 * NOTE(review): the rest of the body (actual parsing, update and the
 * success return) is missing from this listing - confirm against the
 * complete source.
*/ static ssize_t fc_appid_store(struct device *dev, struct device_attribute *attr, constchar *buf, size_t count)
{
size_t orig_count = count;
u64 cgrp_id; int appid_len = 0; int cgrpid_len = 0; char app_id[FC_APPID_LEN]; int ret = 0;
/* Drop a single trailing newline. NOTE(review): buf[count-1] assumes count > 0 - verify the caller never passes an empty write. */
if (buf[count-1] == '\n')
count--;
/* Length limit matches 16 characters + 1 + FC_APPID_LEN; the ':' is presumably the cgroupid/appid separator. */
if ((count > (16+1+FC_APPID_LEN)) || (!strchr(buf, ':'))) return -EINVAL;
/*
 * Module init. Visible steps: register the fc class, create the
 * fc_udev_device used for FC udev events, then register the NVMe FC
 * transport. A class_register() failure is returned directly; later
 * failures jump to cleanup labels.
 *
 * NOTE(review): the cleanup labels (out_destroy_class, out_destroy_device)
 * and the final return are not present in this listing - confirm against
 * the complete source.
 */
staticint __init nvme_fc_init_module(void)
{ int ret;
/* * NOTE: * It is expected that in the future the kernel will combine * the FC-isms that are currently under scsi and now being * added to by NVME into a new standalone FC class. The SCSI * and NVME protocols and their devices would be under this * new FC class. * * As we need something to post FC-specific udev events to, * specifically for nvme probe events, start by creating the * new device class. When the new standalone FC class is * put in place, this code will move to a more generic * location for the class.
*/
ret = class_register(&fc_class); if (ret) {
pr_err("couldn't register class fc\n"); return ret;
}
/* * Create a device for the FC-centric udev events
*/
/* device_create() reports failure through an error pointer, hence IS_ERR/PTR_ERR. */
fc_udev_device = device_create(&fc_class, NULL, MKDEV(0, 0), NULL, "fc_udev_device"); if (IS_ERR(fc_udev_device)) {
pr_err("couldn't create fc_udev device!\n");
ret = PTR_ERR(fc_udev_device); goto out_destroy_class;
}
ret = nvmf_register_transport(&nvme_fc_transport); if (ret) goto out_destroy_device;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.