/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)
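/*
 * Editor's sketch (not part of the original driver): how a configured
 * inline data size could be clamped by the bounds above.  It mirrors
 * the port setup code near the end of this file; the helper name
 * nvmet_rdma_clamp_inline_size() is hypothetical.
 */
static inline int nvmet_rdma_clamp_inline_size(int requested)
{
	if (requested < 0)
		return NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	return min_t(int, requested, NVMET_RDMA_MAX_INLINE_DATA_SIZE);
}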
	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}
		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}
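/*
 * Editor's note: rsp_wr_wait_lock is dropped around
 * nvmet_rdma_execute_command() above, presumably because executing a
 * command can complete inline and re-enter the release path, which
 * takes this lock itself when re-queueing waiters.
 */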
static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr)
{
	struct ib_mr_status mr_status;
	int ret;
	u16 status = 0;

	ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
	if (ret) {
		pr_err("ib_check_mr_status failed, ret %d\n", ret);
		return NVME_SC_INVALID_PI;
	}
	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
		switch (mr_status.sig_err.err_type) {
		case IB_SIG_BAD_GUARD:
			status = NVME_SC_GUARD_CHECK;
			break;
		case IB_SIG_BAD_REFTAG:
			status = NVME_SC_REFTAG_CHECK;
			break;
		case IB_SIG_BAD_APPTAG:
			status = NVME_SC_APPTAG_CHECK;
			break;
		}
		pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
		       mr_status.sig_err.err_type,
		       mr_status.sig_err.expected,
		       mr_status.sig_err.actual);
	}

	return status;
}
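/*
 * Editor's note on usage: the RDMA completion handler later in this
 * fragment stores a non-zero result as
 *
 *	rsp->req.cqe->status = cpu_to_le16(status << 1);
 *
 * The shift keeps bit 0 of the CQE status word free for the NVMe phase
 * tag; the status code proper lives in bits 15:1.
 */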
	if (control & NVME_RW_PRINFO_PRACT) {
		/* for WRITE_INSERT/READ_STRIP no wire domain */
		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
		nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
					  pi_type);
		/* Clear the PRACT bit since HCA will generate/verify the PI */
		control &= ~NVME_RW_PRINFO_PRACT;
		cmd->rw.control = cpu_to_le16(control);
		/* PI is added by the HW */
		req->transfer_len += req->metadata_len;
	} else {
		/* for WRITE_PASS/READ_PASS both wire/memory domains exist */
		nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
					  pi_type);
		nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
					  pi_type);
	}
	if (control & NVME_RW_PRINFO_PRCHK_REF)
		sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG;
	if (control & NVME_RW_PRINFO_PRCHK_GUARD)
		sig_attrs->check_mask |= IB_SIG_CHECK_GUARD;
	if (control & NVME_RW_PRINFO_PRCHK_APP)
		sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG;
}
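/*
 * Editor's sketch: what nvmet_rdma_set_sig_domain() plausibly fills in
 * for one domain, based on the generic ib_sig_domain T10-DIF API.  The
 * exact field choices below are assumptions, not the original
 * implementation.
 */
static void example_set_dif_domain(struct blk_integrity *bi,
		struct nvme_command *cmd, struct ib_sig_domain *domain)
{
	domain->sig_type = IB_SIG_TYPE_T10_DIF;
	domain->sig.dif.bg_type = IB_T10DIF_CRC;
	domain->sig.dif.pi_interval = 1 << bi->interval_exp;
	domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
	domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
	domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
}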
if (rsp->req.sg != rsp->cmd->inline_sg)
nvmet_req_free_sgls(&rsp->req);
if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
nvmet_rdma_process_wr_wait_list(queue);
nvmet_rdma_put_rsp(rsp);
}
static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * We haven't set up the controller yet (e.g. on an admin
		 * connect error), so just disconnect and clean up the
		 * queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}
	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_rdma_rw_ctx_destroy(rsp);
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA WRITE for CQE failed with status %s (%d).\n",
				ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}
	/*
	 * Upon RDMA completion check the signature status:
	 * - if it succeeded, send a good NVMe response
	 * - if it failed, send a bad NVMe response with the appropriate error
	 */
	status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr);
	if (unlikely(status))
		rsp->req.cqe->status = cpu_to_le16(status << 1);

	nvmet_rdma_rw_ctx_destroy(rsp);
	spin_lock_irqsave(&queue->state_lock, flags);
	/*
	 * Recheck that the queue state is not live, to avoid racing with
	 * the RDMA_CM_EVENT_ESTABLISHED handler.
	 */
	if (queue->state == NVMET_RDMA_Q_LIVE)
		ret = false;
	else if (queue->state == NVMET_RDMA_Q_CONNECTING)
		list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
	else
		nvmet_rdma_put_rsp(rsp);
	spin_unlock_irqrestore(&queue->state_lock, flags);
	return ret;
}
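/*
 * Editor's note: a true return tells the RECV completion handler below
 * that the response was either parked on rsp_wait_list or dropped, and
 * must not be processed any further.
 */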
	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}
	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}
	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * We get here only under memory pressure: silently drop
		 * the command and have the host retry, as we can't even
		 * fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
rsp->queue = queue;
rsp->cmd = cmd;
rsp->flags = 0;
rsp->req.cmd = cmd->nvme_cmd;
rsp->req.port = queue->port;
rsp->n_rdma = 0;
rsp->invalidate_rkey = 0;
	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) &&
	    nvmet_rdma_recv_not_live(queue, rsp))
		return;
static int nvmet_rdma_init_srqs(struct nvmet_rdma_device *ndev)
{
	int i, ret;

	if (!ndev->device->attrs.max_srq_wr || !ndev->device->attrs.max_srq) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srqs = kcalloc(ndev->srq_count, sizeof(*ndev->srqs), GFP_KERNEL);
	if (!ndev->srqs)
		return -ENOMEM;
	for (i = 0; i < ndev->srq_count; i++) {
		ndev->srqs[i] = nvmet_rdma_init_srq(ndev);
		if (IS_ERR(ndev->srqs[i])) {
			ret = PTR_ERR(ndev->srqs[i]);
			goto err_srq;
		}
	}

	return 0;

err_srq:
	while (--i >= 0)
		nvmet_rdma_destroy_srq(ndev->srqs[i]);
	kfree(ndev->srqs);
	return ret;
}
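/*
 * Editor's note: the unwind above destroys only the SRQs that were
 * created successfully; the entry that failed holds an ERR_PTR() and
 * is skipped because the walk starts at --i.
 */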
	if (!queue->nsrq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	nvmet_cq_init(&queue->nvme_cq);
	ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedule the actual release from a work item, because calling
	 * rdma_destroy_id() from inside a CM callback would trigger a
	 * deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
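	/*
	 * Editor's note: queue->release_work is presumably what later
	 * calls rdma_destroy_id() from workqueue context; see the
	 * queue_work(nvmet_wq, &queue->release_work) call in the
	 * connect-fail path below.
	 */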
queue->dev = ndev;
queue->cm_id = cm_id;
queue->port = port->nport;
	queue->idx = ida_alloc(&nvmet_rdma_queue_ida, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}
	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
queue->comp_vector = !queue->host_qid ? 0 :
queue->idx % ndev->device->num_comp_vectors;
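	/*
	 * Editor's example: with num_comp_vectors == 4, I/O queues with
	 * idx 1, 2, 3, 4, 5 land on vectors 1, 2, 3, 0, 1, while admin
	 * queues (host_qid == 0) always use vector 0.
	 */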
	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}
if (ndev->srqs) {
queue->nsrq = ndev->srqs[queue->comp_vector % ndev->srq_count];
} else {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}
	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}
	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	if (queue->host_qid == 0) {
		struct nvmet_rdma_queue *q;
		int pending = 0;

		/* Check for pending controller teardown */
		mutex_lock(&nvmet_rdma_queue_mutex);
		list_for_each_entry(q, &nvmet_rdma_queue_list, queue_list) {
			if (q->nvme_sq.ctrl == queue->nvme_sq.ctrl &&
			    q->state == NVMET_RDMA_Q_DISCONNECTING)
				pending++;
		}
		mutex_unlock(&nvmet_rdma_queue_mutex);
		if (pending > NVMET_RDMA_BACKLOG)
			return NVME_SC_CONNECT_CTRL_BUSY;
	}
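	/*
	 * Editor's note: the check above refuses the connect with
	 * NVME_SC_CONNECT_CTRL_BUSY while too many queues of the same
	 * controller are still disconnecting, so the host backs off and
	 * retries instead of stacking up teardowns.
	 */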
	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		/*
		 * Don't destroy the cm_id in the free path, as we
		 * implicitly destroy it here by returning a non-zero rc.
		 */
		queue->cm_id = NULL;
		goto free_queue;
	}
	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
list_del_init(&queue->queue_list);
mutex_unlock(&nvmet_rdma_queue_mutex);
pr_err("failed to connect queue %d\n", queue->idx);
queue_work(nvmet_wq, &queue->release_work);
}
/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroy
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id: we have registered an ib_client
		 * to handle queue removal, so don't interfere and just
		 * return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that a future remove_port
	 * won't invoke a double cm_id destroy; use an atomic xchg so we
	 * don't race with remove_port.
	 */
	if (xchg(&port->cm_id, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID.  What a great API design..
	 */
	return 1;
}
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
		if (!queue) {
			struct nvmet_rdma_port *port = cm_id->context;

			queue_delayed_work(nvmet_wq, &port->repair_work, 0);
			break;
		}
		fallthrough;
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			rdma_reject_msg(cm_id, event->status));
		fallthrough;
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}
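	/*
	 * Editor's note: a non-zero return from this handler makes the
	 * RDMA-CM core destroy the cm_id itself, which is exactly what
	 * nvmet_rdma_device_removal() relies on above.
	 */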
	/*
	 * Destroy the remaining queues, which do not belong to any
	 * controller yet.  Doing this after the RDMA-CM ID was destroyed
	 * guarantees that no new queue will be created.
	 */
	nvmet_rdma_destroy_port_queues(port);
}
	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, NVMET_RDMA_BACKLOG);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n", addr, ret);
		goto out_destroy_id;
	}
	switch (nport->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
			nport->disc_addr.adrfam);
		ret = -EINVAL;
		goto out_free_port;
	}
if (nport->inline_data_size < 0) {
nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
pr_warn("inline_data_size %u is too large, reducing to %u\n",
nport->inline_data_size,
NVMET_RDMA_MAX_INLINE_DATA_SIZE);
nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
}
if (nport->max_queue_size < 0) {
nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE;
	} else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) {
pr_warn("max_queue_size %u is too large, reducing to %u\n",
nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE);
nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
}
	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
			nport->disc_addr.trsvcid, &port->addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
		goto out_free_port;
	}

	ret = nvmet_rdma_enable_port(port);
	if (ret)
		goto out_free_port;
pr_info("enabling port %d (%pISpcs)\n",
le16_to_cpu(nport->disc_addr.portid),
(struct sockaddr *)&port->addr);
mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
mutex_unlock(&device_list_mutex);
	if (!found)
		return;
	/*
	 * An IB device used by nvmet controllers is being removed;
	 * delete all queues using this device.
	 */
mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
			queue_list) {
		if (queue->dev->device != ib_device)
			continue;
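		/*
		 * Editor's note: the _safe iterator matters here because
		 * the (elided) remainder of this loop presumably removes
		 * matching queues from nvmet_rdma_queue_list while
		 * walking it.
		 */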