// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA transport layer based on the trans_fd.c implementation.
 *
 * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
 * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
 * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
 */
/**
 * struct p9_trans_rdma - RDMA transport instance
 *
 * @state: tracks the transport state machine for connection setup and tear down
 * @cm_id: The RDMA CM ID
 * @pd: Protection Domain pointer
 * @qp: Queue Pair pointer
 * @cq: Completion Queue pointer
 * @timeout: Number of uSecs to wait for connection management events
 * @privport: Whether a privileged port may be used
 * @port: The port to use
 * @sq_depth: The depth of the Send Queue
 * @sq_sem: Semaphore for the SQ
 * @rq_depth: The depth of the Receive Queue.
 * @rq_sem: Semaphore for the RQ
 * @excess_rc: Amount of posted Receive Contexts without a pending request.
 *             See rdma_request()
 * @addr: The remote peer's address
 * @req_lock: Protects the active request list
 * @cm_done: Completion event for connection management tracking
 */
struct p9_trans_rdma {
	/* Connection life-cycle states, advanced by the CM event handler. */
	enum {
		P9_RDMA_INIT,
		P9_RDMA_ADDR_RESOLVED,
		P9_RDMA_ROUTE_RESOLVED,
		P9_RDMA_CONNECTED,
		P9_RDMA_FLUSHING,
		P9_RDMA_CLOSING,
		P9_RDMA_CLOSED,
	} state;
	struct rdma_cm_id *cm_id;
	struct ib_pd *pd;
	struct ib_qp *qp;
	struct ib_cq *cq;
	long timeout;
	bool privport;
	u16 port;
	int sq_depth;
	struct semaphore sq_sem;
	int rq_depth;
	struct semaphore rq_sem;
	atomic_t excess_rc;
	struct sockaddr_in addr;
	spinlock_t req_lock;
	struct completion cm_done;
};
struct p9_rdma_req;
/**
 * struct p9_rdma_context - Keeps track of in-process WR
 *
 * @cqe: completion queue entry
 * @busa: Bus address to unmap when the WR completes
 * @req: Keeps track of requests (send)
 * @rc: Keeps track of replies (receive)
 */
struct p9_rdma_context {
	struct ib_cqe cqe;
	dma_addr_t busa;
	/* A context tracks either a send (request) or a receive (reply),
	 * never both, so the two share storage.
	 */
	union {
		struct p9_req_t *req;
		struct p9_fcall rc;
	};
};
/**
 * struct p9_rdma_opts - Collection of mount options
 * @port: port of connection
 * @privport: Whether a privileged port may be used
 * @sq_depth: The requested depth of the SQ. This really doesn't need
 *            to be any deeper than the number of threads used in the client
 * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
 * @timeout: Time to wait in msecs for CM events
 */
struct p9_rdma_opts {
	short port;
	bool privport;
	int sq_depth;
	int rq_depth;
	long timeout;
};
/*
 * Option Parsing (code inspired by NFS code)
 */
enum {
	/* Options that take integer arguments */
	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
	/* Options that take no argument */
	Opt_privport,
	Opt_err,
};
/* (fragment of parse_opts — the signature and local declarations are not
 * visible in this chunk)
 */
/* Duplicate the caller's option string: strsep() below consumes/mutates it. */
tmp_options = kstrdup(params, GFP_KERNEL);
if (!tmp_options) {
	p9_debug(P9_DEBUG_ERROR, "failed to allocate copy of option string\n");
	return -ENOMEM;
}
options = tmp_options;

/* Walk the comma-separated option list; unrecognised tokens are skipped. */
while ((p = strsep(&options, ",")) != NULL) {
	int token;
	int r;
	if (!*p)
		continue;
	token = match_token(p, tokens, args);
	/* Every recognised option except privport carries an integer value. */
	if ((token != Opt_err) && (token != Opt_privport)) {
		r = match_int(&args[0], &option);
		if (r < 0) {
			/* Malformed value: ignore this option instead of
			 * failing the whole parse.
			 */
			p9_debug(P9_DEBUG_ERROR,
				 "integer field, but no integer?\n");
			continue;
		}
	}
	switch (token) {
	case Opt_port:
		opts->port = option;
		break;
	case Opt_sq_depth:
		opts->sq_depth = option;
		break;
	case Opt_rq_depth:
		opts->rq_depth = option;
		break;
	case Opt_timeout:
		opts->timeout = option;
		break;
	case Opt_privport:
		opts->privport = true;
		break;
	default:
		continue;
	}
}
/* RQ must be at least as large as the SQ */
opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
kfree(tmp_options);
return 0;
}
/* (fragment of the CM event handler switch — earlier cases are not visible
 * in this chunk)
 */
case RDMA_CM_EVENT_ROUTE_RESOLVED:
	/* State machine must advance strictly: addr -> route. */
	BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
	rdma->state = P9_RDMA_ROUTE_RESOLVED;
	break;

case RDMA_CM_EVENT_ESTABLISHED:
	BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
	rdma->state = P9_RDMA_CONNECTED;
	break;

case RDMA_CM_EVENT_DISCONNECTED:
	if (rdma)
		rdma->state = P9_RDMA_CLOSED;
	c->status = Disconnected;
	break;

case RDMA_CM_EVENT_TIMEWAIT_EXIT:
	break;

/* Everything below is treated as a fatal connection error: mark the
 * client disconnected and tear the CM connection down.
 */
case RDMA_CM_EVENT_ADDR_CHANGE:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_MULTICAST_JOIN:
case RDMA_CM_EVENT_MULTICAST_ERROR:
case RDMA_CM_EVENT_REJECTED:
case RDMA_CM_EVENT_CONNECT_REQUEST:
case RDMA_CM_EVENT_CONNECT_RESPONSE:
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
	c->status = Disconnected;
	/* NOTE(review): unlike the DISCONNECTED case above, rdma is
	 * dereferenced here without a NULL check — confirm rdma cannot be
	 * NULL for these events.
	 */
	rdma_disconnect(rdma->cm_id);
	break;
default:
	BUG();
}
/* Wake anyone waiting on this connection-management transition. */
complete(&rdma->cm_done);
return 0;
}
/* (fragment — appears to belong to the receive-completion handler, which is
 * otherwise not visible in this chunk)
 */
/* Map the 9P tag from the reply back to its outstanding request. */
req = p9_tag_lookup(client, tag);
if (!req)
	goto err_out;

/* Check that we have not yet received a reply for this request.
 */
if (unlikely(req->rc.sdata)) {
	pr_err("Duplicate reply for request %d", tag);
	goto err_out;
}
/* (fragment of rdma_request — the function head, post_recv call, send-WR
 * construction and the `dont_need_post_recv` label are not visible in this
 * chunk)
 */
/* When an error occurs between posting the recv and the send,
 * there will be a receive context posted without a pending request.
 * Since there is no way to "un-post" it, we remember it and skip
 * post_recv() for the next request.
 * So here,
 * see if we are this `next request' and need to absorb an excess rc.
 * If yes, then drop and free our own, and do not recv_post().
 */
if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
	if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
		/* Got one! */
		p9_fcall_fini(&req->rc);
		req->rc.sdata = NULL;
		goto dont_need_post_recv;
	} else {
		/* We raced and lost. Restore the count we decremented. */
		atomic_inc(&rdma->excess_rc);
	}
}

/* Allocate an fcall for the reply */
rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
if (!rpl_context) {
	err = -ENOMEM;
	goto recv_error;
}
rpl_context->rc.sdata = req->rc.sdata;

/*
 * Post a receive buffer for this request. We need to ensure
 * there is a reply buffer available for every outstanding
 * request. A flushed request can result in no reply for an
 * outstanding request, so we must keep a count to avoid
 * overflowing the RQ.
 */
if (down_interruptible(&rdma->rq_sem)) {
	err = -EINTR;
	goto recv_error;
}
/* NOTE(review): the actual post_recv() and send-WR setup that upstream has
 * between the two semaphores appear to be missing from this chunk — confirm
 * against the full file.
 */

if (down_interruptible(&rdma->sq_sem)) {
	err = -EINTR;
	goto dma_unmap;
}

/* Mark request as `sent' *before* we actually send it,
 * because doing if after could erase the REQ_STATUS_RCVD
 * status in case of a very fast reply.
 */
WRITE_ONCE(req->status, REQ_STATUS_SENT);
err = ib_post_send(rdma->qp, &wr, NULL);
if (err)
	goto dma_unmap;

/* Success */
return 0;

/* Send failed after the DMA mapping was set up: undo the mapping. */
dma_unmap:
	ib_dma_unmap_single(rdma->cm_id->device, c->busa,
			    c->req->tc.size, DMA_TO_DEVICE);
/* Handle errors that happened during or while preparing the send: */
send_error:
	WRITE_ONCE(req->status, REQ_STATUS_ERROR);
	kfree(c);
	p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);

	/* Ach.
	 * We did recv_post(), but not send. We have one recv_post in excess.
	 */
	atomic_inc(&rdma->excess_rc);
	return err;

/* Handle errors that happened during or while preparing post_recv(): */
recv_error:
	kfree(rpl_context);
	spin_lock_irqsave(&rdma->req_lock, flags);
	/* An interrupted wait is benign; anything else forces the
	 * connection into CLOSING (once, guarded by req_lock).
	 */
	if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
		rdma->state = P9_RDMA_CLOSING;
		spin_unlock_irqrestore(&rdma->req_lock, flags);
		rdma_disconnect(rdma->cm_id);
	} else
		spin_unlock_irqrestore(&rdma->req_lock, flags);
	return err;
}
/**
 * rdma_cancel - Transport method invoked when a request is flushed
 * @client: client instance (unused)
 * @req: the request being cancelled (unused)
 *
 * Returns 1: nothing is done here; any necessary cleanup happens in
 * rdma_cancelled() once the flush completes without a reply.
 */
static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
	/* Nothing to do here.
	 * We will take care of it (if we have to) in rdma_cancelled()
	 */
	return 1;
}
/* A request has been fully flushed without a reply.
 * That means we have posted one receive buffer in excess; bump the
 * excess_rc count so rdma_request() can absorb it and skip its next
 * post_recv() (see the comment in rdma_request()).
 */
static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *rdma = client->trans;

	atomic_inc(&rdma->excess_rc);
	return 0;
}
/* (fragment of p9_rdma_bind_privport — the function head and the setup of
 * the local sockaddr `cl` are not visible in this chunk)
 */
/* Walk the reserved port range from high to low, trying each until one
 * binds or a failure other than "address in use" occurs.
 */
for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
	cl.sin_port = htons((ushort)port);
	err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
	if (err != -EADDRINUSE)
		break;
}
return err;
}
/**
 * rdma_create_trans - Transport method for creating a transport instance
 * @client: client instance
 * @addr: IP address string
 * @args: Mount options string
 */
staticint
rdma_create_trans(struct p9_client *client, constchar *addr, char *args)
{
	int err;
	struct p9_rdma_opts opts;
	struct p9_trans_rdma *rdma;
	struct rdma_conn_param conn_param;
	struct ib_qp_init_attr qp_attr;

	/* NOTE(review): "staticint" and "constchar" above look like tokens
	 * fused by an extraction/formatting accident — presumably
	 * "static int" and "const char"; confirm against upstream.
	 */
	if (addr == NULL)
		return -EINVAL;

	/* Parse the transport specific mount options */
	err = parse_opts(args, &opts);
	if (err < 0)
		return err;

	/* Create and initialize the RDMA transport structure */
	rdma = alloc_rdma(&opts);
	if (!rdma)
		return -ENOMEM;

	/* Create the RDMA CM ID */
	rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
				     RDMA_PS_TCP, IB_QPT_RC);
	/* NOTE(review): err is 0 at this point; the IS_ERR branches below
	 * jump to the error path without setting a negative errno first —
	 * verify against the upstream fix for this.
	 */
	if (IS_ERR(rdma->cm_id))
		goto error;

	/* Associate the client with the transport */
	client->trans = rdma;

	/* Bind to a privileged port if we need to */
	if (opts.privport) {
		err = p9_rdma_bind_privport(rdma);
		if (err < 0) {
			pr_err("%s (%d): problem binding to privport: %d\n",
			       __func__, task_pid_nr(current), -err);
			goto error;
		}
	}

	/* Resolve the route to the server */
	/* NOTE(review): no rdma_resolve_addr() step is visible before this
	 * point in the chunk; without it the ADDR_RESOLVED state would never
	 * be reached — it may have been lost. Confirm against the full file.
	 */
	err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
		goto error;

	/* Create the Completion Queue */
	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
				   opts.sq_depth + opts.rq_depth + 1,
				   IB_POLL_SOFTIRQ);
	if (IS_ERR(rdma->cq))
		goto error;

	/* Create the Protection Domain */
	rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
	if (IS_ERR(rdma->pd))
		goto error;
	/* NOTE(review): the function is truncated here in this chunk — QP
	 * creation, rdma_connect(), the success return and the `error:`
	 * label are not visible.
	 */
/*
 * NOTE(review): the following German disclaimer text is website boilerplate
 * accidentally appended during extraction and is not part of this source
 * file. Translation: "The information on this website has been carefully
 * compiled to the best of our knowledge. However, neither completeness nor
 * correctness nor quality of the information provided is guaranteed.
 * Remark: the colored syntax highlighting and the measurement are still
 * experimental."
 */