/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include <net/addrconf.h>
#include <rdma/ib_cm.h>

#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"
/*
 * Connection established.
 * We get here for both outgoing and incoming connection.
 */
void rds_ib_cm_connect_complete(struct rds_connection *conn,
                                struct rdma_cm_event *event)
{
        struct rds_ib_connection *ic = conn->c_transport_data;
        const union rds_ib_conn_priv *dp = NULL;
        __be64 ack_seq = 0;
        __be32 credit = 0;
        u8 major = 0;
        u8 minor = 0;
        int err;
        dp = event->param.conn.private_data;
        if (conn->c_isv6) {
                if (event->param.conn.private_data_len >=
                    sizeof(struct rds6_ib_connect_private)) {
major = dp->ricp_v6.dp_protocol_major;
minor = dp->ricp_v6.dp_protocol_minor;
                        credit = dp->ricp_v6.dp_credit;

                        /* The start of dp is not guaranteed to be 8-byte
                         * aligned.  dp_ack_seq is 64 bits, so read it with
                         * get_unaligned() to avoid unaligned-access faults.
                         */
ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
}
        } else if (event->param.conn.private_data_len >=
                   sizeof(struct rds_ib_connect_private)) {
major = dp->ricp_v4.dp_protocol_major;
minor = dp->ricp_v4.dp_protocol_minor;
credit = dp->ricp_v4.dp_credit;
ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
}
        /* make sure it isn't empty data */
        if (major) {
rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
rds_ib_set_flow_control(conn, be32_to_cpu(credit));
}
        if (conn->c_version < RDS_PROTOCOL_VERSION) {
                if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
                        rds_conn_destroy(conn);
                        return;
}
}
/* receive sl from the peer */
ic->i_sl = ic->i_cm_id->route.path_rec->sl;
atomic_set(&ic->i_cq_quiesce, 0);
        /* Init rings and fill recv.  This needs to wait until protocol
         * negotiation is complete, since ring layout is different
         * from 3.1 to 4.1.
         */
rds_ib_send_init_ring(ic);
        rds_ib_recv_init_ring(ic);

        /* Post receive buffers - as a side effect, this will update
         * the posted credit count.
         */
rds_ib_recv_refill(conn, 1, GFP_KERNEL);
/* update ib_device with this local ipaddr */
        err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
        if (err)
                printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK.
         */
        if (dp) {
                if (ack_seq)
                        rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
                                            NULL);
        }

        rds_connect_complete(conn);
}
/* Plucking the oldest entry from the ring can be done concurrently with
 * the thread refilling the ring.  Each ring operation is protected by
 * spinlocks and the transient state of refilling doesn't change the
 * recording of which entry is oldest.
 *
 * This relies on IB only calling one cq comp_handler for each cq so that
 * there will only be one caller of rds_recv_incoming() per RDS connection.
 */
static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
{
        struct rds_connection *conn = context;
        struct rds_ib_connection *ic = conn->c_transport_data;

        rdsdebug("conn %p cq %p\n", conn, cq);

        rds_ib_stats_inc(s_ib_evt_handler_call);

        tasklet_schedule(&ic->i_recv_tasklet);
}
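/* The completion path is therefore: the HCA raises one interrupt per armed
 * CQ, the handler above only bumps a stat and schedules a tasklet, and the
 * tasklet does the actual polling.  A minimal sketch of the polling side,
 * where process_completion() stands in for the real per-WC work done in
 * ib_recv.c (the helper name is illustrative, not an existing function):
 *
 *      struct ib_wc wc;
 *
 *      while (ib_poll_cq(cq, 1, &wc) > 0)
 *              process_completion(&wc);
 *      ib_req_notify_cq(cq, IB_CQ_SOLICITED);
 *
 * The real code re-arms the CQ and polls once more afterwards, closing the
 * race where a completion lands between the last poll and the re-arm.
 */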
static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
{
        int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
        int index = rds_ibdev->dev->num_comp_vectors - 1;
        int i;
        for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
                if (rds_ibdev->vector_load[i] < min) {
index = i;
min = rds_ibdev->vector_load[i];
}
}
        rds_ibdev->vector_load[index]++;

        return index;
}
static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
{
rds_ibdev->vector_load[index]--;
}
/* Free the DMA memory used to store struct rds_header.
 *
 * @dev: the RDS IB device
 * @hdrs: pointer to the array storing DMA memory pointers
 * @dma_addrs: pointer to the array storing DMA addresses
 * @num_hdrs: number of headers to free
 * @dir: DMA direction the headers were mapped for
 */
static void rds_dma_hdrs_free(struct rds_ib_device *dev,
                              struct rds_header **hdrs,
                              dma_addr_t *dma_addrs, u32 num_hdrs,
                              enum dma_data_direction dir)
{
u32 i;
for (i = 0; i < num_hdrs; i++)
rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
kvfree(hdrs);
kvfree(dma_addrs);
}
/* Allocate DMA coherent memory to be used to store struct rds_header for
 * sending/receiving packets.  The pointers to the DMA memory and the
 * associated DMA addresses are stored in two arrays.
 *
 * @dev: the RDS IB device
 * @dma_addrs: pointer to the array for storing DMA addresses
 * @num_hdrs: number of headers to allocate
 * @dir: DMA direction the headers will be mapped for
 *
 * It returns the pointer to the array storing the DMA memory pointers.  On
 * error, a NULL pointer is returned.
 */
static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
                                              dma_addr_t **dma_addrs,
                                              u32 num_hdrs,
                                              enum dma_data_direction dir)
{
        struct rds_header **hdrs;
        dma_addr_t *hdr_daddrs;
        u32 i;

        /* Both arrays must be allocated before the per-header loop below
         * can fill them in.
         */
        hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
                             ibdev_to_node(dev->dev));
        if (!hdrs)
                return NULL;

        hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
                                   ibdev_to_node(dev->dev));
        if (!hdr_daddrs) {
                kvfree(hdrs);
                return NULL;
        }

        for (i = 0; i < num_hdrs; i++) {
                hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
                if (!hdrs[i]) {
                        rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
                        return NULL;
}
}
        *dma_addrs = hdr_daddrs;

        return hdrs;
}
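/* Alloc and free are symmetric: a header set obtained from
 * rds_dma_hdrs_alloc() with a given (num_hdrs, dir) pair must be released
 * through rds_dma_hdrs_free() with the same count and direction, e.g. for
 * the send side (the hdrs/daddrs/n names are illustrative):
 *
 *      hdrs = rds_dma_hdrs_alloc(rds_ibdev, &daddrs, n, DMA_TO_DEVICE);
 *      ...
 *      rds_dma_hdrs_free(rds_ibdev, hdrs, daddrs, n, DMA_TO_DEVICE);
 *
 * The direction must match because each header is DMA-mapped with @dir, and
 * the unmap at free time has to use the same direction as the map.
 */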
/*
 * This needs to be very careful to not leave IS_ERR pointers around for
 * cleanup to trip over.
 */
static int rds_ib_setup_qp(struct rds_connection *conn)
{
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct ib_device *dev = ic->i_cm_id->device;
        struct ib_qp_init_attr attr;
        struct ib_cq_init_attr cq_attr = {};
        struct rds_ib_device *rds_ibdev;
        unsigned long max_wrs;
        int ret, fr_queue_space;
        /*
         * It's normal to see a null device if an incoming connection races
         * with device removal, so we don't print a warning.
         */
        rds_ibdev = rds_ib_get_client_data(dev);
        if (!rds_ibdev)
                return -EOPNOTSUPP;
        /* The fr_queue_space is currently set to 512, to add extra space on
         * the completion queue and send queue.  This extra space is used for
         * FRWR registration and invalidation work requests.
         */
fr_queue_space = RDS_IB_DEFAULT_FR_WR;
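        /* fr_queue_space is added on top of the ring size wherever an FRWR
         * work request might land; the send CQ and the QP send queue are
         * sized identically (cqe is the field of struct ib_cq_init_attr,
         * and the "+ 1" is the single ack message):
         *
         *      cq_attr.cqe          = i_send_ring.w_nr + fr_queue_space + 1;
         *      attr.cap.max_send_wr = i_send_ring.w_nr + fr_queue_space + 1;
         *
         * so a burst of REG/INV work requests can never overflow a CQ that
         * was sized only for data sends.
         */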
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
        ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
        if (ret) {
                rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
                goto recv_cq_out;
}
        ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
        if (ret) {
                rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
                goto recv_cq_out;
}
/* XXX negotiate max send/recv with remote? */
        memset(&attr, 0, sizeof(attr));
        attr.event_handler = rds_ib_qp_event_handler;
        attr.qp_context = conn;
        /* + 1 to allow for the single ack message */
attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
attr.sq_sig_type = IB_SIGNAL_REQ_WR;
attr.qp_type = IB_QPT_RC;
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
        /*
         * XXX this can fail if max_*_wr is too large?  Are we supposed
         * to back off until we get a value that the hardware can support?
         */
        ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
        if (ret) {
                rdsdebug("rdma_create_qp failed: %d\n", ret);
                goto recv_cq_out;
}
        ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
                                             ic->i_send_ring.w_nr,
                                             DMA_TO_DEVICE);
        if (!ic->i_send_hdrs) {
                ret = -ENOMEM;
                rdsdebug("DMA send hdrs alloc failed\n");
                goto qp_out;
}
        ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
                                             ic->i_recv_ring.w_nr,
                                             DMA_FROM_DEVICE);
        if (!ic->i_recv_hdrs) {
                ret = -ENOMEM;
                rdsdebug("DMA recv hdrs alloc failed\n");
                goto send_hdrs_dma_out;
}
        ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
                                      DMA_TO_DEVICE);
        if (!ic->i_ack) {
                ret = -ENOMEM;
                rdsdebug("DMA ack header alloc failed\n");
                goto recv_hdrs_dma_out;
}
        ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work),
                                              ic->i_send_ring.w_nr),
                                   ibdev_to_node(dev));
        if (!ic->i_sends) {
                ret = -ENOMEM;
                rdsdebug("send allocation failed\n");
                goto ack_dma_out;
}
        ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work),
                                              ic->i_recv_ring.w_nr),
                                   ibdev_to_node(dev));
        if (!ic->i_recvs) {
                ret = -ENOMEM;
                rdsdebug("recv allocation failed\n");
                goto sends_out;
        }

        rds_ib_recv_init_ack(ic);

        rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
                 ic->i_send_cq, ic->i_recv_cq);

        return ret;

sends_out:
        vfree(ic->i_sends);
ack_dma_out:
        rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
                         DMA_TO_DEVICE);
        ic->i_ack = NULL;
recv_hdrs_dma_out:
        rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
                          ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
        ic->i_recv_hdrs = NULL;
        ic->i_recv_hdrs_dma = NULL;
send_hdrs_dma_out:
        rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
                          ic->i_send_ring.w_nr, DMA_TO_DEVICE);
        ic->i_send_hdrs = NULL;
        ic->i_send_hdrs_dma = NULL;
qp_out:
        rdma_destroy_qp(ic->i_cm_id);
recv_cq_out:
        ib_destroy_cq(ic->i_recv_cq);
        ic->i_recv_cq = NULL;
        ib_destroy_cq(ic->i_send_cq);
        ic->i_send_cq = NULL;
        rds_ib_remove_conn(rds_ibdev, conn);
        rds_ib_dev_put(rds_ibdev);

        return ret;
}
/*
 * rdma_cm private data is odd - when there is any private data in the
 * request, we will be given a pretty large buffer without telling us the
 * original size.  The only way to tell the difference is by looking at
 * the contents, which are initialized to zero.
 * If the protocol version fields aren't set, this is a connection attempt
 * from an older version.  This could be 3.0 or 2.0 - we can't tell.
 * We really should have changed this for OFED 1.3 :-(
 */
static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
{
        const union rds_ib_conn_priv *dp = event->param.conn.private_data;
        u8 data_len, major, minor;
        u32 version = 0;
        __be16 mask;
        u16 common;
        /* Be paranoid.  RDS always has privdata */
        if (!event->param.conn.private_data_len) {
                printk(KERN_NOTICE "RDS incoming connection has no private data, rejecting\n");
                return 0;
}
if (isv6) {
data_len = sizeof(struct rds6_ib_connect_private);
major = dp->ricp_v6.dp_protocol_major;
minor = dp->ricp_v6.dp_protocol_minor;
mask = dp->ricp_v6.dp_protocol_minor_mask;
} else {
data_len = sizeof(struct rds_ib_connect_private);
major = dp->ricp_v4.dp_protocol_major;
minor = dp->ricp_v4.dp_protocol_minor;
mask = dp->ricp_v4.dp_protocol_minor_mask;
}
        /* Even if len is crap *now* I still want to check it. -ASG */
        if (event->param.conn.private_data_len < data_len || major == 0)
                return RDS_PROTOCOL_4_0;
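        /* Worked example of the negotiation below: if the peer advertises
         * minor mask 0x0003 (minors 0 and 1) and we support the same,
         * common = 0x0003; starting from 4.0, the loop shifts once before
         * common reaches zero, so the agreed version is RDS_PROTOCOL(4, 1).
         */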
        common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
        if (major == 4 && common) {
                version = RDS_PROTOCOL_4_0;
                while ((common >>= 1) != 0)
                        version++;
        } else if (RDS_PROTOCOL_COMPAT_VERSION ==
RDS_PROTOCOL(major, minor)) {
version = RDS_PROTOCOL_COMPAT_VERSION;
        } else {
                if (isv6)
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
                                           &dp->ricp_v6.dp_saddr, major, minor);
                else
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
&dp->ricp_v4.dp_saddr, major, minor);
        }

        return version;
}
#if IS_ENABLED(CONFIG_IPV6)
/* Given an IPv6 address, find the net_device which hosts that address and
 * return its index.  This is used by the rds_ib_cm_handle_connect() code to
 * find the interface index of where an incoming request comes from when
 * the request is using a link local address.
 *
 * Note one problem in this search.  It is possible that two interfaces have
 * the same link local address.  Unfortunately, this cannot be solved unless
 * the underlying layer gives us the interface which an incoming RDMA connect
 * request comes from.
 */
static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
{
        struct net_device *dev;
        int idx = 0;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                if (ipv6_chk_addr(net, addr, dev, 1)) {
                        idx = dev->ifindex;
                        break;
                }
        }
        rcu_read_unlock();

        return idx;
}
#endif

int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                             struct rdma_cm_event *event, bool isv6)
{
        const struct rds_ib_conn_priv_cmn *dp_cmn;
        struct rds_connection *conn = NULL;
        struct rds_ib_connection *ic = NULL;
        struct rdma_conn_param conn_param;
        const union rds_ib_conn_priv *dp;
        union rds_ib_conn_priv dp_rep;
        struct in6_addr s_mapped_addr;
        struct in6_addr d_mapped_addr;
        const struct in6_addr *saddr6;
        const struct in6_addr *daddr6;
        int destroy = 1;
        u32 ifindex = 0;
        u32 version;
        int err = 1;
/* Check whether the remote protocol version matches ours. */
        version = rds_ib_protocol_compatible(event, isv6);
        if (!version) {
                err = RDS_RDMA_REJ_INCOMPAT;
                goto out;
}
        dp = event->param.conn.private_data;
        if (isv6) {
#if IS_ENABLED(CONFIG_IPV6)
dp_cmn = &dp->ricp_v6.dp_cmn;
saddr6 = &dp->ricp_v6.dp_saddr;
                daddr6 = &dp->ricp_v6.dp_daddr;

                /* If either address is link local, need to find the
                 * interface index in order to create a proper RDS
                 * connection.
                 */
                if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
                        /* Using init_net for now .. */
                        ifindex = __rds_find_ifindex(&init_net, daddr6);
                        /* No index found...  Need to bail out. */
                        if (ifindex == 0) {
                                err = -EOPNOTSUPP;
                                goto out;
}
                } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
                        /* Use our address to find the correct index. */
                        ifindex = __rds_find_ifindex(&init_net, daddr6);
                        /* No index found...  Need to bail out. */
                        if (ifindex == 0) {
                                err = -EOPNOTSUPP;
                                goto out;
}
                }
#else
                err = -EOPNOTSUPP;
                goto out;
#endif
} else {
dp_cmn = &dp->ricp_v4.dp_cmn;
ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
saddr6 = &s_mapped_addr;
daddr6 = &d_mapped_addr;
}
/* RDS/IB is not currently netns aware, thus init_net */
conn = rds_conn_create(&init_net, daddr6, saddr6,
&rds_ib_transport, dp_cmn->ricpc_dp_toss,
                               GFP_KERNEL, ifindex);
        if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
                conn = NULL;
                goto out;
}
        /*
         * The connection request may occur while the
         * previous connection exists, e.g. in case of failover.
         * But as connections may be initiated simultaneously
         * by both hosts, we have a random backoff mechanism -
         * see the comment above rds_queue_reconnect().
         */
        mutex_lock(&conn->c_cm_lock);
        if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
                if (rds_conn_state(conn) == RDS_CONN_UP) {
rdsdebug("incoming connect while connecting\n");
rds_conn_drop(conn);
rds_ib_stats_inc(s_ib_listen_closed_stale);
                } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
                        /* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced);
                }
                goto out;
}
        ic = conn->c_transport_data;

        rds_ib_set_protocol(conn, version);
        rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));

        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK.
         */
        if (dp_cmn->ricpc_ack_seq)
rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
NULL);
BUG_ON(cm_id->context);
BUG_ON(ic->i_cm_id);
ic->i_cm_id = cm_id;
cm_id->context = conn;
        /* We got halfway through setting up the ib_connection, if we
         * fail now, we have to take the long route out of this mess.
         */
        destroy = 0;

        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
                                  event->param.conn.responder_resources,
                                  event->param.conn.initiator_depth, isv6);

        rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
        /* rdma_accept() calls rdma_reject() internally if it fails */
        if (rdma_accept(cm_id, &conn_param))
rds_ib_conn_error(conn, "rdma_accept failed\n");
out:
        if (conn)
                mutex_unlock(&conn->c_cm_lock);
        if (err)
                rdma_reject(cm_id, &err, sizeof(int),
                            IB_CM_REJ_CONSUMER_DEFINED);
        return destroy;
}
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
{
        struct rds_connection *conn = cm_id->context;
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rdma_conn_param conn_param;
        union rds_ib_conn_priv dp;
        int ret;
        /* Propose the highest protocol version we support; if the peer
         * doesn't do protocol negotiation we fall back later (see
         * rds_ib_protocol_compatible()).
         */
        rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
        ic->i_flowctl = rds_ib_sysctl_flow_control;     /* advertise flow control */

        ret = rds_ib_setup_qp(conn);
        if (ret) {
                rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
                goto out;
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6);
        ret = rdma_connect_locked(cm_id, &conn_param);
        if (ret)
                rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n",
                                  ret);
out:
        /* Beware - returning non-zero tells the rdma_cm to destroy
         * the cm_id.  We should certainly not do it as long as we still
         * "own" the cm_id.
         */
        if (ret) {
                if (ic->i_cm_id == cm_id)
                        ret = 0;
        }
        ic->i_active_side = true;
        return ret;
}
int rds_ib_conn_path_connect(struct rds_conn_path *cp)
{
        struct rds_connection *conn = cp->cp_conn;
        struct sockaddr_storage src, dest;
        rdma_cm_event_handler handler;
        struct rds_ib_connection *ic;
        int ret;

        ic = conn->c_transport_data;

        /* XXX I wonder what effect the port space has */
        /* delegate cm event handler to rdma_transport */
#if IS_ENABLED(CONFIG_IPV6)
        if (conn->c_isv6)
                handler = rds6_rdma_cm_event_handler;
        else
#endif
handler = rds_rdma_cm_event_handler;
ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
                                     RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
rdsdebug("rdma_create_id() failed: %d\n", ret); goto out;
}
rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
        if (ipv6_addr_v4mapped(&conn->c_faddr)) {
                struct sockaddr_in *sin;

                sin = (struct sockaddr_in *)&src;
                sin->sin_family = AF_INET;
                sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
                sin->sin_port = 0;

                sin = (struct sockaddr_in *)&dest;
                sin->sin_family = AF_INET;
                sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
                sin->sin_port = htons(RDS_CM_PORT);
        }
ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
(struct sockaddr *)&dest,
                                RDS_RDMA_RESOLVE_TIMEOUT_MS);
        if (ret) {
                rdsdebug("addr resolve failed for cm id %p: %d\n",
                         ic->i_cm_id, ret);
rdma_destroy_id(ic->i_cm_id);
ic->i_cm_id = NULL;
}
out:
        return ret;
}
/*
 * This is so careful about only cleaning up resources that were built up
 * so that it can be called at any point during startup.  In fact it
 * can be called multiple times for a given connection.
 */
void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
{
        struct rds_connection *conn = cp->cp_conn;
        struct rds_ib_connection *ic = conn->c_transport_data;
        int err = 0;
if (ic->i_cm_id) {
rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
                err = rdma_disconnect(ic->i_cm_id);
                if (err) {
                        /* Actually this may happen quite frequently, when
                         * an outgoing connect raced with an incoming connect.
                         */
rdsdebug("failed to disconnect, cm: %p err %d\n",
ic->i_cm_id, err);
}
/* kick off "flush_worker" for all pools in order to reap * all FRMR registrations that are still marked "FRMR_IS_INUSE"
*/
rds_ib_flush_mrs();
                /*
                 * We want to wait for tx and rx completion to finish
                 * before we tear down the connection, but we have to be
                 * careful not to get stuck waiting on a send ring that
                 * only has unsignaled sends in it.  We've shutdown new
                 * sends before getting here so by waiting for signaled
                 * sends to complete we're ensured that there will be no
                 * more tx processing.
                 */
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
(atomic_read(&ic->i_signaled_sends) == 0) &&
(atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
(atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
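                /* Concretely: the recv ring must be drained, every send
                 * posted with IB_SEND_SIGNALED must have completed, no FRMR
                 * may still be in use, and i_fastreg_wrs must be back at
                 * RDS_IB_DEFAULT_FR_WR, i.e. every REG/INV work-request slot
                 * has been returned.  Only then is it safe to kill the
                 * tasklets and tear down the QP and CQs below.
                 */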
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
atomic_set(&ic->i_cq_quiesce, 1);
                /* first destroy the ib state that generates callbacks */
                if (ic->i_cm_id->qp)
                        rdma_destroy_qp(ic->i_cm_id);

                if (ic->i_send_cq) {
                        if (ic->rds_ibdev)
ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
ib_destroy_cq(ic->i_send_cq);
}
                if (ic->i_recv_cq) {
                        if (ic->rds_ibdev)
ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
ib_destroy_cq(ic->i_recv_cq);
}
                if (ic->rds_ibdev) {
                        /* then free the resources that ib callbacks use */
                        if (ic->i_send_hdrs) {
rds_dma_hdrs_free(ic->rds_ibdev,
ic->i_send_hdrs,
ic->i_send_hdrs_dma,
ic->i_send_ring.w_nr,
DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
}
                }
        }
}

int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
        struct rds_ib_connection *ic;

        ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
        if (!ic)
                return -ENOMEM;

        /*
         * rds_ib_conn_shutdown() waits for these to be emptied so they
         * must be initialized before it can be called.
         */
rds_ib_ring_init(&ic->i_send_ring, 0);
rds_ib_ring_init(&ic->i_recv_ring, 0);
rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); return 0;
}
/*
 * Free a connection.  Connection must be shut down and not set for reconnect.
 */
void rds_ib_conn_free(void *arg)
{
        struct rds_ib_connection *ic = arg;
spinlock_t *lock_ptr;
rdsdebug("ic %p\n", ic);
        /*
         * Conn is either on a dev's list or on the nodev list.
         * A race with shutdown() or connect() would cause problems
         * (since rds_ibdev would change) but that should never happen.
         */
        lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;

        spin_lock_irq(lock_ptr);
        list_del(&ic->ib_node);
        spin_unlock_irq(lock_ptr);

        rds_ib_recv_free_caches(ic);

        kfree(ic);
}