// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
*/
/** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field.
*/ staticbool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason)
{ if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED;
if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED;
/*
 * Device removal can occur at any time, so we need extra handling to
 * serialize notifying the user of device removal with other callbacks.
 * We do this by disabling removal notification while a callback is in
 * progress, and reporting it after the callback completes.
*/
/* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex.
*/ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT)
lockdep_assert_held(&id_priv->handler_mutex);
node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM;
spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node);
result = compare_netdev_and_ip(
node_id_priv->id.route.addr.dev_addr.bound_dev_if,
cma_dst_addr(node_id_priv), this);
parent = *new; if (result < 0) new = &((*new)->rb_left); elseif (result > 0) new = &((*new)->rb_right); else {
list_add_tail(&node_id_priv->id_list_entry,
&this->id_list);
kfree(node); goto unlock;
}
}
if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out;
if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out;
if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out;
/* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. * * Other driver classes might be included in the future.
*/ if (rdma_protocol_iwarp(device, port)) {
sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out;
rcu_read_lock();
ndev = rcu_dereference(sgid_attr->ndev); if (ndev->ifindex != bound_if_index) {
pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); if (pdev) { if (is_vlan_dev(pdev)) {
pdev = vlan_dev_real_dev(pdev); if (ndev->ifindex == pdev->ifindex)
bound_if_index = pdev->ifindex;
} if (is_vlan_dev(ndev)) {
pdev = vlan_dev_real_dev(ndev); if (bound_if_index == pdev->ifindex)
bound_if_index = ndev->ifindex;
}
}
} if (!net_eq(dev_net(ndev), dev_addr->net) ||
ndev->ifindex != bound_if_index) {
rdma_put_gid_attr(sgid_attr);
sgid_attr = ERR_PTR(-ENODEV);
}
rcu_read_unlock(); goto out;
}
/*
 * For a RXE device, it should work with TUN device and normal ethernet
 * devices. Use driver_id to check if a device is a RXE device or not.
 * ARPHRD_NONE means a TUN device.
*/ if (device->ops.driver_id == RDMA_DRIVER_RXE) { if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER)
&& rdma_protocol_roce(device, port)) {
ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out;
}
} else { if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out;
} else {
gid_type = IB_GID_TYPE_IB;
}
}
/** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id.
*/ staticint cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
{ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; conststruct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV;
u32 port;
if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL;
/**
 * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute
 * @id_priv:		cm id to bind to cma device
 * @listen_id_priv:	listener cm id to match against
 * @req:		Pointer to req structure containing incoming
 *			request information
 * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when
 * rdma device matches for listen_id and incoming request. It also verifies
 * that a GID table entry is present for the source address.
 * Returns 0 on success, or returns error code otherwise.
*/ staticint cma_ib_acquire_dev(struct rdma_id_private *id_priv, conststruct rdma_id_private *listen_id_priv, struct cma_req_info *req)
{ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; conststruct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid;
if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL;
id_priv->id.port_num = req->port;
cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove().
*/
mutex_lock(&lock);
cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
mutex_unlock(&lock);
rdma_restrack_add(&id_priv->res); return 0;
}
mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) {
ret = 0; goto out;
}
/* Need to update QP attributes from default values. */
qp_attr.qp_state = IB_QPS_INIT;
ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out;
ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out;
qp_attr.qp_state = IB_QPS_RTR;
ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out;
if (ipv6_addr_cmp(&src_addr6->sin6_addr,
&dst_addr6->sin6_addr)) return 1;
link_local = ipv6_addr_type(&dst_addr6->sin6_addr) &
IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id !=
dst_addr6->sin6_scope_id) :
0;
}
switch (ib_event->event) { case IB_CM_REQ_RECEIVED:
req->device = req_param->listen_id->device;
req->port = req_param->port;
memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid));
req->has_gid = true;
req->service_id = req_param->primary_path->service_id;
req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey)
pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n",
req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED:
req->device = sidr_param->listen_id->device;
req->port = sidr_param->port;
req->has_gid = false;
req->service_id = sidr_param->service_id;
req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey)
pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n",
sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL;
}
/* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 &&
(ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL));
}
if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) &&
(addr->src_addr.ss_family == AF_IB);
/* * If the request is not for IPv6 link local, allow matching * request to any netdevice of the one or multiport rdma device.
*/ if (!cma_is_req_ipv6_ll(req)) returntrue; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well.
*/ if (net_eq(dev_net(net_dev), addr->dev_addr.net) &&
(!!addr->dev_addr.bound_dev_if ==
(addr->dev_addr.bound_dev_if == net_dev->ifindex))) returntrue; else returnfalse;
}
err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err);
*net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */
*net_dev = NULL;
} else { return ERR_CAST(*net_dev);
}
}
mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net().
*/
rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change.
*/ if (((*net_dev)->flags & IFF_UP) == 0) {
id_priv = ERR_PTR(-EHOSTUNREACH); goto err;
}
/*
 * Cancel whatever asynchronous operation may be outstanding for the given
 * connection-manager state. States other than ADDR_QUERY, ROUTE_QUERY and
 * LISTEN need no cancellation and are ignored.
 */
static void cma_cancel_operation(struct rdma_id_private *id_priv,
				 enum rdma_cm_state state)
{
	if (state == RDMA_CM_ADDR_QUERY) {
		/*
		 * Only RDMA_CM_ADDR_QUERY has a work item that could still
		 * execute, so rdma_addr_cancel() is needed for this state
		 * alone. The addr_handler work may still be exiting outside
		 * this state, but because of its interaction with the
		 * handler_mutex it is guaranteed not to touch id_priv while
		 * doing so.
		 */
		rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
		return;
	}

	if (state == RDMA_CM_ROUTE_QUERY) {
		cma_cancel_route(id_priv);
		return;
	}

	/*
	 * A listener is only cancelled here when it has a wildcard source
	 * address and is not attached to a cma device.
	 */
	if (state == RDMA_CM_LISTEN &&
	    cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
		cma_cancel_listens(id_priv);
}
/*
 * Destroy an ID from within the handler_mutex. This ensures that no other
 * handlers can start running concurrently.
 *
 * The caller must hold id_priv->handler_mutex; it is released by this
 * function before the ID is torn down via _destroy_id().
 */
static void destroy_id_handler_unlock(struct rdma_id_private *id_priv)
	__releases(&id_priv->handler_mutex)
{
	enum rdma_cm_state state;
	unsigned long flags;

	trace_cm_id_destroy(id_priv);

	/*
	 * Setting the state to destroyed under the handler mutex provides a
	 * fence against calling handler callbacks. If this is invoked due to
	 * the failure of a handler callback then it guarantees that no future
	 * handlers will be called.
	 */
	lockdep_assert_held(&id_priv->handler_mutex);

	/* Capture the previous state and switch to DESTROYING atomically. */
	spin_lock_irqsave(&id_priv->lock, flags);
	state = id_priv->state;
	id_priv->state = RDMA_CM_DESTROYING;
	spin_unlock_irqrestore(&id_priv->lock, flags);

	mutex_unlock(&id_priv->handler_mutex);

	/* Tear down using the state the ID was in before DESTROYING. */
	_destroy_id(id_priv, state);
}
mutex_lock(&id_priv->handler_mutex);
state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
state != RDMA_CM_CONNECT) ||
(ib_event->event == IB_CM_TIMEWAIT_EXIT &&
state != RDMA_CM_DISCONNECT)) goto out;
switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR:
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
trace_cm_prepare_mra(id_priv);
ib_prepare_cm_mra(cm_id);
} if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
RDMA_CM_EVENT_ESTABLISHED;
} else {
event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
}
cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED:
event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR:
event.status = -ETIMEDOUT;
fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
RDMA_CM_DISCONNECT)) goto out;
event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT:
event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED:
pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id,
ib_event->param.rej_rcvd.reason));
cma_modify_qp_err(id_priv);
event.status = ib_event->param.rej_rcvd.reason;
event.event = RDMA_CM_EVENT_REJECTED;
event.param.conn.private_data = ib_event->private_data;
event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default:
pr_err("RDMA CMA: unexpected IB CM event: %d\n",
ib_event->event); goto out;
}
ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
destroy_id_handler_unlock(id_priv); return ret;
}
out:
mutex_unlock(&id_priv->handler_mutex); return 0;
}
ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */
conn_id->cm_id.ib = NULL;
mutex_unlock(&listen_id->handler_mutex);
destroy_id_handler_unlock(conn_id); goto net_dev_put;
}
mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out;
switch (iw_event->event) { case IW_CM_EVENT_CLOSE:
event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY:
memcpy(cma_src_addr(id_priv), laddr,
rdma_addr_size(laddr));
memcpy(cma_dst_addr(id_priv), raddr,
rdma_addr_size(raddr)); switch (iw_event->status) { case 0:
event.event = RDMA_CM_EVENT_ESTABLISHED;
event.param.conn.initiator_depth = iw_event->ird;
event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED:
event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT:
event.event = RDMA_CM_EVENT_UNREACHABLE; break; default:
event.event = RDMA_CM_EVENT_CONNECT_ERROR; break;
} break; case IW_CM_EVENT_ESTABLISHED:
event.event = RDMA_CM_EVENT_ESTABLISHED;
event.param.conn.initiator_depth = iw_event->ird;
event.param.conn.responder_resources = iw_event->ord; break; default: goto out;
}
event.status = iw_event->status;
event.param.conn.private_data = iw_event->private_data;
event.param.conn.private_data_len = iw_event->private_data_len;
ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.iw = NULL;
destroy_id_handler_unlock(id_priv); return ret;
}
mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out;
/* Create a new RDMA id for the new IW CM ID */
conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net,
listen_id->id.event_handler,
listen_id->id.context, RDMA_PS_TCP,
IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) {
ret = -ENOMEM; goto out;
}
mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
conn_id->state = RDMA_CM_CONNECT;
ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) {
mutex_unlock(&listen_id->handler_mutex);
destroy_id_handler_unlock(conn_id); return ret;
}
ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) {
mutex_unlock(&listen_id->handler_mutex);
destroy_id_handler_unlock(conn_id); return ret;
}
ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */
conn_id->cm_id.iw = NULL;
mutex_unlock(&listen_id->handler_mutex);
destroy_id_handler_unlock(conn_id); return ret;
}
/**
 * rdma_set_ack_timeout() - Set the ack timeout of QP associated
 *                          with a connection identifier.
 * @id: Communication identifier associated with service type.
 * @timeout: Ack timeout to set on a QP, expressed as 4.096 * 2^(timeout) usec.
 *
 * This function should be called before rdma_connect() on active side,
 * and on passive side before rdma_accept(). It is applicable to primary
 * path only. The timeout will affect the local side of the QP, it is not
 * negotiated with remote side and zero disables the timer. In case it is
 * set before rdma_resolve_route, the value will also be used to determine
 * PacketLifeTime for RoCE.
 *
 * Return: 0 for success
*/ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
{ struct rdma_id_private *id_priv;
if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL;
/**
 * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the
 *			      QP associated with a connection identifier.
 * @id: Communication identifier associated with service type.
 * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK
 *		   Timer Field" in the IBTA specification.
 *
 * This function should be called before rdma_connect() on active
 * side, and on passive side before rdma_accept(). The timer value
 * will be associated with the local QP. When it receives a send it is
 * not ready to handle, typically if the receive queue is empty, an RNR
 * Retry NAK is returned to the requester with the min_rnr_timer
 * encoded. The requester will then wait at least the time specified
 * in the NAK before retrying. The default is zero, which translates
 * to a minimum RNR Timer value of 655 ms.
 *
 * Return: 0 for success
*/ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
{ struct rdma_id_private *id_priv;
/* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL;
if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL;
/* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry.
*/ staticstruct net_device *
cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv)
{ struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsignedlong supported_gids; struct net_device *ndev;
if (!addr->dev_addr.bound_dev_if) return NULL;
ndev = dev_get_by_index(addr->dev_addr.net,
addr->dev_addr.bound_dev_if); if (!ndev) return NULL;
supported_gids = roce_gid_type_mask_support(id_priv->id.device,
id_priv->id.port_num);
gid_type = cma_route_gid_type(addr->dev_addr.network,
supported_gids,
id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network))
gid_type = ib_network_to_gid_type(addr->dev_addr.network);
route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type);
if (is_vlan_dev(dev))
map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); elseif (dev->num_tc)
map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else
map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices.
*/
map->found = true; return 1;
}
staticint iboe_tos_to_sl(struct net_device *ndev, int tos)
{ struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv;
/* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio);
prio_tc_map.input_prio = prio;
priv.data = (void *)&prio_tc_map;
rcu_read_lock();
netdev_walk_all_lower_dev_rcu(ndev,
get_lower_vlan_dev_tc,
&priv);
rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map.
*/ if (prio_tc_map.found) return prio_tc_map.output_tc; elseif (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0;
}
if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */
route->path_rec->hop_limit = addr->dev_addr.hoplimit; else
route->path_rec->hop_limit = 1;
route->path_rec->reversible = 1;
route->path_rec->pkey = cpu_to_be16(0xffff);
route->path_rec->mtu_selector = IB_SA_EQ;
route->path_rec->sl = iboe_tos_to_sl(ndev, tos);
route->path_rec->traffic_class = tos;
route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
route->path_rec->rate_selector = IB_SA_EQ;
route->path_rec->rate = IB_RATE_PORT_CURRENT;
dev_put(ndev);
route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks.
*/
mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout)
route->path_rec->packet_life_time = id_priv->timeout - 1; else
route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
mutex_unlock(&id_priv->qp_mutex);
if (!route->path_rec->mtu) {
ret = -EINVAL; goto err2;
}
if (rdma_protocol_roce_udp_encap(id_priv->id.device,
id_priv->id.port_num))
route->path_rec->flow_label =
cma_get_roce_udp_flow_label(id_priv);
cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num))
ret = cma_resolve_ib_route(id_priv, timeout_ms); elseif (rdma_protocol_roce(id->device, id->port_num)) {
ret = cma_resolve_iboe_route(id_priv); if (!ret)
cma_add_id_to_tree(id_priv);
} elseif (rdma_protocol_iwarp(id->device, id->port_num))
ret = cma_resolve_iw_route(id_priv); else
ret = -ENOSYS;
mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
RDMA_CM_ADDR_RESOLVED)) goto out;
/* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly.
*/
addr = cma_src_addr(id_priv);
memcpy(&old_addr, addr, rdma_addr_size(addr));
memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) {
status = cma_acquire_dev_by_src_ip(id_priv); if (status)
pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
status);
rdma_restrack_add(&id_priv->res);
} elseif (status) {
pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
}
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, rover);
} else {
ret = cma_port_is_unique(bind_list, id_priv); if (!ret)
cma_bind_port(bind_list, id_priv);
} /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed.
*/ if (!ret)
last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret;
} if (--remaining) {
rover++; if ((rover < low) || (rover > high))
rover = low; goto retry;
} return -EADDRNOTAVAIL;
}
/* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening.
*/ staticint cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr)
{ struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr;
bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret)
cma_bind_port(bind_list, id_priv);
} return ret;
}
staticenum rdma_ucm_port_space
cma_select_inet_ps(struct rdma_id_private *id_priv)
{ switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default:
staticint cma_get_port(struct rdma_id_private *id_priv)
{ enum rdma_ucm_port_space ps; int ret;
if (cma_family(id_priv) != AF_IB)
ps = cma_select_inet_ps(id_priv); else
ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT;
mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv)))
ret = cma_alloc_any_port(ps, id_priv); else
ret = cma_use_port(ps, id_priv);
mutex_unlock(&lock);
/* For a well behaved ULP state will be RDMA_CM_IDLE */
ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
RDMA_CM_LISTEN))) return -EINVAL;
}
/* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list.
*/ if (id_priv->reuseaddr) {
mutex_lock(&lock);
ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret)
id_priv->reuseaddr = 0;
mutex_unlock(&lock); if (ret) goto err;
}
id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) {
ret = cma_ib_listen(id_priv); if (ret) goto err;
} elseif (rdma_cap_iw_cm(id->device, 1)) {
ret = cma_iw_listen(id_priv, backlog); if (ret) goto err;
} else {
ret = -ENOSYS; goto err;
}
} else {
ret = cma_listen_on_all(id_priv); if (ret) goto err;
}
return 0;
err:
id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run.
*/
cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret;
}
EXPORT_SYMBOL(rdma_listen);
if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL;
ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1;
memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) {
ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1;
ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1;
}
if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET)
id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) elseif (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net;
/* * If required, resolve the source address for bind and leave the id_priv in * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored.
*/ staticint resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, conststruct sockaddr *dst_addr)
{ int ret;
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */
ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
RDMA_CM_ADDR_QUERY))) return -EINVAL;
ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret;
if (cma_any_addr(dst_addr)) {
ret = cma_resolve_loopback(id_priv);
} else { if (dst_addr->sa_family == AF_IB) {
ret = cma_resolve_ib_addr(id_priv);
} else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip().
*/ if (id_priv->used_resolve_ip)
rdma_addr_cancel(&id->route.addr.dev_addr); else
id_priv->used_resolve_ip = 1;
ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
&id->route.addr.dev_addr,
timeout_ms, addr_handler, false, id_priv);
}
} if (ret) goto err;
mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out;
switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR:
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED:
event.param.ud.private_data = ib_event->private_data;
event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) {
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = ib_event->param.sidr_rep_rcvd.status;
pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n",
event.status); break;
}
ret = cma_set_qkey(id_priv, rep->qkey); if (ret) {
pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret);
event.event = RDMA_CM_EVENT_ADDR_ERROR;
event.status = ret; break;
}
ib_init_ah_attr_from_path(id_priv->id.device,
id_priv->id.port_num,
id_priv->id.route.path_rec,
&event.param.ud.ah_attr,
rep->sgid_attr);
event.param.ud.qp_num = rep->qpn;
event.param.ud.qkey = rep->qkey;
event.event = RDMA_CM_EVENT_ESTABLISHED;
event.status = 0; break; default:
pr_err("RDMA CMA: unexpected IB CM event: %d\n",
ib_event->event); goto out;
}
ret = cma_cm_event_handler(id_priv, &event);
rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
destroy_id_handler_unlock(id_priv); return ret;
}
out:
mutex_unlock(&id_priv->handler_mutex); return 0;
}
/** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback.
*/ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{ struct rdma_id_private *id_priv =
container_of(id, struct rdma_id_private, id); int ret;
if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL;
if (!id->qp) {
id_priv->qp_num = conn_param->qp_num;
id_priv->srq = conn_param->srq;
}
if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD)
ret = cma_resolve_ib_udp(id_priv, conn_param); else
ret = cma_connect_ib(id_priv, conn_param);
} elseif (rdma_cap_iw_cm(id->device, id->port_num)) {
ret = cma_connect_iw(id_priv, conn_param);
} else {
ret = -ENOSYS;
} if (ret) goto err_state; return 0;
err_state:
cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret;
}
EXPORT_SYMBOL(rdma_connect_locked);
/**
 * rdma_connect - Start an active-side connection request.
 * @id: Connection identifier to connect.
 * @conn_param: Connection information used for connected QPs.
 *
 * A route for the rdma_cm_id must already have been resolved with
 * rdma_resolve_route() before this routine is called.
 *
 * Depending on the port space of the rdma_cm_id, the call either connects
 * to a remote QP or, for unconnected ids, obtains remote QP information.
 *
 * This is rdma_connect_locked() executed with the id's handler_mutex held.
 *
 * Return: the result of rdma_connect_locked().
 */
int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
	struct rdma_id_private *priv;
	int rc;

	priv = container_of(id, struct rdma_id_private, id);

	mutex_lock(&priv->handler_mutex);
	rc = rdma_connect_locked(id, conn_param);
	mutex_unlock(&priv->handler_mutex);

	return rc;
}
EXPORT_SYMBOL(rdma_connect);
/** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation.
*/ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece)
{ struct rdma_id_private *id_priv =
container_of(id, struct rdma_id_private, id);
/** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback.
*/ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{ struct rdma_id_private *id_priv =
container_of(id, struct rdma_id_private, id); int ret;
lockdep_assert_held(&id_priv->handler_mutex);
if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL;
/**
 * rdma_notify - Forward an IB event for a connection to the IB CM.
 * @id: Connection identifier the event relates to.
 * @event: IB event type to pass on.
 *
 * The event is only forwarded (through ib_cm_notify()) when the device
 * behind @id reports node type RDMA_NODE_IB_CA; for every other node type
 * the call is a successful no-op.
 *
 * Return: -EINVAL if @id has no cm_id.ib set, 0 for non-IB-CA devices,
 * otherwise the result of ib_cm_notify().
 */
int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
{
	struct rdma_id_private *priv =
		container_of(id, struct rdma_id_private, id);

	if (!priv->cm_id.ib)
		return -EINVAL;

	/* Only IB channel adapters have a CM that wants to hear about this. */
	if (id->device->node_type != RDMA_NODE_IB_CA)
		return 0;

	return ib_cm_notify(priv->cm_id.ib, event);
}
EXPORT_SYMBOL(rdma_notify);
/**
 * rdma_reject - Reject a connection or lookup request.
 * @id: Connection identifier associated with the request.
 * @private_data: Optional private data passed along with the reject message.
 * @private_data_len: Length of @private_data in bytes.
 * @reason: Reject reason code. Only used on the IB CM path for non-UD QPs,
 *          where it is handed to ib_send_cm_rej(); the UD (SIDR) and iWARP
 *          paths do not consume it.
 *
 * Dispatches on the CM capability of the port behind @id:
 *  - IB CM, UD QP:    sends a SIDR reply with status IB_SIDR_REJECT.
 *  - IB CM, other QP: sends a CM REJ carrying @reason.
 *  - iWARP CM:        calls iw_cm_reject().
 *
 * Return: -EINVAL if @id has no cm_id.ib set, -ENOSYS if the port has
 * neither IB CM nor iWARP CM capability, otherwise the result of the
 * transport-specific send routine.
 */
int rdma_reject(struct rdma_cm_id *id, const void *private_data,
		u8 private_data_len, u8 reason)
{
	struct rdma_id_private *id_priv;
	int ret;

	id_priv = container_of(id, struct rdma_id_private, id);
	if (!id_priv->cm_id.ib)
		return -EINVAL;

	if (rdma_cap_ib_cm(id->device, id->port_num)) {
		if (id->qp_type == IB_QPT_UD) {
			ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
						private_data, private_data_len);
		} else {
			trace_cm_send_rej(id_priv);
			ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0,
					     private_data, private_data_len);
		}
	} else if (rdma_cap_iw_cm(id->device, id->port_num)) {
		ret = iw_cm_reject(id_priv->cm_id.iw,
				   private_data, private_data_len);
	} else {
		ret = -ENOSYS;
	}

	return ret;
}
EXPORT_SYMBOL(rdma_reject);
int rdma_disconnect(struct rdma_cm_id *id)
{ struct rdma_id_private *id_priv; int ret;
id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL;
if (rdma_cap_ib_cm(id->device, id->port_num)) {
ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */
trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0))
trace_cm_sent_drep(id_priv);
} else {
trace_cm_sent_dreq(id_priv);
}
} elseif (rdma_cap_iw_cm(id->device, id->port_num)) {
ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
} else
ret = -EINVAL;
if ((dev_addr->bound_dev_if == ndev->ifindex) &&
(net_eq(dev_net(ndev), dev_addr->net)) &&
memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
pr_info("RDMA CM addr change for ndev %s used by id %p\n",
ndev->name, &id_priv->id);
work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM;
mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */
spin_lock_irqsave(&id_priv->lock, flags);
state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) {
spin_unlock_irqrestore(&id_priv->lock, flags);
mutex_unlock(&id_priv->handler_mutex);
cma_id_put(id_priv); return;
}
id_priv->state = RDMA_CM_DEVICE_REMOVAL;
spin_unlock_irqrestore(&id_priv->lock, flags);
if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently
*/
cma_id_put(id_priv);
mutex_unlock(&id_priv->handler_mutex);
trace_cm_id_destroy(id_priv);
_destroy_id(id_priv, state); return;
}
mutex_unlock(&id_priv->handler_mutex);
/* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel.
*/
cma_cancel_operation(id_priv, state);
cma_id_put(id_priv);
}
/* * There is a rare lock ordering dependency in cma_netdev_callback() * that only happens when bonding is enabled. Teach lockdep that rtnl * must never be nested under lock so it can find these without having * to test with bonding.
*/ if (IS_ENABLED(CONFIG_LOCKDEP)) {
rtnl_lock();
mutex_lock(&lock);
mutex_unlock(&lock);
rtnl_unlock();
}
cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM;
ret = register_pernet_subsys(&cma_pernet_operations); if (ret) goto err_wq;
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert. 0.459 Bemerkung:
(vorverarbeitet am 2026-04-28)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.