tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); /* * Check if first frame was successfully processed. * Signal connection full establishment if yes. * Failed data processing would have already scheduled * connection drop.
*/ if (!qp->rx_stream.rx_suspend)
siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
out:
read_unlock(&sk->sk_callback_lock); if (qp)
siw_qp_socket_assoc(cep, qp);
}
/* * siw_qp_cm_drop() * * Drops established LLP connection if present and not already * scheduled for dropping. Called from user context, SQ workqueue * or receive IRQ. Caller signals if socket can be immediately * closed (basically, if not in IRQ).
*/ void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
{ struct siw_cep *cep = qp->cep;
if (schedule) {
siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
} else {
siw_cep_set_inuse(cep);
if (cep->state == SIW_EPSTATE_CLOSED) {
siw_dbg_cep(cep, "already closed\n"); goto out;
}
siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
siw_send_terminate(qp);
if (cep->cm_id) { switch (cep->state) { case SIW_EPSTATE_AWAIT_MPAREP:
siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
-EINVAL); break;
case SIW_EPSTATE_RDMA_MODE:
siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); break;
case SIW_EPSTATE_IDLE: case SIW_EPSTATE_LISTENING: case SIW_EPSTATE_CONNECTING: case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_RECVD_MPAREQ: case SIW_EPSTATE_CLOSED: default: break;
}
siw_free_cm_id(cep);
siw_cep_put(cep);
}
cep->state = SIW_EPSTATE_CLOSED;
/* * Receive MPA Request/Reply header. * * Returns 0 if complete MPA Request/Reply header including * eventual private data was received. Returns -EAGAIN if * header was partially received or negative error code otherwise. * * Context: May be called in process context only
*/ staticint siw_recv_mpa_rr(struct siw_cep *cep)
{ struct mpa_rr *hdr = &cep->mpa.hdr; struct socket *s = cep->sock;
u16 pd_len; int rcvd, to_rcv;
if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) return -EAGAIN;
if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) return -EPROTO;
}
pd_len = be16_to_cpu(hdr->params.pd_len);
/* * At least the MPA Request/Reply header (frame not including * private data) has been received. * Receive (or continue receiving) any private data.
*/
to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
if (!to_rcv) { /* * We must have hdr->params.pd_len == 0 and thus received a * complete MPA Request/Reply frame. * Check against peer protocol violation.
*/
u32 word;
if (rcvd == 0) {
siw_dbg_cep(cep, "peer EOF\n"); return -EPIPE;
} if (rcvd < 0) {
siw_dbg_cep(cep, "error: %d\n", rcvd); return rcvd;
}
siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
return -EPROTO;
}
/* * At this point, we must have hdr->params.pd_len != 0. * A private data buffer gets allocated if hdr->params.pd_len != 0.
*/ if (!cep->mpa.pdata) {
cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); if (!cep->mpa.pdata) return -ENOMEM;
}
rcvd = ksock_recv(
s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
to_rcv + 4, MSG_DONTWAIT);
if (rcvd < 0) return rcvd;
if (rcvd > to_rcv) return -EPROTO;
cep->mpa.bytes_rcvd += rcvd;
if (to_rcv == rcvd) {
siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); return 0;
} return -EAGAIN;
}
/* * siw_proc_mpareq() * * Read MPA Request from socket and signal new connection to IWCM * if success. Caller must hold lock on corresponding listening CEP.
*/ staticint siw_proc_mpareq(struct siw_cep *cep)
{ struct mpa_rr *req; int version, rv;
u16 pd_len;
rv = siw_recv_mpa_rr(cep); if (rv) return rv;
req = &cep->mpa.hdr;
version = __mpa_rr_revision(req->params.bits);
pd_len = be16_to_cpu(req->params.pd_len);
if (version > MPA_REVISION_2) /* allow for 0, 1, and 2 only */ return -EPROTO;
if (memcmp(req->key, MPA_KEY_REQ, 16)) return -EPROTO;
/* Prepare for sending MPA reply */
memcpy(req->key, MPA_KEY_REP, 16);
if (version == MPA_REVISION_2 &&
(req->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * MPA version 2 must signal IRD/ORD values and P2P mode * in private data if header flag MPA_RR_FLAG_ENHANCED * is set.
*/ if (pd_len < sizeof(struct mpa_v2_data)) goto reject_conn;
cep->enhanced_rdma_conn_est = true;
}
/* MPA Markers: currently not supported. Marker TX to be added. */ if (req->params.bits & MPA_RR_FLAG_MARKERS) goto reject_conn;
if (req->params.bits & MPA_RR_FLAG_CRC) { /* * RFC 5044, page 27: CRC MUST be used if peer requests it. * siw specific: 'mpa_crc_strict' parameter to reject * connection with CRC if local CRC off enforced by * 'mpa_crc_strict' module parameter.
*/ if (!mpa_crc_required && mpa_crc_strict) goto reject_conn;
/* Enable CRC if requested by module parameter */ if (mpa_crc_required)
req->params.bits |= MPA_RR_FLAG_CRC;
} if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
/* * Peer requested ORD becomes requested local IRD, * peer requested IRD becomes requested local ORD. * IRD and ORD get limited by global maximum values.
*/
cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
/* May get overwritten by locally negotiated values */
cep->mpa.v2_ctrl.ird = htons(cep->ird);
cep->mpa.v2_ctrl.ord = htons(cep->ord);
/* * Support for peer sent zero length Write or Read to * let local side enter RTS. Writes are preferred. * Sends would require pre-posting a Receive and are * not supported. * Propose zero length Write if none of Read and Write * is indicated.
*/ if (v2->ird & MPA_V2_PEER_TO_PEER) {
cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
return -EINVAL;
} if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2;
if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
!(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * Protocol failure: The responder MUST reply with * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
*/
siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
__mpa_rr_revision(rep->params.bits),
rep->params.bits & MPA_RR_FLAG_ENHANCED ?
1 :
0);
if (ird_insufficient) { /* * If the initiator IRD is insuffient for the * responder ORD, send a TERM.
*/
siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
LLP_ETYPE_MPA,
LLP_ECODE_INSUFFICIENT_IRD, 0);
siw_send_terminate(qp);
rv = -ENOMEM; goto out_err;
} if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
mpa_p2p_mode =
cep->mpa.v2_ctrl_req.ord &
(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
/* * Check if we requested P2P mode, and if peer agrees
*/ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { if ((mpa_p2p_mode & v2->ord) == 0) { /* * We requested RTR mode(s), but the peer * did not pick any mode we support.
*/
siw_dbg_cep(cep, "rtr mode: req %2x, got %2x\n",
mpa_p2p_mode,
v2->ord & (MPA_V2_RDMA_WRITE_RTR |
MPA_V2_RDMA_READ_RTR));
if (cep->state != SIW_EPSTATE_LISTENING) goto error;
new_cep = siw_cep_alloc(cep->sdev); if (!new_cep) goto error;
/* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout.
*/ if (siw_cm_alloc_work(new_cep, 4) != 0) goto error;
/* * Copy saved socket callbacks from listening CEP * and assign new socket with new CEP
*/
new_cep->sk_state_change = cep->sk_state_change;
new_cep->sk_data_ready = cep->sk_data_ready;
new_cep->sk_write_space = cep->sk_write_space;
new_cep->sk_error_report = cep->sk_error_report;
if (siw_tcp_nagle == false)
tcp_sock_set_nodelay(new_s->sk);
new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); if (rv) goto error; /* * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
*/
new_cep->listen_cep = cep;
siw_cep_get(cep);
if (rv != -EAGAIN) {
siw_cep_put(cep->listen_cep);
cep->listen_cep = NULL; if (rv)
siw_cep_put(cep);
}
}
} elseif (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
rv = siw_proc_mpareply(cep);
} else { /* * CEP already moved out of MPA handshake. * any connection management already done. * silently ignore the mpa packet.
*/ if (cep->state == SIW_EPSTATE_RDMA_MODE) {
cep->sock->sk->sk_data_ready(cep->sock->sk);
siw_dbg_cep(cep, "already in RDMA mode");
} else {
siw_dbg_cep(cep, "out of state: %d\n",
cep->state);
}
} if (rv && rv != -EAGAIN)
release_cep = 1; break;
case SIW_CM_WORK_CLOSE_LLP: /* * QP scheduled LLP close
*/ if (cep->qp)
siw_send_terminate(cep->qp);
if (cep->cm_id)
siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
release_cep = 1; break;
case SIW_CM_WORK_PEER_CLOSE: if (cep->cm_id) { if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA reply not received, but connection drop
*/
siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
-ECONNRESET);
} elseif (cep->state == SIW_EPSTATE_RDMA_MODE) { /* * NOTE: IW_CM_EVENT_DISCONNECT is given just * to transition IWCM into CLOSING.
*/
siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
} /* * for other states there is no connection * known to the IWCM.
*/
} else { if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { /* * Wait for the ulp/CM to call accept/reject
*/
siw_dbg_cep(cep, "mpa req recvd, wait for ULP\n");
} elseif (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * Socket close before MPA request received.
*/ if (cep->listen_cep) {
siw_dbg_cep(cep, "no mpareq: drop listener\n");
siw_cep_put(cep->listen_cep);
cep->listen_cep = NULL;
}
}
}
release_cep = 1; break;
case SIW_CM_WORK_MPATIMEOUT:
cep->mpa_timer = NULL;
if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA request timed out: * Hide any partially received private data and signal * timeout
*/
cep->mpa.hdr.params.pd_len = 0;
if (cep->cm_id)
siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
-ETIMEDOUT);
release_cep = 1;
} elseif (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * No MPA request received after peer TCP stream setup.
*/ if (cep->listen_cep) {
siw_cep_put(cep->listen_cep);
cep->listen_cep = NULL;
}
release_cep = 1;
} break;
default:
WARN(1, "Undefined CM work type: %d\n", work->type);
} if (release_cep) {
siw_dbg_cep(cep, "release: timer=%s, QP[%u]\n",
cep->mpa_timer ? "y" : "n",
cep->qp ? qp_id(cep->qp) : UINT_MAX);
siw_cancel_mpatimer(cep);
cep->state = SIW_EPSTATE_CLOSED;
if (cep->qp) { struct siw_qp *qp = cep->qp; /* * Serialize a potential race with application * closing the QP and calling siw_qp_cm_drop()
*/
siw_qp_get(qp);
siw_cep_set_free(cep);
switch (sk->sk_state) { case TCP_ESTABLISHED: /* * handle accepting socket as special case where only * new connection is possible
*/
siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); break;
case TCP_CLOSE: case TCP_CLOSE_WAIT: if (cep->qp)
cep->qp->tx_ctx.tx_suspend = 1;
siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); break;
/* * Respect any iwarp port mapping: Use mapped remote address * if valid. Local address must not be mapped, since siw * uses kernel TCP stack.
*/ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
to_sockaddr_in6(id->remote_addr).sin6_port != 0)
raddr = (struct sockaddr *)&id->m_remote_addr;
qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) {
WARN(1, "[QP %u] does not exist\n", params->qpn);
rv = -EINVAL; goto error;
}
siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
raddr);
/* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout.
*/
rv = siw_cm_alloc_work(cep, 4); if (rv != 0) {
rv = -ENOMEM; goto error;
}
cep->ird = params->ird;
cep->ord = params->ord;
if (p2p_mode && cep->ord == 0)
cep->ord = 1;
cep->state = SIW_EPSTATE_CONNECTING;
/* * Associate CEP with socket
*/
siw_cep_socket_assoc(cep, s);
cep->state = SIW_EPSTATE_AWAIT_MPAREP;
/* * Set MPA Request bits: CRC if required, no MPA Markers, * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
*/
cep->mpa.hdr.params.bits = 0; if (version > MPA_REVISION_2) {
pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
version = MPA_REVISION_2; /* Adjust also module parameter */
mpa_version = MPA_REVISION_2;
}
__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
if (try_gso)
cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
if (mpa_crc_required)
cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
/* * If MPA version == 2: * o Include ORD and IRD. * o Indicate peer-to-peer mode, if required by module * parameter 'peer_to_peer'.
*/ if (version == MPA_REVISION_2) {
cep->enhanced_rdma_conn_est = true;
cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
if (cep) {
siw_socket_disassoc(s);
sock_release(s);
cep->sock = NULL;
cep->qp = NULL;
cep->cm_id = NULL;
id->rem_ref(id);
qp->cep = NULL;
siw_cep_put(cep);
cep->state = SIW_EPSTATE_CLOSED;
siw_cep_set_free_and_put(cep);
} elseif (s) {
sock_release(s);
} if (qp)
siw_qp_put(qp);
return rv;
}
/* * siw_accept - Let SoftiWARP accept an RDMA connection request * * @id: New connection management id to be used for accepted * connection request * @params: Connection parameters provided by ULP for accepting connection * * Transition QP to RTS state, associate new CM id @id with accepted CEP * and get prepared for TCP input by installing socket callbacks. * Then send MPA Reply and generate the "connection established" event. * Socket callbacks must be installed before sending MPA Reply, because * the latter may cause a first RDMA message to arrive from the RDMA Initiator * side very quickly, at which time the socket callbacks must be ready.
*/ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
{ struct siw_device *sdev = to_siw_dev(id->device); struct siw_cep *cep = (struct siw_cep *)id->provider_data; struct siw_qp *qp; struct siw_qp_attrs qp_attrs; int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; bool wait_for_peer_rts = false;
siw_cep_set_inuse(cep);
siw_cep_put(cep);
/* Free lingering inbound private data */ if (cep->mpa.hdr.params.pd_len) {
cep->mpa.hdr.params.pd_len = 0;
kfree(cep->mpa.pdata);
cep->mpa.pdata = NULL;
}
siw_cancel_mpatimer(cep);
if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
siw_dbg_cep(cep, "out of state\n");
rv = -ECONNRESET; goto free_cep;
}
qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) {
WARN(1, "[QP %d] does not exist\n", params->qpn); goto free_cep;
}
down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) goto error_unlock;
siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
/* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. * * We currently use id->provider_data in three different ways: * * o For a listener's IWCM id, id->provider_data points to * the list_head of the list of listening CEPs. * Uses: siw_create_listen(), siw_destroy_listen() * * o For each accepted passive-side IWCM id, id->provider_data * points to the CEP itself. This is a consequence of * - siw_cm_upcall() setting event.provider_data = cep and * - the IWCM's cm_conn_req_handler() setting provider_data of the * new passive-side IWCM id equal to event.provider_data * Uses: siw_accept(), siw_reject() * * o For an active-side IWCM id, id->provider_data is not used at all. *
*/ if (!id->provider_data) {
id->provider_data =
kmalloc(sizeof(struct list_head), GFP_KERNEL); if (!id->provider_data) {
rv = -ENOMEM; goto error;
}
INIT_LIST_HEAD((struct list_head *)id->provider_data);
}
list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
cep->state = SIW_EPSTATE_LISTENING;
dev_put(ndev);
siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
/* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP.
*/
list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
list_del(p);
siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
/*
 * NOTE(review): the following disclaimer is website boilerplate that leaked
 * into the source during extraction and is not part of the driver. It is
 * preserved here, translated from German and wrapped as a comment so it no
 * longer breaks compilation:
 *
 * "The information on this web page was carefully compiled to the best of
 *  our knowledge. However, neither completeness, nor correctness, nor the
 *  quality of the information provided is guaranteed.
 *  Note: the colored syntax rendering and the measurement are still
 *  experimental."
 */