/*
 * siw_rx_umem()
 *
 * Receive data of @len into target referenced by @dest_addr.
 *
 * @srx:	Receive Context
 * @umem:	siw representation of target memory
 * @dest_addr:	user virtual address
 * @len:	number of bytes to place
 *
 * On the regular exit the function adds the local byte counter 'copied'
 * to srx->skb_copied, subtracts it from srx->skb_new and returns it.
 * The only error return visible here yields -EFAULT.
 *
 * NOTE(review): in this view the loop declares p, pg_off, bytes, rv and
 * dest, but no visible statement assigns them before they are used, and
 * the braces do not balance. Statements appear to be missing between the
 * declarations and the 'return -EFAULT' below -- confirm against the
 * complete file before changing this function.
 */ staticint siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
u64 dest_addr, int len)
{ int copied = 0;
/* Iterate until all @len bytes have been consumed (len -= bytes below). */
while (len) { struct page *p; int pg_off, bytes, rv; void *dest;
return -EFAULT;
/* The MPA CRC is only maintained if srx->mpa_crc_enabled is set. */
} if (srx->mpa_crc_enabled) { if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
/* CRC is taken over the destination mapping before it is unmapped. */
siw_crc_update(&srx->mpa_crc, dest + pg_off,
bytes);
kunmap_atomic(dest);
} else {
kunmap_atomic(dest);
/*
 * Do CRC on original, not target buffer.
 * Some user land applications may
 * concurrently write the target buffer,
 * which would yield a broken CRC.
 * Walking the skb twice is very inefficient.
 * Folding the CRC into skb_copy_bits()
 * would be much better, but is currently
 * not supported.
 */
siw_crc_skb(srx, bytes);
}
} else {
/* No CRC requested: just drop the mapping. */
kunmap_atomic(dest);
}
/* Advance skb read offset, byte accounting and target address. */
srx->skb_offset += bytes;
copied += bytes;
len -= bytes;
dest_addr += bytes;
pg_off = 0;
}
/* Publish the total consumed by this call to the receive stream state. */
srx->skb_copied += copied;
srx->skb_new -= copied;
return copied;
}
staticint siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{ int rv;
/* * siw_rresp_check_ntoh() * * Check incoming RRESP fragment header against expected * header values and update expected values for potential next * fragment. * * NOTE: This function must be called only if a RRESP DDP segment * starts but not for fragmented consecutive pieces of an * already started DDP segment.
*/ staticint siw_rresp_check_ntoh(struct siw_rx_stream *srx, struct siw_rx_fpdu *frx)
{ struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp; struct siw_wqe *wqe = &frx->wqe_active; enum ddp_ecode ecode;
if (frx->first_ddp_seg) {
srx->ddp_stag = wqe->sqe.sge[0].lkey;
srx->ddp_to = wqe->sqe.sge[0].laddr;
frx->pbl_idx = 0;
} /* Below checks extend beyond the semantics of DDP, and * into RDMAP: * We check if the read response matches exactly the * read request which was send to the remote peer to * trigger this read response. RFC5040/5041 do not * always have a proper error code for the detected * error cases. We choose 'base or bounds error' for * cases where the inbound STag is valid, but offset * or length do not match our response receive state.
*/ if (unlikely(srx->ddp_stag != sink_stag)) {
pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
ecode = DDP_ECODE_T_INVALID_STAG; goto error;
} if (unlikely(srx->ddp_to != sink_to)) {
pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
qp_id(rx_qp(srx)), (unsignedlonglong)sink_to,
(unsignedlonglong)srx->ddp_to);
ecode = DDP_ECODE_T_BASE_BOUNDS; goto error;
} if (unlikely(!frx->more_ddp_segs &&
(wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
qp_id(rx_qp(srx)),
wqe->processed + srx->fpdu_part_rem, wqe->bytes);
ecode = DDP_ECODE_T_BASE_BOUNDS; goto error;
} return 0;
error:
siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
DDP_ETYPE_TAGGED_BUF, ecode, 0); return -EINVAL;
}
/* * siw_write_check_ntoh() * * Check incoming WRITE fragment header against expected * header values and update expected values for potential next * fragment * * NOTE: This function must be called only if a WRITE DDP segment * starts but not for fragmented consecutive pieces of an * already started DDP segment.
*/ staticint siw_write_check_ntoh(struct siw_rx_stream *srx, struct siw_rx_fpdu *frx)
{ struct iwarp_rdma_write *write = &srx->hdr.rwrite; enum ddp_ecode ecode;
/* * siw_send_check_ntoh() * * Check incoming SEND fragment header against expected * header values and update expected MSN if no next * fragment expected * * NOTE: This function must be called only if a SEND DDP segment * starts but not for fragmented consecutive pieces of an * already started DDP segment.
*/ staticint siw_send_check_ntoh(struct siw_rx_stream *srx, struct siw_rx_fpdu *frx)
{ struct iwarp_send_inv *send = &srx->hdr.send_inv; struct siw_wqe *wqe = &frx->wqe_active; enum ddp_ecode ecode;
/* * siw_proc_send: * * Process one incoming SEND and place data into memory referenced by * receive wqe. * * Function supports partially received sends (suspending/resuming * current receive wqe processing) * * return value: * 0: reached the end of a DDP segment * -EAGAIN: to be called again to finish the DDP segment
*/ int siw_proc_send(struct siw_qp *qp)
{ struct siw_rx_stream *srx = &qp->rx_stream; struct siw_rx_fpdu *frx = &qp->rx_untagged; struct siw_wqe *wqe;
u32 data_bytes; /* all data bytes available */
u32 rcvd_bytes; /* sum of data bytes rcvd */ int rv = 0;
if (frx->first_ddp_seg) {
wqe = siw_rqe_get(qp); if (unlikely(!wqe)) {
siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
DDP_ETYPE_UNTAGGED_BUF,
DDP_ECODE_UT_INVALID_MSN_NOBUF, 0); return -ENOENT;
}
} else {
wqe = rx_wqe(frx);
} if (srx->state == SIW_GET_DATA_START) {
rv = siw_send_check_ntoh(srx, frx); if (unlikely(rv)) {
siw_qp_event(qp, IB_EVENT_QP_FATAL); return rv;
} if (!srx->fpdu_part_rem) /* zero length SEND */ return 0;
}
data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
rcvd_bytes = 0;
/* A zero length SEND will skip below loop */ while (data_bytes) { struct ib_pd *pd; struct siw_mem **mem, *mem_p; struct siw_sge *sge;
u32 sge_bytes; /* data bytes avail for SGE */
sge = &wqe->rqe.sge[frx->sge_idx];
if (!sge->length) { /* just skip empty sge's */
frx->sge_idx++;
frx->sge_off = 0;
frx->pbl_idx = 0; continue;
}
sge_bytes = min(data_bytes, sge->length - frx->sge_off);
mem = &wqe->mem[frx->sge_idx];
/* * check with QP's PD if no SRQ present, SRQ's PD otherwise
*/
pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
/* * siw_proc_write: * * Place incoming WRITE after referencing and checking target buffer
* Function supports partially received WRITEs (suspending/resuming * current receive processing) * * return value: * 0: reached the end of a DDP segment * -EAGAIN: to be called again to finish the DDP segment
*/ int siw_proc_write(struct siw_qp *qp)
{ struct siw_rx_stream *srx = &qp->rx_stream; struct siw_rx_fpdu *frx = &qp->rx_tagged; struct siw_mem *mem; int bytes, rv;
if (srx->state == SIW_GET_DATA_START) { if (!srx->fpdu_part_rem) /* zero length WRITE */ return 0;
/*
 * siw_proc_rreq:
 *
 * Data-phase handler for an inbound RDMA READ REQUEST.
 * Inbound RREQ's cannot carry user data, so any remaining FPDU
 * payload is a protocol violation.
 *
 * return value:
 *	0:	 no payload pending for this FPDU
 *	-EPROTO: payload bytes are pending (a warning is logged)
 */
int siw_proc_rreq(struct siw_qp *qp)
{
	struct siw_rx_stream *rx = &qp->rx_stream;

	if (rx->fpdu_part_rem) {
		pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
			be16_to_cpu(rx->hdr.ctrl.mpa_len));
		return -EPROTO;
	}

	return 0;
}
/* * siw_init_rresp: * * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE. * Put it at the tail of the IRQ, if there is another WQE currently in * transmit processing. If not, make it the current WQE to be processed * and schedule transmit processing. * * Can be called from softirq context and from process * context (RREAD socket loopback case!) * * return value: * 0: success, * failure code otherwise
*/
/* RRESP now valid as current TX wqe or placed into IRQ */
smp_store_mb(resp->flags, SIW_WQE_VALID);
} else {
error_irq:
pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
qp_id(qp), qp->attrs.irq_size);
/*
 * siw_orqe_start_rx()
 *
 * Only called at start of Read.Response processing.
 * Transfer pending Read from tip of ORQ into current rx wqe,
 * but keep ORQ entry valid until Read.Response processing done.
 * No Queue locking needed.
 *
 * @qp:	queue pair whose current ORQ element is copied into the
 *	tagged receive WQE
 *
 * return value:
 *	0:	 ORQ element copied, rx WQE marked SIW_WR_INPROGRESS
 *	-EPROTO: QP has no ORQ (orq_size == 0), or the current ORQ
 *		 element is not flagged SIW_WQE_VALID
 */
static int siw_orqe_start_rx(struct siw_qp *qp)
{
	struct siw_sqe *orqe;
	struct siw_wqe *wqe;

	if (unlikely(!qp->attrs.orq_size))
		return -EPROTO;

	/* make sure ORQ indices are current */
	smp_mb();

	orqe = orq_get_current(qp);
	if (!(READ_ONCE(orqe->flags) & SIW_WQE_VALID))
		return -EPROTO;

	/* RRESP is a TAGGED RDMAP operation */
	wqe = rx_wqe(&qp->rx_tagged);
	wqe->sqe.id = orqe->id;
	wqe->sqe.opcode = orqe->opcode;
	wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
	wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
	wqe->sqe.sge[0].length = orqe->sge[0].length;
	wqe->sqe.flags = orqe->flags;
	wqe->sqe.num_sge = 1;
	wqe->bytes = orqe->sge[0].length;
	wqe->processed = 0;
	wqe->mem[0] = NULL;

	/* make sure WQE is completely written before valid */
	smp_wmb();
	wqe->wr_status = SIW_WR_INPROGRESS;

	return 0;
}
/* * siw_proc_rresp: * * Place incoming RRESP data into memory referenced by RREQ WQE * which is at the tip of the ORQ * * Function supports partially received RRESP's (suspending/resuming * current receive processing)
*/ int siw_proc_rresp(struct siw_qp *qp)
{ struct siw_rx_stream *srx = &qp->rx_stream; struct siw_rx_fpdu *frx = &qp->rx_tagged; struct siw_wqe *wqe = rx_wqe(frx); struct siw_mem **mem, *mem_p; struct siw_sge *sge; int bytes, rv;
if (frx->first_ddp_seg) { if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
rv = -EPROTO; goto error_term;
} /* * fetch pending RREQ from orq
*/
rv = siw_orqe_start_rx(qp); if (rv) {
pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
qp_id(qp), qp->attrs.orq_size); goto error_term;
}
rv = siw_rresp_check_ntoh(srx, frx); if (unlikely(rv)) {
siw_qp_event(qp, IB_EVENT_QP_FATAL); return rv;
}
} else { if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
qp_id(qp), wqe->wr_status);
rv = -EPROTO; goto error_term;
}
} if (!srx->fpdu_part_rem) /* zero length RRESPONSE */ return 0;
sge = wqe->sqe.sge; /* there is only one */
mem = &wqe->mem[0];
if (!(*mem)) { /* * check target memory which resolves memory on first fragment
*/
rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
wqe->bytes); if (unlikely(rv)) {
siw_dbg_qp(qp, "target mem check: %d\n", rv);
wqe->wc_status = SIW_WC_LOC_PROT_ERR;
if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
be32_to_cpu(term->ddp_msn) !=
qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
be32_to_cpu(term->ddp_mo) != 0) {
pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
be32_to_cpu(term->ddp_mo)); return -ECONNRESET;
} /* * Receive remaining pieces of TERM if indicated
*/ if (!term->flag_m) return -ECONNRESET;
/* Do not take the effort to reassemble a network fragmented * TERM message
*/ if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged)) return -ECONNRESET;
if (srx->pad)
siw_crc_update(&srx->mpa_crc, tbuf, srx->pad); /* * CRC32 is computed, transmitted and received directly in NBO, * so there's never a reason to convert byte order.
*/
siw_crc_final(&srx->mpa_crc, (u8 *)&crc_own);
crc_in = (__force __wsum)srx->trailer.crc;
if (unlikely(crc_in != crc_own)) {
pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
crc_in, crc_own, qp->rx_stream.rdmap_op);
/* * Figure out len of current hdr: variable length of * iwarp hdr may force us to copy hdr information in * two steps. Only tagged DDP messages are already * completely received.
*/ if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) { int hdrlen = iwarp_pktinfo[opcode].hdr_len;
/* * DDP/RDMAP header receive completed. Check if the current * DDP segment starts a new RDMAP message or continues a previously * started RDMAP message. * * Alternating reception of DDP segments (or FPDUs) from incomplete * tagged and untagged RDMAP messages is supported, as long as * the current tagged or untagged message gets eventually completed * w/o intersection from another message of the same type * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, * but not by a READ RESPONSE etc.
*/ if (srx->mpa_crc_enabled) { /* * Restart CRC computation
*/
siw_crc_init(&srx->mpa_crc);
siw_crc_update(&srx->mpa_crc, c_hdr, srx->fpdu_part_rcvd);
} if (frx->more_ddp_segs) {
frx->first_ddp_seg = 0; if (frx->prev_rdmap_op != opcode) {
pr_warn("siw: packet intersection: %u : %u\n",
frx->prev_rdmap_op, opcode); /* * The last inbound RDMA operation of same type * (tagged or untagged) is left unfinished. * To complete it in error, make it the current * operation again, even with the header already * overwritten. For error handling, only the opcode * and current rx context are relevant.
*/
set_rx_fpdu_context(qp, frx->prev_rdmap_op);
__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op); return -EPROTO;
}
} else {
frx->prev_rdmap_op = opcode;
frx->first_ddp_seg = 1;
}
frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
if (qp->tx_ctx.orq_fence) { if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
qp_id(qp), tx_waiting->wr_status);
rv = -EPROTO; goto out;
} /* resume SQ processing, if possible */ if (tx_waiting->sqe.opcode == SIW_OP_READ ||
tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
/* SQ processing was stopped because of a full ORQ */
rreq = orq_get_free(qp); if (unlikely(!rreq)) {
pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
rv = -EPROTO; goto out;
}
siw_read_to_orq(rreq, &tx_waiting->sqe);
} elseif (siw_orq_empty(qp)) { /* * SQ processing was stopped by fenced work request. * Resume since all previous Read's are now completed.
*/
qp->tx_ctx.orq_fence = 0;
resume_tx = 1;
}
}
out:
spin_unlock_irqrestore(&qp->orq_lock, flags);
if (resume_tx)
rv = siw_sq_start(qp);
return rv;
}
/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments, or abort processing after encountering an error case.
 *
 *   o SENDs + RRESPs need completion handling,
 *   o RREQs need READ RESPONSE initialization,
 *   o WRITEs need memory dereferencing.
 *
 * @qp:		queue pair whose current receive FPDU context is completed
 * @error:	non-zero if the message is being completed in error
 *
 * Dispatches on the RDMAP opcode found in the current header. On the way
 * out the receive WQE is always set back to SIW_WR_IDLE and the local
 * status 'rv' is returned.
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 *
 * NOTE(review): in this view the switch body is not brace-balanced and
 * the SEND handling runs directly into what looks like read response
 * completion; lines appear to be missing after the SEND case labels.
 * Confirm against the complete file before changing this function.
 */ staticint siw_rdmap_complete(struct siw_qp *qp, int error)
{ struct siw_rx_stream *srx = &qp->rx_stream; struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu); enum siw_wc_status wc_status = wqe->wc_status;
u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl); int rv = 0;
switch (opcode) { case RDMAP_SEND_SE: case RDMAP_SEND_SE_INVAL:
/* Solicited-event variants tag the receive WQE before sharing the SEND path. */
wqe->rqe.flags |= SIW_WQE_SOLICITED;
fallthrough;
/* An idle WQE has nothing to complete. */
case RDMAP_SEND: case RDMAP_SEND_INVAL: if (wqe->wr_status == SIW_WR_IDLE) break;
/* A status still reading SUCCESS is downgraded to a general error here. */
if (wc_status == SIW_WC_SUCCESS) {
wc_status = SIW_WC_GENERAL_ERR;
error = rv;
}
}
}
/*
 * All errors turn the wqe into signalled.
 */ if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
wc_status);
siw_wqe_put_mem(wqe, SIW_OP_READ);
/* Without error, re-check the TX fence; rv takes that call's result. */
if (!error) {
rv = siw_check_tx_fence(qp);
} else { /* Disable current ORQ element */ if (qp->attrs.orq_size)
WRITE_ONCE(orq_get_current(qp)->flags, 0);
} break;
/* READ REQ: set up the response and advance the expected MSN of the read queue. */
case RDMAP_RDMA_READ_REQ: if (!error) {
rv = siw_init_rresp(qp, srx);
srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
} break;
case RDMAP_RDMA_WRITE: if (wqe->wr_status == SIW_WR_IDLE) break;
/*
 * Free references from memory object if
 * attached to receive context (inbound WRITE).
 * While a zero-length WRITE is allowed,
 * no memory reference got created.
 */ if (rx_mem(&qp->rx_tagged)) {
siw_mem_put(rx_mem(&qp->rx_tagged));
rx_mem(&qp->rx_tagged) = NULL;
} break;
default: break;
}
/* Every path that leaves the switch ends with an idle receive WQE. */
wqe->wr_status = SIW_WR_IDLE;
return rv;
}
/* * siw_tcp_rx_data() * * Main routine to consume inbound TCP payload * * @rd_desc: read descriptor * @skb: socket buffer * @off: offset in skb * @len: skb->len - offset : payload in skb
*/ int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, unsignedint off, size_t len)
{ struct siw_qp *qp = rd_desc->arg.data; struct siw_rx_stream *srx = &qp->rx_stream; int rv;
siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
while (srx->skb_new) { int run_completion = 1;
if (unlikely(srx->rx_suspend)) { /* Do not process any more data */
srx->skb_copied += srx->skb_new; break;
} switch (srx->state) { case SIW_GET_HDR:
rv = siw_get_hdr(srx); if (!rv) {
srx->fpdu_part_rem =
be16_to_cpu(srx->hdr.ctrl.mpa_len) -
srx->fpdu_part_rcvd + MPA_HDR_SIZE;
case SIW_GET_DATA_MORE: /* * Another data fragment of the same DDP segment. * Setting first_ddp_seg = 0 avoids repeating * initializations that shall occur only once per * DDP segment.
*/
qp->rx_fpdu->first_ddp_seg = 0;
fallthrough;
case SIW_GET_DATA_START: /* * Headers will be checked by the opcode-specific * data receive function below.
*/
rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp); if (!rv) { int mpa_len =
be16_to_cpu(srx->hdr.ctrl.mpa_len)
+ MPA_HDR_SIZE;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.