/* * siw_qp_prepare_tx() * * Prepare tx state for sending out one fpdu. Builds complete pkt * if no user data or only immediate data are present. * * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
*/ staticint siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
{ struct siw_wqe *wqe = &c_tx->wqe_active; char *crc = NULL; int data = 0;
switch (tx_type(wqe)) { case SIW_OP_READ: case SIW_OP_READ_LOCAL_INV:
memcpy(&c_tx->pkt.ctrl,
&iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, sizeof(struct iwarp_ctrl));
*(u32 *)crc = 0; /* * Do complete CRC if enabled and short packet
*/ if (c_tx->mpa_crc_enabled)
siw_crc_oneshot(&c_tx->pkt, c_tx->ctrl_len, (u8 *)crc);
c_tx->ctrl_len += MPA_CRC_SIZE;
/* * Allow direct sending out of user buffer if WR is non signalled * and payload is over threshold. * Per RDMA verbs, the application should not change the send buffer * until the work completed. In iWarp, work completion is only * local delivery to TCP. TCP may reuse the buffer for * retransmission. Changing unsent data also breaks the CRC, * if applied.
*/ if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
!(tx_flags(wqe) & SIW_WQE_SIGNALLED))
c_tx->use_sendpage = 1; else
c_tx->use_sendpage = 0;
/* * Send out one complete control type FPDU, or header of FPDU carrying * data. Used for fixed sized packets like Read.Requests or zero length * SENDs, WRITEs, READ.Responses, or header only.
*/ staticint siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, int flags)
{ struct kvec iov = { .iov_base =
(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
.iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
int rv = siw_sendmsg(s, flags, &iov, 1, iov.iov_len);
/* * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES. * * Using sendpage to push page by page appears to be less efficient * than using sendmsg, even if data are copied. * * A general performance limitation might be the extra four bytes * trailer checksum segment to be pushed after user data.
*/ staticint siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
size_t size)
{ struct bio_vec bvec; struct msghdr msg = {
.msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES),
}; struct sock *sk = s->sk; int i = 0, rv = 0, sent = 0;
/* * siw_0copy_tx() * * Pushes list of pages to TCP socket. If pages from multiple * SGE's, all referenced pages of each SGE are pushed in one * shot.
*/ staticint siw_0copy_tx(struct socket *s, struct page **page, struct siw_sge *sge, unsignedint offset, unsignedint size)
{ int i = 0, sent = 0, rv; int sge_bytes = min(sge->length - offset, size);
offset = (sge->laddr + offset) & ~PAGE_MASK;
while (sent != size) {
rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); if (rv >= 0) {
sent += rv; if (size == sent || sge_bytes > rv) break;
staticvoid siw_unmap_pages(struct kvec *iov, unsignedlong kmap_mask, int len)
{ int i;
/* * Work backwards through the array to honor the kmap_local_page() * ordering requirements.
*/ for (i = (len-1); i >= 0; i--) { if (kmap_mask & BIT(i)) { unsignedlong addr = (unsignedlong)iov[i].iov_base;
kunmap_local((void *)(addr & PAGE_MASK));
}
}
}
/*
 * siw_tx_hdt() tries to push a complete packet to TCP where all
 * packet fragments are referenced by the elements of one iovec.
 * For the data portion, each involved page must be referenced by
 * one extra element. All sge's data can be non-aligned to page
 * boundaries. Two more elements are referencing iWARP header
 * and trailer:
 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1)) + HDR + TRL
 */
#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
/* * Write out iov referencing hdr, data and trailer of current FPDU. * Update transmit state dependent on write return status
*/ static noinline_for_stack int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
{ struct siw_wqe *wqe = &c_tx->wqe_active; struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; struct kvec iov[MAX_ARRAY]; struct page *page_array[MAX_ARRAY];
if (rv >= (int)data_len) { /* all user data pushed to TCP or no data to push */ if (data_len > 0 && wqe->processed < wqe->bytes) { /* Save the current state for next tx */
c_tx->sge_idx = sge_idx;
c_tx->sge_off = sge_off;
c_tx->pbl_idx = pbl_idx;
}
rv -= data_len;
if (tp->gso_segs) { if (c_tx->gso_seg_limit == 0)
c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; else
c_tx->tcp_seglen =
tp->mss_cache *
min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
} else {
c_tx->tcp_seglen = tp->mss_cache;
} /* Loopback may give odd numbers */
c_tx->tcp_seglen &= 0xfffffff8;
}
/* * siw_prepare_fpdu() * * Prepares transmit context to send out one FPDU if FPDU will contain * user data and user data are not immediate data. * Computes maximum FPDU length to fill up TCP MSS if possible. * * @qp: QP from which to transmit * @wqe: Current WQE causing transmission * * TODO: Take into account real available sendspace on socket * to avoid header misalignment due to send pausing within * fpdu transmission
*/ staticvoid siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
{ struct siw_iwarp_tx *c_tx = &qp->tx_ctx; int data_len;
/* * siw_check_sgl_tx() * * Check permissions for a list of SGE's (SGL). * A successful check will have all memory referenced * for transmission resolved and assigned to the WQE. * * @pd: Protection Domain SGL should belong to * @wqe: WQE to be checked * @perms: requested access permissions *
*/
if (unlikely(num_sge > SIW_MAX_SGE)) return -EINVAL;
for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) { /* * rdma verbs: do not check stag for a zero length sge
*/ if (sge->length) { int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
sge->length);
if (unlikely(rv != E_ACCESS_OK)) return rv;
}
len += sge->length;
} return len;
}
/* * siw_qp_sq_proc_tx() * * Process one WQE which needs transmission on the wire.
*/ staticint siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
{ struct siw_iwarp_tx *c_tx = &qp->tx_ctx; struct socket *s = qp->attrs.sk; int rv = 0, burst_len = qp->tx_ctx.burst; enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
if (unlikely(wqe->wr_status == SIW_WR_IDLE)) return 0;
if (!burst_len)
burst_len = SQ_USER_MAXBURST;
if (wqe->wr_status == SIW_WR_QUEUED) { if (!(wqe->sqe.flags & SIW_WQE_INLINE)) { if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
wqe->sqe.num_sge = 1;
if (tx_type(wqe) != SIW_OP_READ &&
tx_type(wqe) != SIW_OP_READ_LOCAL_INV) { /* * Reference memory to be tx'd w/o checking * access for LOCAL_READ permission, since * not defined in RDMA core.
*/
rv = siw_check_sgl_tx(qp->pd, wqe, 0); if (rv < 0) { if (tx_type(wqe) ==
SIW_OP_READ_RESPONSE)
ecode = siw_rdmap_error(-rv);
rv = -EINVAL; goto tx_error;
}
wqe->bytes = rv;
} else {
wqe->bytes = 0;
}
} else {
wqe->bytes = wqe->sqe.sge[0].length; if (!rdma_is_kernel_res(&qp->base_qp.res)) { if (wqe->bytes > SIW_MAX_INLINE) {
rv = -EINVAL; goto tx_error;
}
wqe->sqe.sge[0].laddr =
(u64)(uintptr_t)&wqe->sqe.sge[1];
}
}
wqe->wr_status = SIW_WR_INPROGRESS;
wqe->processed = 0;
next_segment:
siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
wqe->sqe.id);
if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1) /* * End current TCP segment, if SQ runs empty, * or siw_tcp_nagle is not set, or we bail out * soon due to no burst credit left.
*/
msg_flags = MSG_DONTWAIT; else
msg_flags = MSG_DONTWAIT | MSG_MORE;
/*
 * siw_qp_sq_proc_local()
 *
 * Process a local-only SQ work request which produces no wire
 * traffic: fast memory registration or STag invalidation.
 *
 * @qp:  queue pair owning the work request
 * @wqe: work queue element to be processed
 *
 * Returns the result of the helper on a known opcode, or -EINVAL
 * for any other opcode.
 */
static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
{
	int rv;

	switch (tx_type(wqe)) {
	case SIW_OP_REG_MR:
		rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
		break;

	case SIW_OP_INVAL_STAG:
		rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
		break;

	default:
		rv = -EINVAL;
	}
	return rv;
}
/* * siw_qp_sq_process() * * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more * MPA FPDUs, each containing a DDP segment. * * SQ processing may occur in user context as a result of posting * new WQE's or from siw_tx_thread context. Processing in * user context is limited to non-kernel verbs users. * * SQ processing may get paused anytime, possibly in the middle of a WR * or FPDU, if insufficient send space is available. SQ processing * gets resumed from siw_tx_thread, if send space becomes available again. * * Must be called with the QP state read-locked. * * Note: * An outbound RREQ can be satisfied by the corresponding RRESP * _before_ it gets assigned to the ORQ. This happens regularly * in RDMA READ via loopback case. Since both outbound RREQ and * inbound RRESP can be handled by the same CPU, locking the ORQ * is dead-lock prone and thus not an option. With that, the * RREQ gets assigned to the ORQ _before_ being sent - see * siw_activate_tx() - and pulled back in case of send failure.
*/ int siw_qp_sq_process(struct siw_qp *qp)
{ struct siw_wqe *wqe = tx_wqe(qp); enum siw_opcode tx_type; unsignedlong flags; int rv = 0;
siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
next_wqe: /* * Stop QP processing if SQ state changed
*/ if (unlikely(qp->tx_ctx.tx_suspend)) {
siw_dbg_qp(qp, "tx suspended\n"); goto done;
}
tx_type = tx_type(wqe);
} elseif (rv == -EAGAIN) {
siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
qp->tx_ctx.bytes_unsent);
rv = 0; goto done;
} elseif (rv == -EINPROGRESS) {
rv = siw_sq_start(qp); goto done;
} else { /* * WQE processing failed. * Verbs 8.3.2: * o It turns any WQE into a signalled WQE. * o Local catastrophic error must be surfaced * o QP must be moved into Terminate state: done by code * doing socket state change processing * * o TODO: Termination message must be sent. * o TODO: Implement more precise work completion errors, * see enum ib_wc_status in ib_verbs.h
*/
siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
tx_type(wqe), rv);
spin_lock_irqsave(&qp->sq_lock, flags); /* * RREQ may have already been completed by inbound RRESP!
*/ if ((tx_type == SIW_OP_READ ||
tx_type == SIW_OP_READ_LOCAL_INV) && qp->attrs.orq_size) { /* Cleanup pending entry in ORQ */
qp->orq_put--;
qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
}
spin_unlock_irqrestore(&qp->sq_lock, flags); /* * immediately suspends further TX processing
*/ if (!qp->tx_ctx.tx_suspend)
siw_qp_cm_drop(qp, 0);
switch (tx_type) { case SIW_OP_SEND: case SIW_OP_SEND_REMOTE_INV: case SIW_OP_SEND_WITH_IMM: case SIW_OP_WRITE: case SIW_OP_READ: case SIW_OP_READ_LOCAL_INV:
siw_wqe_put_mem(wqe, tx_type);
fallthrough;
case SIW_OP_INVAL_STAG: case SIW_OP_REG_MR:
siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
SIW_WC_LOC_QP_OP_ERR);
siw_qp_event(qp, IB_EVENT_QP_FATAL);
break;
case SIW_OP_READ_RESPONSE:
siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
active = llist_del_all(&tx_task->active); /* * llist_del_all returns a list with newest entry first. * Re-order list for fairness among QP's.
*/
fifo_list = llist_reverse_order(active); while (fifo_list) {
qp = container_of(fifo_list, struct siw_qp, tx_list);
fifo_list = llist_next(fifo_list);
qp->tx_list.next = NULL;
int siw_sq_start(struct siw_qp *qp)
{ if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) return 0;
if (unlikely(!cpu_online(qp->tx_cpu))) {
siw_put_tx_cpu(qp->tx_cpu);
qp->tx_cpu = siw_get_tx_cpu(qp->sdev); if (qp->tx_cpu < 0) {
pr_warn("siw: no tx cpu available\n");
/*
 * NOTE(review): the text below is extraction residue from a code-listing
 * website and is not part of the driver source. It is preserved here as a
 * comment so the file remains parseable. Translation: "The information on
 * this web page was compiled carefully and to the best of our knowledge.
 * However, neither completeness, correctness, nor quality of the provided
 * information is guaranteed. Note: the colored syntax rendering and the
 * measurement are still experimental."
 *
 * Die Informationen auf dieser Webseite wurden nach bestem Wissen
 * sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit,
 * noch Richtigkeit, noch Qualität der bereit gestellten Informationen
 * zugesichert.
 * Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell.
 */