/* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(conststruct mptcp_sock *msk)
{ return READ_ONCE(msk->wnd_end);
}
/* This is the first subflow, always with id 0 */
WRITE_ONCE(subflow->local_id, 0);
mptcp_sock_graft(msk->first, sk->sk_socket);
iput(SOCK_INODE(ssock));
return 0;
}
/* If the MPC handshake is not started, returns the first subflow, * eventually allocating it.
*/ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
{ struct sock *sk = (struct sock *)msk; int ret;
if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL);
if (!msk->first) {
ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret);
}
pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
to->len, MPTCP_SKB_CB(from)->end_seq);
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
/* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non * negative one
*/
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
kfree_skb_partial(from, fragstolen);
pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue)); if (after64(end_seq, max_seq)) { /* out of window */
mptcp_drop(sk, skb);
pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
(unsignedlonglong)end_seq - (unsignedlong)max_seq,
(unsignedlonglong)atomic64_read(&msk->rcv_wnd_sent));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return;
}
p = &msk->out_of_order_queue.rb_node;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
rb_link_node(&skb->rbnode, NULL, p);
rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
msk->ooo_last_skb = skb; goto end;
}
/* with 2 subflows, adding at end of ooo queue is quite likely * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
*/ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); return;
}
/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
parent = &msk->ooo_last_skb->rbnode;
p = &parent->rb_right; goto insert;
}
/* Find place to insert this segment. Handle overlaps on the way. */
parent = NULL; while (*p) {
parent = *p;
skb1 = rb_to_skb(parent); if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
p = &parent->rb_left; continue;
} if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */
mptcp_drop(sk, skb);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); return;
} if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { /* partial overlap: * | skb | * | skb1 | * continue traversing
*/
} else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb.
*/
rb_replace_node(&skb1->rbnode, &skb->rbnode,
&msk->out_of_order_queue);
mptcp_drop(sk, skb1);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); goto merge_right;
}
} elseif (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return;
}
p = &parent->rb_right;
}
merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) break;
rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
mptcp_drop(sk, skb1);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
} /* If there is no skb after us, we are the last_skb ! */ if (!skb1)
msk->ooo_last_skb = skb;
/* try to fetch required memory from subflow */ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop;
}
has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
/* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value
*/
MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
MPTCP_SKB_CB(skb)->offset = offset;
MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
MPTCP_SKB_CB(skb)->cant_coalesce = 0;
if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */
msk->bytes_received += copy_len;
WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) returntrue;
/* old data, keep it simple and drop the whole pkt, sender * will retransmit as needed, if needed.
*/
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
drop:
mptcp_drop(sk, skb); returnfalse;
}
/* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options * at the subflow level and the msk lock was not held, so this * is the first opportunity to act on the DATA_FIN and change * the msk state. * * If we are caught up to the sequence number of the incoming * DATA_FIN, send the DATA_ACK now and do state transition. If * not caught up, do nothing and let the recv code send DATA_ACK * when catching up.
*/
if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
WRITE_ONCE(msk->rcv_data_fin, 0);
WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
switch (sk->sk_state) { case TCP_ESTABLISHED:
mptcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1:
mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2:
mptcp_shutdown_subflows(msk);
mptcp_set_state(sk, TCP_CLOSE); break; default: /* Other states not expected */
WARN_ON_ONCE(1); break;
}
ret = true; if (!__mptcp_check_fallback(msk))
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
} return ret;
}
/* try to move as much data as available */
map_remaining = subflow->map_data_len -
mptcp_subflow_get_map_offset(subflow);
skb = skb_peek(&ssk->sk_receive_queue); if (unlikely(!skb)) break;
if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could * collapse them between the dummy map creation and the * current dequeue. Be sure to adjust the map size.
*/
map_remaining = skb->len;
subflow->map_data_len = skb->len;
}
offset = seq - TCP_SKB_CB(skb)->seq;
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin)
seq++;
if (offset < skb->len) {
size_t len = skb->len - offset;
staticbool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
{ int err = sock_error(ssk); int ssk_state;
if (!err) returnfalse;
/* only propagate errors on fallen-back sockets or * on MPC connect
*/ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) returnfalse;
/* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed.
*/
ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
mptcp_set_state(sk, ssk_state);
WRITE_ONCE(sk->sk_err, -err);
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
smp_wmb();
sk_error_report(sk); returntrue;
}
mptcp_for_each_subflow(msk, subflow) if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) break;
}
/* In most cases we will be able to lock the mptcp socket.  If its already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 *
 * Returns true if any skb has been moved from the subflow @ssk to the
 * msk-level queues.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	bool moved;

	moved = __mptcp_move_skbs_from_subflow(msk, ssk);
	__mptcp_ofo_queue(msk);
	if (unlikely(ssk->sk_err)) {
		/* report the error right now only if the msk socket lock is
		 * not owned, otherwise defer to the release callback
		 */
		if (!sock_owned_by_user(sk))
			__mptcp_error_report(sk);
		else
			__set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
	}

	/* If the moves have caught up with the DATA_FIN sequence number
	 * it's time to ack the DATA_FIN and change socket state, but
	 * this is not a good place to change state. Let the workqueue
	 * do it.
	 */
	if (mptcp_pending_data_fin(sk, NULL))
		mptcp_schedule_work(sk);
	return moved;
}
/* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing * more data to the msk receive queue
*/ if (unlikely(subflow->disposable)) return;
mptcp_data_lock(sk); if (!sock_owned_by_user(sk))
__mptcp_data_ready(sk, ssk); else
__set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
spin_lock_bh(&msk->fallback_lock); if (!msk->allow_subflows) {
spin_unlock_bh(&msk->fallback_lock); returnfalse;
}
mptcp_subflow_joined(msk, ssk);
spin_unlock_bh(&msk->fallback_lock);
/* attach to msk socket only after we are sure we will deal with it * at close time
*/ if (sk->sk_socket && !ssk->sk_socket)
mptcp_sock_graft(ssk, sk->sk_socket);
/* prevent rescheduling on close */ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return;
tout = mptcp_sk(sk)->timer_ival;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
bool mptcp_schedule_work(struct sock *sk)
{ if (inet_sk_state_load(sk) != TCP_CLOSE &&
schedule_work(&mptcp_sk(sk)->work)) { /* each subflow already holds a reference to the sk, and the * workqueue is invoked by a subflow, so sk can't go away here.
*/
sock_hold(sk); returntrue;
} returnfalse;
}
/* can collapse only if MPTCP level sequence is in order and this * mapping has not been xmitted yet
*/ return mpext && mpext->data_seq + mpext->data_len == write_seq &&
!mpext->frozen;
}
/* we can append data to the given data frag if: * - there is space available in the backing page_frag * - the data frag tail matches the current page_frag free offset * - the data frag end sequence number matches the current write seq
*/ staticbool mptcp_frag_can_collapse_to(conststruct mptcp_sock *msk, conststruct page_frag *pfrag, conststruct mptcp_data_frag *df)
{ return df && pfrag->page == df->page &&
pfrag->size - pfrag->offset > 0 &&
pfrag->offset == (df->offset + df->data_len) &&
df->data_seq + df->data_len == msk->write_seq;
}
/* called under both the msk socket lock and the data lock */ staticvoid __mptcp_clean_una(struct sock *sk)
{ struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag;
u64 snd_una;
if (first)
tcp_enter_memory_pressure(ssk);
sk_stream_moderate_sndbuf(ssk);
first = false;
}
__mptcp_sync_sndbuf(sk);
}
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of * data
*/ staticbool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{ if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
pfrag, sk->sk_allocation))) returntrue;
mptcp_enter_memory_pressure(sk); returnfalse;
}
staticstruct mptcp_data_frag *
mptcp_carve_data_frag(conststruct mptcp_sock *msk, struct page_frag *pfrag, int orig_offset)
{ int offset = ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag;
/* note: this always recompute the csum on the whole skb, even * if we just appended a single frag. More status info needed
*/ staticvoid mptcp_update_data_checksum(struct sk_buff *skb, int added)
{ struct mptcp_ext *mpext = mptcp_get_ext(skb);
__wsum csum = ~csum_unfold(mpext->csum); int offset = skb->len - added;
skb = tcp_write_queue_tail(ssk); if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here
*/
mpext = mptcp_get_ext(skb); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
TCP_SKB_CB(skb)->eor = 1;
tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb;
}
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb;
}
i = skb_shinfo(skb)->nr_frags;
reuse_skb = false;
mpext = mptcp_get_ext(skb);
}
/* Zero window and all data acked? Probe. */
copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
/* No need for zero probe if there are any data pending * either at the msk or ssk level; skb is the current write * queue tail and can be empty at this point.
*/ if (snd_una != msk->snd_nxt || skb->len ||
skb != tcp_send_head(ssk)) {
tcp_remove_empty_skb(ssk); return 0;
}
trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue;
tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !backup;
pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */
subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
pace = subflow->avg_pacing_rate; if (!pace) continue;
}
/* pick the best backup if no other subflow is active */ if (!nr_active)
send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
/* According to the blest algorithm, to avoid HoL blocking for the * faster flow, we need to: * - estimate the faster flow linger time * - use the above to estimate the amount of byte transferred * by the faster flow * - check that the amount of queued data is greater than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensure the above. We just need * to check that subflow has a non empty cwin.
*/
ssk = send_info[SSK_MODE_ACTIVE].ssk; if (!ssk || !sk_stream_memory_free(ssk)) return NULL;
/* snd_nxt_new can be smaller than snd_nxt in case mptcp * is recovering after a failover. In that event, this re-sends * old segments. * * Thus compute snd_nxt_new candidate based on * the dfrag->data_seq that was sent and the data * that has been handed to the subflow for transmission * and skip update in case it was old dfrag.
*/ if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
WRITE_ONCE(msk->snd_nxt, snd_nxt_new);
}
}
while ((dfrag = mptcp_send_head(sk))) {
info->sent = dfrag->already_sent;
info->limit = dfrag->data_len;
len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0;
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) {
err = copied ? : ret; goto out;
}
while (mptcp_send_head(sk) && (push_count > 0)) { struct mptcp_subflow_context *subflow; int ret = 0;
if (mptcp_sched_get_send(msk)) break;
push_count = 0;
mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) {
mptcp_subflow_set_scheduled(subflow, false);
prev_ssk = ssk;
ssk = mptcp_subflow_tcp_sock(subflow); if (ssk != prev_ssk) { /* First check. If the ssk has changed since * the last round, release prev_ssk
*/ if (prev_ssk)
mptcp_push_release(prev_ssk, &info);
/* Need to lock the new subflow only if different * from the previous one, otherwise we are still * helding the relevant lock
*/
lock_sock(ssk);
}
/* at this point we held the socket lock for the last subflow we used */ if (ssk)
mptcp_push_release(ssk, &info);
/* ensure the rtx timer is running */ if (!mptcp_rtx_timer_pending(sk))
mptcp_reset_rtx_timer(sk); if (do_check_data_fin)
mptcp_check_send_data_fin(sk);
}
info.flags = 0; while (mptcp_send_head(sk) && keep_pushing) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0;
/* check for a different subflow usage only after * spooling the first chunk of data
*/ if (first) {
mptcp_subflow_set_scheduled(subflow, false);
ret = __subflow_push_pending(sk, ssk, &info);
first = false; if (ret <= 0) break;
copied += ret; continue;
}
if (mptcp_sched_get_send(msk)) goto out;
if (READ_ONCE(subflow->scheduled)) {
mptcp_subflow_set_scheduled(subflow, false);
ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0)
keep_pushing = false;
copied += ret;
}
out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock()
*/ if (copied) {
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal); if (!mptcp_rtx_timer_pending(sk))
mptcp_reset_rtx_timer(sk);
/* on flags based fastopen the mptcp is supposed to create the * first subflow right now. Otherwise we are in the defer_connect * path, and the first subflow must be already present. * Since the defer_connect flag is cleared after the first succsful * fastopen attempt, no need to check for additional subflow status.
*/ if (msg->msg_flags & MSG_FASTOPEN) {
ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk);
} if (!msk->first) return -EINVAL;
/* do the blocking bits of inet_stream_connect outside the ssk socket lock */ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
msg->msg_namelen, msg->msg_flags, 1);
/* Keep the same behaviour of plain TCP: zero the copied bytes in * case of any error, except timeout or signal
*/ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
*copied_syn = 0;
} elseif (ret && ret != -EINPROGRESS) { /* The disconnect() op called by tcp_sendmsg_fastopen()/ * __inet_stream_connect() can fail, due to looking check, * see mptcp_disconnect(). * Attempt it again outside the problematic scope.
*/ if (!mptcp_disconnect(sk, 0)) {
sk->sk_disconnects++;
sk->sk_socket->state = SS_UNCONNECTED;
}
}
inet_clear_bit(DEFER_CONNECT, sk);
/* open-code sk_stream_memory_free() plus sent limit computation to * avoid indirect calls in fast-path. * Called under the msk socket lock, so we can avoid a bunch of ONCE * annotations.
*/ static u32 mptcp_send_limit(conststruct sock *sk)
{ conststruct mptcp_sock *msk = mptcp_sk(sk);
u32 limit, not_sent;
if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return 0;
limit = mptcp_notsent_lowat(sk); if (limit == UINT_MAX) return UINT_MAX;
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
ret = sk_stream_wait_connect(sk, &timeo); if (ret) goto do_error;
}
ret = -EPIPE; if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) goto do_error;
pfrag = sk_page_frag(sk);
while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; bool dfrag_collapsed;
size_t psize, offset;
u32 copy_limit;
/* ensure fitting the notsent_lowat() constraint */
copy_limit = mptcp_send_limit(sk); if (!copy_limit) goto wait_for_memory;
/* reuse tail pfrag, if possible, or carve a new one from the * page allocator
*/
dfrag = mptcp_pending_tail(sk);
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory;
/* we do not bound vs wspace, to allow a single packet. * memory accounting will prevent execessive memory usage * anyway
*/
offset = dfrag->offset + dfrag->data_len;
psize = pfrag->size - offset;
psize = min_t(size_t, psize, msg_data_left(msg));
psize = min_t(size_t, psize, copy_limit);
total_ts = psize + frag_truesize;
if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory;
ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
page_address(dfrag->page) + offset); if (ret) goto do_error;
/* data successfully copied into the write queue */
sk_forward_alloc_add(sk, -total_ts);
copied += psize;
dfrag->data_len += psize;
frag_truesize += psize;
pfrag->offset += frag_truesize;
WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
/* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk
*/
sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) {
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending)
msk->first_pending = dfrag;
}
pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);
continue;
wait_for_memory:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto do_error;
}
if (copied)
__mptcp_push_pending(sk, msg->msg_flags);
/* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can * exceed ssk->sk_rcvbuf).
*/
mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; bool slow;
/* verify we can move any data from the subflow, eventually updating */ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
mptcp_for_each_subflow(msk, subflow)
__mptcp_rcvbuf_update(sk, subflow->tcp_sock);
/* * As an optimization avoid traversing the subflows list * and ev. acquiring the subflow socket lock before baling out
*/ if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break;
subflow = __mptcp_first_ready_from(msk, subflow); if (!subflow) break;
ssk = mptcp_subflow_tcp_sock(subflow);
slowpath = lock_sock_fast(ssk);
ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; if (unlikely(ssk->sk_err))
__mptcp_error_report(sk);
unlock_sock_fast(ssk, slowpath);
subflow = mptcp_next_subflow(msk, subflow);
}
__mptcp_ofo_queue(msk); if (ret)
mptcp_check_data_fin((struct sock *)msk); return ret;
}
if (unlikely(msk->recvmsg_inq))
cmsg_flags = MPTCP_CMSG_INQ;
while (copied < len) { int err, bytes_read;
bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags,
copied, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied)
copied = bytes_read; goto out_err;
}
copied += bytes_read;
if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) continue;
/* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg()
*/ if (copied >= target) break;
if (copied) { if (sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
!timeo ||
signal_pending(current)) break;
} else { if (sk->sk_err) {
copied = sock_error(sk); break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check
*/ if (__mptcp_move_skbs(sk)) continue; break;
}
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN; break;
}
if (!timeo) {
copied = -EAGAIN; break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo); break;
}
}
bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* we need a process context to retransmit */ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags))
mptcp_schedule_work(sk);
} else { /* delegate our work to tcp_release_cb() */
__set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
}
bh_unlock_sock(sk);
sock_put(sk);
}
/* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * * A backup subflow is returned only if that is the only kind available.
*/ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{ struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX;
/* still data outstanding at TCP level? skip this */ if (!tcp_rtx_and_write_queues_empty(ssk)) {
mptcp_pm_subflow_chk_stale(msk, ssk);
min_stale_count = min_t(int, min_stale_count, subflow->stale_count); continue;
}
if (subflow->backup || subflow->request_bkup) { if (!backup)
backup = ssk; continue;
}
if (!pick)
pick = ssk;
}
if (pick) return pick;
/* use backup only if there are no progresses anywhere */ return min_stale_count > 1 ? backup : NULL;
}
/* the closing socket has some data untransmitted and/or unacked: * some data in the mptcp rtx queue has not really xmitted yet. * keep it simple and re-inject the whole mptcp level rtx queue
*/
mptcp_data_lock(sk);
__mptcp_clean_una_wakeup(sk);
rtx_head = mptcp_rtx_head(sk); if (!rtx_head) {
mptcp_data_unlock(sk); returnfalse;
}
/* be sure to clear the "sent status" on all re-injected fragments */
list_for_each_entry(cur, &msk->rtx_queue, list) { if (!cur->already_sent) break;
cur->already_sent = 0;
}
/* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches * TCP_CLOSE state
*/ staticvoid __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, unsignedint flags)
{ if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
(flags & MPTCP_CF_FASTCLOSE)) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail
*/
WARN_ON_ONCE(tcp_disconnect(ssk, 0));
mptcp_subflow_ctx_reset(subflow);
} else {
tcp_shutdown(ssk, SEND_SHUTDOWN);
}
}
/* subflow sockets can be either outgoing (connect) or incoming * (accept). * * Outgoing subflows use in-kernel sockets. * Incoming subflows do not have their own 'struct socket' allocated, * so we need to use tcp_close() after detaching them from the mptcp * parent socket.
*/ staticvoid __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, unsignedint flags)
{ struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false;
/* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is * already deleted by inet_child_forget() and the mptcp socket can't * survive too.
*/ if (msk->in_accept_queue && msk->first == ssk &&
(sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */
sock_set_flag(sk, SOCK_DEAD);
mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
mptcp_subflow_drop_ctx(ssk); goto out_release;
}
dispose_it = msk->free_first || ssk != msk->first; if (dispose_it)
list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_close path * to generate the egress reset
*/
ssk->sk_lingertime = 0;
sock_set_flag(ssk, SOCK_LINGER);
subflow->send_fastclose = 1;
}
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk;
*/ if (!inet_csk(ssk)->icsk_ulp_ops) {
WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD));
kfree_rcu(subflow, rcu);
} else { /* otherwise tcp will dispose of the ssk and subflow ctx */
__tcp_close(ssk, 0);
/* close acquired an extra ref */
__sock_put(ssk);
}
if (ssk == msk->first)
WRITE_ONCE(msk->first, NULL);
out:
__mptcp_sync_sndbuf(sk); if (need_push)
__mptcp_push_pending(sk, 0);
/* Catch every 'all subflows closed' scenario, including peers silently * closing them, e.g. due to timeout. * For established sockets, allow an additional timeout before closing, * as the protocol can still create more subflows.
*/ if (list_is_singular(&msk->conn_list) && msk->first &&
inet_sk_state_load(msk->first) == TCP_CLOSE) { if (sk->sk_state != TCP_ESTABLISHED ||
msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
mptcp_set_state(sk, TCP_CLOSE);
mptcp_close_wake_up(sk);
} else {
mptcp_start_tout_timer(sk);
}
}
}
void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow)
{ /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return;
subflow->close_event_done = true;
if (sk->sk_state == TCP_ESTABLISHED)
mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
/* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow
*/
--> --------------------
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.