#ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space.
*/ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EINVAL;
} #endif
subflow = mptcp_subflow_ctx(ssk);
__mptcp_propagate_sndbuf(sk, ssk); if (!msk->rcvspace_init)
mptcp_rcv_space_init(msk, ssk);
if (sk->sk_state == TCP_SYN_SENT) { /* subflow->idsn is always available is TCP_SYN_SENT state, * even for the FASTOPEN scenarios
*/
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
mptcp_set_state(sk, state);
sk->sk_state_change(sk);
}
}
staticvoid subflow_set_remote_key(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, conststruct mptcp_options_received *mp_opt)
{ /* active MPC subflow will reach here multiple times: * at subflow_finish_connect() time and at 4th ack time
*/ if (subflow->remote_key_valid) return;
if (subflow->backup)
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX);
if (subflow_use_different_dport(msk, sk)) {
pr_debug("synack inet_dport=%d %d\n",
ntohs(inet_sk(sk)->inet_dport),
ntohs(inet_sk(parent)->inet_dport));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
}
} elseif (mptcp_check_fallback(sk)) { /* It looks like MPTCP is blocked, while TCP is not */ if (subflow->mpc_drop)
mptcp_active_disable(parent);
fallback:
mptcp_propagate_state(parent, sk, subflow, NULL);
} return;
/* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options.
*/
mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions &
(OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK)))
fallback = true;
if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
tcp_rsk(req)->drop_req = false;
/* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status.
*/ if (!ctx || fallback) { if (fallback_is_fatal) {
subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child;
} goto fallback;
}
/* ssk inherits options of listener sk */
ctx->setsockopt_seq = listener->setsockopt_seq;
if (ctx->mp_capable) {
ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback;
ctx->subflow_id = 1;
owner = mptcp_sk(ctx->conn);
if (mp_opt.deny_join_id0)
WRITE_ONCE(owner->pm.remote_deny_join_id0, true);
mptcp_pm_new_connection(owner, child, 1);
/* with OoO packets we can reach here without ingress * mpc option
*/ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
mptcp_pm_fully_established(owner, child);
ctx->pm_notified = 1;
}
} elseif (ctx->mp_join) {
owner = subflow_req->msk; if (!owner) {
subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child;
}
if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported.
*/
dbg_bad_map(subflow, ssn); returnfalse;
} if (unlikely(!before(ssn, subflow->map_subflow_seq +
subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */
dbg_bad_map(subflow, ssn); returnfalse;
} returntrue;
}
/* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK;
/* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once
*/
delta = subflow->map_data_len - subflow->map_csum_len; for (;;) {
seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
offset = seq - TCP_SKB_CB(skb)->seq;
/* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS
*/ if (offset < skb->len) {
__wsum csum;
if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in
*/ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
/* not enough data to validate the csum */ return MAPPING_EMPTY;
}
/* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb
*/
skb = skb->next;
}
/* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment due to the data fin, * while the pseudo header requires the original DSS data len, * including that
*/
csum = __mptcp_make_csum(subflow->map_seq,
subflow->map_subflow_seq,
subflow->map_data_len + subflow->map_data_fin,
subflow->map_data_csum); if (unlikely(csum)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); return MAPPING_BAD_CSUM;
}
skb = skb_peek(&ssk->sk_receive_queue); if (!skb) return MAPPING_EMPTY;
if (mptcp_check_fallback(ssk)) return MAPPING_DUMMY;
mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { /* the TCP stack deliver 0 len FIN pkt to the receive * queue, that is the only 0len pkts ever expected here, * and we can admit no mapping only for 0 len pkts
*/ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
WARN_ONCE(1, "0len seq %d:%d flags %x",
TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq,
TCP_SKB_CB(skb)->tcp_flags);
sk_eat_skb(ssk, skb); return MAPPING_EMPTY;
}
/* If the required DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) return MAPPING_NODSS;
if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
mpext->dsn64);
pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping * has been fully consumed. Continue * handling the existing mapping.
*/
skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK;
}
if (updated)
mptcp_schedule_work((struct sock *)msk);
return MAPPING_DATA_FIN;
}
data_fin_seq = mpext->data_seq + data_len - 1;
/* If mpext->data_seq is a 32-bit value, data_fin_seq must also * be limited to 32 bits.
*/ if (!mpext->dsn64)
data_fin_seq &= GENMASK_ULL(31, 0);
mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n",
data_fin_seq, mpext->dsn64);
/* Adjust for DATA_FIN using 1 byte of sequence space */
data_len--;
}
if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq &&
subflow->map_subflow_seq == mpext->subflow_seq &&
subflow->map_data_len == data_len &&
subflow->map_csum_reqd == mpext->csum_reqd) {
skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum;
}
/* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported
*/ if (skb_is_fully_mapped(ssk, skb)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID;
}
/* will validate the next map after consuming the current one */ goto validate_csum;
}
validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping
*/ if (!validate_mapping(ssk, skb)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID;
}
/* sched mptcp worker for subflow cleanup if no more data is pending */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;

	/* data still queued on the subflow: nothing to clean up yet */
	if (!skb_queue_empty(&ssk->sk_receive_queue))
		return;

	/* schedule the close-subflow work at most once per msk; the bit is
	 * cleared by the worker itself
	 */
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		mptcp_schedule_work(sk);

	/* when the fallback subflow closes the rx side, trigger a 'dummy'
	 * ingress data fin, so that the msk state will follow along
	 */
	if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) &&
	    msk->first == ssk &&
	    mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
		mptcp_schedule_work(sk);
}
/* we are really failing, prevent any later subflow join */
spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) {
spin_unlock_bh(&msk->fallback_lock); returnfalse;
}
msk->allow_subflows = false;
spin_unlock_bh(&msk->fallback_lock);
/* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) returnfalse;
/* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set
*/ if (sock_flag((struct sock *)msk, SOCK_DEAD)) returntrue;
/* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all;
*/
fail_tout = jiffies + TCP_RTO_MAX; if (!fail_tout)
fail_tout = 1;
WRITE_ONCE(subflow->fail_tout, fail_tout);
tcp_send_ack(ssk);
if (!skb_peek(&ssk->sk_receive_queue))
WRITE_ONCE(subflow->data_avail, false); if (subflow->data_avail) returntrue;
msk = mptcp_sk(subflow->conn); for (;;) {
u64 ack_seq;
u64 old_ack;
status = get_mapping_status(ssk, msk);
trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback;
if (status != MAPPING_OK) goto no_data;
skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) goto no_data;
if (unlikely(!READ_ONCE(msk->can_ack))) goto fallback;
/* check if current mapping is still valid */ if (subflow->map_valid &&
mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
subflow->map_valid = 0;
WRITE_ONCE(subflow->data_avail, false);
pr_debug("Done with mapping: seq=%u data_len=%u\n",
subflow->map_subflow_seq,
subflow->map_data_len);
}
return subflow_check_data_avail(sk);
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue.
*/ void mptcp_space(conststruct sock *ssk, int *space, int *full_space)
{ conststruct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); conststruct sock *sk = subflow->conn;
/* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock
*/ if (!sk->sk_socket) return;
mptcp_data_lock(sk); if (!sock_owned_by_user(sk))
__mptcp_error_report(sk); else
__set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
ssk = sf->sk;
subflow = mptcp_subflow_ctx(ssk); do {
get_random_bytes(&subflow->local_nonce, sizeof(u32));
} while (!subflow->local_nonce);
/* if 'IPADDRANY', the ID will be set later, after the routing */ if (local->addr.family == AF_INET) { if (!local->addr.addr.s_addr)
local_id = -1; #if IS_ENABLED(CONFIG_MPTCP_IPV6)
} elseif (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&local->addr.addr6))
local_id = -1; #endif
}
if (local_id >= 0)
subflow_set_local_id(subflow, local_id);
err_out: /* we account subflows before the creation, and this failures will not * be caught by sk_state_change()
*/
mptcp_pm_close_subflow(msk); return err;
}
/* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
cgroup_id(sock_cgroup_ptr(child_skcd))) {
cgroup_sk_free(child_skcd);
*child_skcd = *parent_skcd;
cgroup_sk_clone(child_skcd);
} #endif/* CONFIG_SOCK_CGROUP_DATA */
if (mem_cgroup_sockets_enabled)
mem_cgroup_sk_inherit(parent, child);
}
int mptcp_subflow_create_socket(struct sock *sk, unsignedshort family, struct socket **new_sock)
{ struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err;
/* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later
*/ if (unlikely(!sk->sk_socket)) return -EINVAL;
err = security_mptcp_add_subflow(sk, sf->sk); if (err) goto err_free;
/* the newly created socket has to be in the same cgroup as its parent */
mptcp_attach_cgroup(sk, sf->sk);
/* kernel sockets do not by default acquire net ref, but TCP timer * needs it. * Update ns_tracker to current stack trace and refcounted tracker.
*/
sk_net_refcnt_upgrade(sf->sk);
err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free;
/* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct * user.
*/
SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
/* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here.
*/ if (mptcp_subflow_data_available(sk))
mptcp_data_ready(parent, sk); elseif (unlikely(sk->sk_err))
subflow_error_report(sk);
/* Due to lock dependencies no relevant lock can be acquired under rskq_lock. * Splice the req list, so that accept() can not reach the pending ssk after * the listener socket is released below.
*/
spin_lock_bh(&queue->rskq_lock);
head = queue->rskq_accept_head;
tail = queue->rskq_accept_tail;
queue->rskq_accept_head = NULL;
queue->rskq_accept_tail = NULL;
spin_unlock_bh(&queue->rskq_lock);
if (!head) return;
/* can't acquire the msk socket lock under the subflow one, * or will cause ABBA deadlock
*/
release_sock(listener_ssk);
for (req = head; req; req = req->dl_next) {
ssk = req->sk; if (!sk_is_mptcp(ssk)) continue;
subflow = mptcp_subflow_ctx(ssk); if (!subflow || !subflow->conn) continue;
/* lockdep will report a false positive ABBA deadlock * between cancel_work_sync and the listener socket. * The involved locks belong to different sockets WRT * the existing AB chain. * Using a per socket key is problematic as key * deregistration requires process context and must be * performed at socket disposal time, in atomic * context. * Just tell lockdep to consider the listener socket * released here.
*/
mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
mptcp_cancel_work(sk);
mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);
sock_put(sk);
}
/* we are still under the listener msk socket lock */
lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
/* restore the listener queue, to let the TCP code clean it up */
spin_lock_bh(&queue->rskq_lock);
WARN_ON_ONCE(queue->rskq_accept_head);
queue->rskq_accept_head = head;
queue->rskq_accept_tail = tail;
spin_unlock_bh(&queue->rskq_lock);
}
sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx * alive, will be freed by __mptcp_close_ssk(), * when the subflow is still unaccepted
*/
release = ctx->disposable || list_empty(&ctx->node);
/* inet_child_forget() does not call sk_state_change(), * explicitly trigger the socket close machinery
*/ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
&mptcp_sk(sk)->flags))
mptcp_schedule_work(sk);
sock_put(sk);
}
mptcp_subflow_ops_undo_override(ssk); if (release)
kfree_rcu(ctx, rcu);
}
if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection * is fully established only after we receive the remote key
*/
new_ctx->mp_capable = 1;
new_ctx->local_key = subflow_req->local_key;
new_ctx->token = subflow_req->token;
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->idsn = subflow_req->idsn;
/* this is the first subflow, id is always 0 */
subflow_set_local_id(new_ctx, 0);
} elseif (subflow_req->mp_join) {
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
WRITE_ONCE(new_ctx->fully_established, true);
new_ctx->remote_key_valid = 1;
new_ctx->backup = subflow_req->backup;
new_ctx->request_bkup = subflow_req->request_bkup;
WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
new_ctx->token = subflow_req->token;
new_ctx->thmac = subflow_req->thmac;
/* the subflow req id is valid, fetched via subflow_check_req() * and subflow_token_join_request()
*/
subflow_set_local_id(new_ctx, subflow_req->local_id);
}
}
/* process and clear all the pending actions, but leave the subflow into * the napi queue. To respect locking, only the same CPU that originated * the action can touch the list. mptcp_napi_poll will take care of it.
*/
status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); if (status)
mptcp_subflow_process_delegated(ssk, status);
tcp_release_cb(ssk);
}
staticint tcp_abort_override(struct sock *ssk, int err)
{ /* closing a listener subflow requires a great deal of care. * keep it simple and just prevent such operation
*/ if (inet_sk_state_load(ssk) == TCP_LISTEN) return -EINVAL;
#if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer * the case.
*/
BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.