// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed where wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. poll
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *					escape still
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly	:	ack < copied bug fix.
 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick :	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick :	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					completely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle poll() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), poll() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in polling before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFC's for other useful protocol
 *					references see: Comer, KA9Q NOS, and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	poll()->select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
 *	Willy Konynenberg	:	Transparent proxying support.
 *	Mike McLagan		:	Routing by source
 *		Keith Owens	:	Do proper merging with partial SKB's in
 *					tcp_do_sendmsg to avoid burstiness.
 *		Eric Schenk	:	Fix fast close down bug with
 *					shutdown() followed by close().
 *		Andi Kleen	:	Make poll agree with SIGIO
 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
 *					lingertime == 0 (RFC 793 ABORT Call)
 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
 *					csum_and_copy_from_user() if possible.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */
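/* A hedged user-space sketch (standard sockets API, not part of this file):
 * the states described above are visible to applications through TCP_INFO,
 * whose tcpi_state field carries the same TCP_* state values.
 */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_tcp_state(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	/* tcpi_state: 1 == TCP_ESTABLISHED, 4 == TCP_FIN_WAIT1, ... */
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("tcpi_state = %u\n", info.tcpi_state);
}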
/*
 *	Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}
/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}
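/* A hedged illustration (user-space, not kernel code) of the backoff sum
 * these two helpers invert: each retransmission doubles the timeout until
 * it saturates at rto_max, and the period is the running total. The values
 * below assume the conventional 1 s initial RTO and 120 s cap; they are
 * stated as assumptions, not read from this file.
 */
#include <stdio.h>

static int retrans_to_secs_demo(unsigned char retrans, int timeout,
				int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;		/* exponential backoff */
			if (timeout > rto_max)	/* saturate at the cap */
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

int main(void)
{
	/* 8 retransmits, 1 s initial timeout, 120 s cap:
	 * 1+2+4+8+16+32+64+120 = 247 seconds total. */
	printf("%d\n", retrans_to_secs_demo(8, 1, 120));
	return 0;
}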
/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int rto_min_us, rto_max_ms;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;
	tp->rate_app_limited = 1;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
	if (tcp_epollin_ready(sk, target))
		return true;
	return sk_is_readable(sk);
}
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	u8 shutdown;
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);
	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;

	/*
	 * EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that EPOLLHUP is incompatible
	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1.  EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);
		u16 urg_data = READ_ONCE(tp->urg_data);

		if (unlikely(urg_data) &&
		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE))
			target++;

		if (tcp_stream_is_readable(sk, target))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!(shutdown & SEND_SHUTDOWN)) {
			if (__sk_stream_is_writeable(sk, 1)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (__sk_stream_is_writeable(sk, 1))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;
		if (urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT &&
		   inet_test_bit(DEFER_CONNECT, sk)) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	/* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
	smp_rmb();
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);
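/* A hedged user-space sketch (POSIX API, not part of this file) of how the
 * mask computed above surfaces to applications: a non-blocking connect()
 * becomes POLLOUT once the handshake completes, matching the SYN_SENT
 * handling in tcp_poll(). The loopback address and port are assumptions.
 */
#include <arpa/inet.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(80),	/* assumed reachable port */
	};
	struct pollfd pfd;
	int fd;

	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
	fd = socket(AF_INET, SOCK_STREAM, 0);
	fcntl(fd, F_SETFL, O_NONBLOCK);

	/* Non-blocking connect: poll reports POLLOUT once the three-way
	 * handshake completes, POLLERR/POLLHUP if it fails. */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT | POLLPRI;
	if (poll(&pfd, 1, 5000) > 0) {
		if (pfd.revents & POLLOUT)
			puts("writable: connection established");
		if (pfd.revents & POLLIN)
			puts("readable");
		if (pfd.revents & POLLPRI)
			puts("urgent data pending");
		if (pfd.revents & (POLLERR | POLLHUP))
			puts("error or hangup");
	}
	close(fd);
	return 0;
}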
int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = READ_ONCE(tp->urg_data) &&
		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;
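/* A hedged user-space sketch (Linux-specific ioctls from linux/sockios.h,
 * not part of this file): the SIOCINQ/SIOCOUTQ cases above answer these
 * queries for a connected TCP socket.
 */
#include <linux/sockios.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void print_tcp_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0)	/* unread bytes in receive queue */
		printf("SIOCINQ:  %d bytes\n", inq);
	if (ioctl(fd, SIOCOUTQ, &outq) == 0)	/* unsent + unacked bytes in send queue */
		printf("SIOCOUTQ: %d bytes\n", outq);
}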
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}
/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
	       tcp_skb_can_collapse_to(skb);
}
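/* A hedged user-space sketch (standard send(2) flags, not part of this
 * file): MSG_MORE keeps tcp_push() below from marking a push, giving the
 * kernel a chance to coalesce both pieces into one segment; autocorking
 * itself is gated by the net.ipv4.tcp_autocorking sysctl checked above.
 */
#include <string.h>
#include <sys/socket.h>

static ssize_t send_two_parts(int fd, const char *hdr, const char *body)
{
	/* Hold back the not-yet-full skb: more payload follows shortly. */
	if (send(fd, hdr, strlen(hdr), MSG_MORE) < 0)
		return -1;
	return send(fd, body, strlen(body), 0);	/* final push */
}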
void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
			smp_mb__after_atomic();
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}
/**
 * tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			ret = sk_wait_data(sk, &timeo, NULL);
			if (ret < 0)
				break;
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!tss.len || !timeo)
			break;
		release_sock(sk);
		lock_sock(sk);
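/* A hedged user-space sketch (Linux splice(2), not part of this file):
 * the call below is the path that reaches tcp_splice_read(), moving socket
 * payload into a pipe without a round trip through user memory.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t splice_from_tcp(int tcp_fd, size_t want)
{
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0)
		return -1;
	/* NULL offsets: sockets and pipes are not seekable (-ESPIPE above) */
	n = splice(tcp_fd, NULL, pfd[1], NULL, want, SPLICE_F_MOVE);
	close(pfd[0]);
	close(pfd[1]);
	return n;
}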
/* In some cases, sendmsg() could have added an skb to the write queue,
 * but failed adding payload on it. We need to remove it to consume less
 * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
 * epoll() users. Another reason is that tcp_write_xmit() does not like
 * finding an empty skb in the write queue.
 */
void tcp_remove_empty_skb(struct sock *sk)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		tcp_unlink_write_queue(skb, sk);
		if (tcp_write_queue_empty(sk))
			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
		tcp_wmem_free_skb(sk, skb);
	}
}
/* skb changing from pure zc to mixed, must charge zc */
static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
	if (unlikely(skb_zcopy_pure(skb))) {
		u32 extra = skb->truesize -
			    SKB_TRUESIZE(skb_end_offset(skb));
int tcp_wmem_schedule(struct sock *sk, int copy)
{
	int left;

	if (likely(sk_wmem_schedule(sk, copy)))
		return copy;

	/* We could be in trouble if we have nothing queued.
	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
	 * to guarantee some progress.
	 */
	left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) -
	       sk->sk_wmem_queued;
	if (left > 0)
		sk_forced_mem_schedule(sk, min(left, copy));
	return min(copy, sk->sk_forward_alloc);
}
	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
	      TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY; /* Another Fast Open is in progress */

	if (inet_test_bit(DEFER_CONNECT, sk)) {
		err = tcp_connect(sk);
		/* Same failure procedure as in tcp_v4/6_connect */
		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, uaddr,
				    msg->msg_namelen, flags, 1);
	/* fastopen_req could already be freed in __inet_stream_connect
	 * if the connection times out or gets rst
	 */
	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet_clear_bit(DEFER_CONNECT, sk);
	}
	return err;
}
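/* A hedged user-space sketch of the client-side Fast Open entry point this
 * function serves (not part of this file): sendto(MSG_FASTOPEN) carries
 * data on the SYN when a TFO cookie is cached; the TCP_FASTOPEN_CONNECT
 * sockopt instead sets DEFER_CONNECT so a later write() generates SYN+data
 * (the defer_connect branch above). Availability of MSG_FASTOPEN in libc
 * headers is an assumption (glibc exposes it since 2.18).
 */
#define _GNU_SOURCE
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t tfo_send(int fd, const struct sockaddr_in *dst,
			const char *data)
{
	/* Data rides on the SYN if a valid TFO cookie is cached;
	 * otherwise the kernel falls back to a regular handshake. */
	return sendto(fd, data, strlen(data), MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}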
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct net_devmem_dmabuf_binding *binding = NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	int process_backlog = 0;
	int sockc_err = 0;
	int zc = 0;
	long timeo;

	flags = msg->msg_flags;

	sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
	if (msg->msg_controllen) {
		sockc_err = sock_cmsg_send(sk, msg, &sockc);
		/* Don't return error until MSG_FASTOPEN has been processed;
		 * that may succeed even if the cmsg is invalid.
		 */
	}
	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	if (sockc_err) {
		err = sockc_err;
		goto out_err;
	}

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		/* All packets are restored as if they have
		 * already been sent. skb_mstamp_ns isn't set to
		 * avoid wrong rtt estimation.
		 */
		if (tp->repair)
			TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
	}
		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		if (zc == 0) {
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_space;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}
out:
	if (copied) {
		tcp_tx_timestamp(sk, &sockc);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
	if (uarg && !msg->msg_ubuf)
		net_zcopy_put(uarg);
	if (binding)
		net_devmem_dmabuf_binding_put(binding);
	return copied + copied_syn;

do_error:
	tcp_remove_empty_skb(sk);

	if (copied + copied_syn)
		goto out;
out_err:
	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
	if (uarg && !msg->msg_ubuf)
		net_zcopy_put_abort(uarg, true);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	if (binding)
		net_devmem_dmabuf_binding_put(binding);
	return err;
}
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */

static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			WRITE_ONCE(tp->urg_data, TCP_URG_READ);

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_to_msg(msg, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
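/* A hedged user-space sketch of the contract implemented above (standard
 * sockets API, not part of this file): recv(MSG_OOB) returns the single
 * urgent byte, never blocks, and fails with EINVAL once the byte was
 * consumed or when SO_OOBINLINE is set.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

static void read_urgent_byte(int fd)
{
	char c;
	ssize_t n = recv(fd, &c, 1, MSG_OOB);

	if (n == 1)
		printf("urgent byte: 0x%02x\n", (unsigned char)c);
	else if (n < 0 && errno == EAGAIN)
		puts("urgent data announced but not yet arrived");
	else if (n < 0 && errno == EINVAL)
		puts("no urgent data (already read, or OOB is inline)");
}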
static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !inet_csk_in_pingpong_mode(sk))) &&
		     !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than current one.
			 * "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * split a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		tcp_eat_recv_skb(sk, skb);
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_recv_skb);
/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
			   sk_read_actor_t recv_actor, bool noack,
			   u32 *copied_seq)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = *copied_seq;
	u32 offset;
	int copied = 0;
	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (unlikely(tp->urg_data)) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			}
			if (WARN_ON_ONCE(used > len))
				used = len;
			seq += used;
			copied += used;
			offset += used;

			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			tcp_eat_recv_skb(sk, skb);
			++seq;
			break;
		}
		tcp_eat_recv_skb(sk, skb);
		if (!desc->count)
			break;
		WRITE_ONCE(*copied_seq, seq);
	}
	WRITE_ONCE(*copied_seq, seq);

	if (noack)
		goto out;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
out:
	return copied;
}
	/* Clean up data we have read: This will do ACK frames. */
	if (left != len)
		tcp_cleanup_rbuf(sk, len - left);
}
EXPORT_SYMBOL(tcp_read_done);
int tcp_peek_len(struct socket *sock)
{
	return tcp_inq(sock->sk);
}
EXPORT_IPV6_MOD(tcp_peek_len);
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int space, cap;

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
	val = min(val, cap);
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);

	/* Check if we need to signal EPOLLIN right now */
	tcp_data_ready(sk);

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	space = tcp_space_from_win(sk, val);
	if (space > sk->sk_rcvbuf) {
		WRITE_ONCE(sk->sk_rcvbuf, space);
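/* A hedged user-space sketch (standard sockets API, not part of this
 * file) of the setsockopt() path that lands here: SO_RCVLOWAT makes
 * poll()/recv() wait until at least 'lowat' bytes are queued, and the
 * code above may grow sk_rcvbuf so the hint is actually satisfiable.
 */
#include <stdio.h>
#include <sys/socket.h>

static int set_rcvlowat(int fd, int lowat)
{
	if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
		       &lowat, sizeof(lowat)) < 0) {
		perror("SO_RCVLOWAT");
		return -1;
	}
	return 0;
}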
	/* worst case: skip to next skb. try to improve on this case below */
	zc->recv_skip_hint = skb->len - offset;

	/* Find the frag containing this offset (and how far into that frag) */
	frag = skb_advance_to_frag(skb, offset, &frag_offset);
	if (!frag)
		return;

	if (frag_offset) {
		struct skb_shared_info *info = skb_shinfo(skb);

		/* We read part of the last frag, must recvmsg() rest of skb. */
		if (frag == &info->frags[info->nr_frags - 1])
			return;

		/* Else, we must at least read the remainder in this frag. */
		partial_frag_remainder = skb_frag_size(frag) - frag_offset;
		zc->recv_skip_hint -= partial_frag_remainder;
		++frag;
	}

	/* partial_frag_remainder: If part way through a frag, must read rest.
	 * mappable_offset: Bytes till next mappable frag, *not* counting bytes
	 * in partial_frag_remainder.
	 */
	mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
}
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
			      int flags, struct scm_timestamping_internal *tss,
			      int *cmsg_flags);
static int receive_fallback_to_copy(struct sock *sk,
				    struct tcp_zerocopy_receive *zc, int inq,
				    struct scm_timestamping_internal *tss)
{
	unsigned long copy_address = (unsigned long)zc->copybuf_address;
	struct msghdr msg = {};
	int err;

	zc->length = 0;
	zc->recv_skip_hint = 0;

	if (copy_address != zc->copybuf_address)
		return -EINVAL;
	if (!err) {
		unsigned long leftover_pages = pages_remaining;
		int bytes_mapped;

		/* We called zap_page_range_single, try to reinsert. */
		err = vm_insert_pages(vma, *address,
				      pending_pages,
				      &pages_remaining);
		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
		*seq += bytes_mapped;
		*address += bytes_mapped;
	}
	if (err) {
		/* Either we were unable to zap, OR we zapped, retried an
		 * insert, and still had an issue. Either way, pages_remaining
		 * is the number of pages we were unable to map, and we unroll
		 * some state we speculatively touched before.
		 */
		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;

	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
	bytes_mapped = PAGE_SIZE * pages_mapped;
	/* Even if vm_insert_pages fails, it may have partially succeeded in
	 * mapping (some but not all of the pages).
	 */
	*seq += bytes_mapped;
	*address += bytes_mapped;

	if (likely(!err))
		return 0;

	/* Error: maybe zap and retry + rollback state for failed inserts. */
	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
						  pages_remaining, address,
						  length, seq, zc,
						  total_bytes_to_map, err);
}
		mappable_offset = find_next_mappable_frag(frags,
							  zc->recv_skip_hint);
		if (mappable_offset) {
			zc->recv_skip_hint = mappable_offset;
			break;
		}
		page = skb_frag_page(frags);
		if (WARN_ON_ONCE(!page))
			break;
		prefetchw(page);
		pages[pages_to_map++] = page;
		length += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
		    zc->recv_skip_hint < PAGE_SIZE) {
			/* Either full batch, or we're about to go to next skb
			 * (and we cannot unroll failed ops across skbs).
			 */
			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
							   pages_to_map,
							   &address, &length,
							   &seq, zc,
							   total_bytes_to_map);
			if (ret)
				goto out;
			pages_to_map = 0;
		}
	}
	if (pages_to_map) {
		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
						   &address, &length, &seq,
						   zc, total_bytes_to_map);
	}
out:
	if (mmap_locked)
		mmap_read_unlock(current->mm);
	else
		vma_end_read(vma);
	/* Try to copy straggler data. */
	if (!ret)
		copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq,
						 copybuf_len, tss);

	if (length + copylen) {
		WRITE_ONCE(tp->copied_seq, seq);
		tcp_rcv_space_adjust(sk);

		/* Clean up data we have read: This will do ACK frames. */
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, length + copylen);
		ret = 0;
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
#endif
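/* A hedged user-space sketch of the receive-zerocopy API served above
 * (not part of this file; tools/testing/selftests/net/tcp_mmap.c is the
 * canonical user). The region passed in zc.address is assumed to come
 * from mmap() on the socket fd; only the minimal field subset is shown.
 */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t zc_receive(int fd, void *map, size_t chunk)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)map;	/* region from mmap(fd) */
	zc.length = chunk;			/* multiple of PAGE_SIZE */
	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
		       &zc, &zc_len) < 0)
		return -1;
	/* zc.length bytes are now mapped at 'map'; zc.recv_skip_hint
	 * bytes must be read with recvmsg() instead. */
	return zc.length;
}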
/* Similar to __sock_recv_timestamp, but does not require an skb */
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
			struct scm_timestamping_internal *tss)
{
	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
	u32 tsflags = READ_ONCE(sk->sk_tsflags);
	bool has_timestamping = false;