/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the AF_INET socket handler.
 *
 * Version:	@(#)sock.h	1.0.4	05/13/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *		Alan Cox	:	Volatiles in skbuff pointers. See
 *					skbuff comments. May be overdone,
 *					better to prove they can be removed
 *					than the reverse.
 *		Alan Cox	:	Added a zapped field for tcp to note
 *					a socket is reset and must stay shut up
 *		Alan Cox	:	New fields for options
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Eliminate low level recv/recvfrom
 *		David S. Miller	:	New socket lookup architecture.
 *		Steve Whitehouse:	Default routines for sock_ops
 *		Arnaldo C. Melo :	removed net_pinfo, tp_pinfo and made
 *					protinfo be just a void pointer, as the
 *					protocol specific parts were moved to
 *					respective headers and ipv4/v6, etc now
 *					use private slabcaches for its socks
 *		Pedro Hortas	:	New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H
/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */
/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 */
typedef struct {
	spinlock_t		slock;
	int			owned;
	wait_queue_head_t	wq;
	/*
	 * We express the mutex-alike socket_lock semantics
	 * to the lock validator by explicitly managing
	 * the slock as a lock variant (in addition to
	 * the slock itself):
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} socket_lock_t;
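/*
 * Illustrative sketch (not part of the original header): how the two
 * halves of socket_lock_t are typically used.  Process context takes
 * the mini-semaphore via lock_sock() (declared further down), while
 * softirq context takes only the spinlock via bh_lock_sock().  The
 * helper name below is hypothetical.
 */
#if 0	/* usage example only */
static void example_socket_lock_usage(struct sock *sk)
{
	lock_sock(sk);		/* user context: may sleep, owns the socket */
	/* ... modify socket state safely against softirq processing ... */
	release_sock(sk);	/* also runs the accumulated backlog */
}
#endif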
/**
 * struct sock_common - minimal network layer representation of sockets
 * @skc_daddr: Foreign IPv4 addr
 * @skc_rcv_saddr: Bound local IPv4 addr
 * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 * @skc_hash: hash value used with various protocol lookup tables
 * @skc_u16hashes: two u16 hash values used by UDP lookup tables
 * @skc_dport: placeholder for inet_dport/tw_dport
 * @skc_num: placeholder for inet_num/tw_num
 * @skc_portpair: __u32 union of @skc_dport & @skc_num
 * @skc_family: network address family
 * @skc_state: Connection state
 * @skc_reuse: %SO_REUSEADDR setting
 * @skc_reuseport: %SO_REUSEPORT setting
 * @skc_ipv6only: socket is IPV6 only
 * @skc_net_refcnt: socket is using net ref counting
 * @skc_bound_dev_if: bound device index if != 0
 * @skc_bind_node: bind hash linkage for various protocol lookup tables
 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 * @skc_prot: protocol handlers inside a network family
 * @skc_net: reference to the network namespace of this socket
 * @skc_v6_daddr: IPV6 destination address
 * @skc_v6_rcv_saddr: IPV6 source address
 * @skc_cookie: socket's cookie value
 * @skc_node: main hash linkage for various protocol lookup tables
 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 * @skc_tx_queue_mapping: tx queue number for this connection
 * @skc_rx_queue_mapping: rx queue number for this connection
 * @skc_flags: place holder for sk_flags
 *	%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *	%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 * @skc_listener: connection request listener socket (aka rsk_listener)
 *	[union with @skc_flags]
 * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *	[union with @skc_flags]
 * @skc_incoming_cpu: record/match cpu processing incoming packets
 * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *	[union with @skc_incoming_cpu]
 * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *	[union with @skc_incoming_cpu]
 * @skc_refcnt: reference count
 *
 * This is the minimal network layer representation of sockets, the header
 * for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
	union {
		__addrpair	skc_addrpair;
		struct {
			__be32	skc_daddr;
			__be32	skc_rcv_saddr;
		};
	};
	union {
		unsigned int	skc_hash;
		__u16		skc_u16hashes[2];
	};
	/* skc_dport && skc_num must be grouped as well */
	union {
		__portpair	skc_portpair;
		struct {
			__be16	skc_dport;
			__u16	skc_num;
		};
	};
	unsigned short		skc_family;
	volatile unsigned char	skc_state;
	unsigned char		skc_reuse:4;
	unsigned char		skc_reuseport:1;
	unsigned char		skc_ipv6only:1;
	unsigned char		skc_net_refcnt:1;
	int			skc_bound_dev_if;
	union {
		struct hlist_node	skc_bind_node;
		struct hlist_node	skc_portaddr_node;
	};
	struct proto		*skc_prot;
	possible_net_t		skc_net;

	/* following fields are padding to force
	 * offset(struct sock, sk_refcnt) == 128 on 64bit arches
	 * assuming IPV6 is enabled. We use this padding differently
	 * for different kinds of 'sockets'
	 */
	union {
		unsigned long	skc_flags;
		struct sock	*skc_listener; /* request_sock */
		struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
	};
	/*
	 * fields between dontcopy_begin/dontcopy_end
	 * are not copied in sock_copy()
	 */
	/* private: */
	int			skc_dontcopy_begin[0];
	/* public: */
	union {
		struct hlist_node	skc_node;
		struct hlist_nulls_node skc_nulls_node;
	};
	unsigned short		skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
	unsigned short		skc_rx_queue_mapping;
#endif
	union {
		int		skc_incoming_cpu;
		u32		skc_rcv_wnd;
		u32		skc_tw_rcv_nxt; /* struct tcp_timewait_sock */
	};

	refcount_t		skc_refcnt;
	/* private: */
	int			skc_dontcopy_end[0];
	/* public: */
};
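/*
 * Illustrative sketch (not from the original header): the
 * skc_addrpair/skc_portpair unions let lookup code compare both
 * addresses, or both port fields, with a single word-sized load and
 * compare.  The helper name is hypothetical.
 */
#if 0	/* usage example only */
static bool example_addrpair_match(const struct sock_common *skc,
				   __addrpair cookie)
{
	/* matches skc_daddr and skc_rcv_saddr in one comparison */
	return skc->skc_addrpair == cookie;
}
#endif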
/**
 * struct sock - network layer representation of sockets
 * @__sk_common: shared layout with inet_timewait_sock
 * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
 * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
 * @sk_lock: synchronizer
 * @sk_kern_sock: True if sock is using kernel lock classes
 * @sk_rcvbuf: size of receive buffer in bytes
 * @sk_wq: sock wait queue and async head
 * @sk_rx_dst: receive input route used by early demux
 * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
 * @sk_rx_dst_cookie: cookie for @sk_rx_dst
 * @sk_dst_cache: destination cache
 * @sk_dst_pending_confirm: need to confirm neighbour
 * @sk_policy: flow policy
 * @sk_receive_queue: incoming packets
 * @sk_wmem_alloc: transmit queue bytes committed
 * @sk_tsq_flags: TCP Small Queues flags
 * @sk_write_queue: Packet sending queue
 * @sk_omem_alloc: "o" is "option" or "other"
 * @sk_wmem_queued: persistent queue size
 * @sk_forward_alloc: space allocated forward
 * @sk_reserved_mem: space reserved and non-reclaimable for the socket
 * @sk_napi_id: id of the last napi context to receive data for sk
 * @sk_ll_usec: usecs to busypoll when there is no data
 * @sk_allocation: allocation mode
 * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
 * @sk_pacing_status: Pacing status (requested, handled by sch_fq)
 * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
 * @sk_sndbuf: size of send buffer in bytes
 * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
 * @sk_no_check_rx: allow zero checksum in RX packets
 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
 * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
 * @sk_gso_max_size: Maximum GSO segment size to build
 * @sk_gso_max_segs: Maximum number of GSO segments
 * @sk_pacing_shift: scaling factor for TCP Small Queues
 * @sk_lingertime: %SO_LINGER l_linger setting
 * @sk_backlog: always used with the per-socket spinlock held
 * @sk_callback_lock: used with the callbacks in the end of this struct
 * @sk_error_queue: rarely used
 * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
 *		     IPV6_ADDRFORM for instance)
 * @sk_err: last error
 * @sk_err_soft: errors that don't cause failure but are the cause of a
 *		 persistent failure not just 'timed out'
 * @sk_drops: raw/udp drops counter
 * @sk_ack_backlog: current listen backlog
 * @sk_max_ack_backlog: listen backlog set in listen()
 * @sk_uid: user id of owner
 * @sk_ino: inode number (zero if orphaned)
 * @sk_prefer_busy_poll: prefer busypolling over softirq processing
 * @sk_busy_poll_budget: napi processing budget when busypolling
 * @sk_priority: %SO_PRIORITY setting
 * @sk_type: socket type (%SOCK_STREAM, etc)
 * @sk_protocol: which protocol this socket belongs in this network family
 * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
 * @sk_peer_pid: &struct pid for this socket's peer
 * @sk_peer_cred: %SO_PEERCRED setting
 * @sk_rcvlowat: %SO_RCVLOWAT setting
 * @sk_rcvtimeo: %SO_RCVTIMEO setting
 * @sk_sndtimeo: %SO_SNDTIMEO setting
 * @sk_txhash: computed flow hash for use on transmit
 * @sk_txrehash: enable TX hash rethink
 * @sk_filter: socket filtering instructions
 * @sk_timer: sock cleanup timer
 * @sk_stamp: time stamp of last packet received
 * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
 * @sk_tsflags: SO_TIMESTAMPING flags
 * @sk_bpf_cb_flags: used in bpf_setsockopt()
 * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
 *		      Sockets that can be used under memory reclaim should
 *		      set this to false.
 * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
 *		 for timestamping
 * @sk_tskey: counter to disambiguate concurrent tstamp requests
 * @sk_zckey: counter to order MSG_ZEROCOPY notifications
 * @sk_socket: Identd and reporting IO signals
 * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
 * @sk_frag: cached page frag
 * @sk_peek_off: current peek_offset value
 * @sk_send_head: front of stuff to transmit
 * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
 * @sk_security: used by security modules
 * @sk_mark: generic packet mark
 * @sk_cgrp_data: cgroup data for this cgroup
 * @sk_memcg: this socket's memory cgroup association
 * @sk_write_pending: a write to stream socket waits to start
 * @sk_disconnects: number of disconnect operations performed on this sock
 * @sk_state_change: callback to indicate change in the state of the sock
 * @sk_data_ready: callback to indicate there is data to be processed
 * @sk_write_space: callback to indicate there is buffer sending space available
 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
 * @sk_backlog_rcv: callback to process the backlog
 * @sk_validate_xmit_skb: ptr to an optional validate function
 * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 * @sk_reuseport_cb: reuseport group container
 * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
 * @sk_rcu: used during RCU grace period
 * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
 * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
 * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
 * @sk_txtime_unused: unused txtime flags
 * @sk_scm_recv_flags: all flags used by scm_recv()
 * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
 * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
 * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
 * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
 * @sk_scm_unused: unused flags for scm_recv()
 * @ns_tracker: tracker for netns reference
 * @sk_user_frags: xarray of pages the user is holding a reference on.
 * @sk_owner: reference to the real owner of the socket that calls
 *	      sock_lock_init_class_and_name().
 */
struct sock {
	/*
	 * Now struct inet_timewait_sock also uses sock_common, so please just
	 * don't add anything before this first member (__sk_common) --acme
	 */
	struct sock_common	__sk_common;
#define sk_node			__sk_common.skc_node
#define sk_nulls_node		__sk_common.skc_nulls_node
#define sk_refcnt		__sk_common.skc_refcnt
#define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
#define sk_rx_queue_mapping	__sk_common.skc_rx_queue_mapping
#endif
	atomic_t		sk_drops;
	__s32			sk_peek_off;
	struct sk_buff_head	sk_error_queue;
	struct sk_buff_head	sk_receive_queue;
	/*
	 * The backlog queue is special, it is always used with
	 * the per-socket spinlock held and requires low latency
	 * access. Therefore we special case its implementation.
	 * Note: rmem_alloc is in this structure to fill a hole
	 * on 64bit arches, not because it's logically part of
	 * backlog.
	 */
	struct {
		atomic_t	rmem_alloc;
		int		len;
		struct sk_buff	*head;
		struct sk_buff	*tail;
	} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
	__cacheline_group_end(sock_write_rx);

	__cacheline_group_begin(sock_read_rx);
	/* early demux fields */
	struct dst_entry __rcu	*sk_rx_dst;
	int			sk_rx_dst_ifindex;
	u32			sk_rx_dst_cookie;
};
/* flag bits in sk_user_data
 *
 * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
 *   not be suitable for copying when cloning the socket. For instance,
 *   it can point to a reference counted object. sk_user_data bottom
 *   bit is set if pointer must not be copied.
 *
 * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
 *   managed/owned by a BPF reuseport array. This bit should be set
 *   when sk_user_data's sk is added to the bpf's reuseport_array.
 *
 * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
 *   sk_user_data points to psock type. This bit should be set
 *   when sk_user_data is assigned to a psock object.
 */
#define SK_USER_DATA_NOCOPY	1UL
#define SK_USER_DATA_BPF	2UL
#define SK_USER_DATA_PSOCK	4UL
#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
				  SK_USER_DATA_PSOCK)
/**
 * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
 * @sk: socket
 */
static inline bool sk_user_data_is_nocopy(const struct sock *sk)
{
	return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY);
}
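/*
 * Illustrative sketch: recovering the real pointer from a tagged
 * sk_user_data word.  SK_USER_DATA_PTRMASK clears the low flag bits;
 * this mirrors what the accessors below do internally.  The function
 * name is hypothetical.
 */
#if 0	/* usage example only */
static void *example_sk_user_data_ptr(uintptr_t sk_user_data)
{
	return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
}
#endif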
/**
 * __locked_read_sk_user_data_with_flags - return the pointer only if all
 * of the given flag bits have been set in sk_user_data. Otherwise return
 * NULL
 *
 * @sk: socket
 * @flags: flag bits
 *
 * The caller must be holding sk->sk_callback_lock.
 */
static inline void *
__locked_read_sk_user_data_with_flags(const struct sock *sk,
				      uintptr_t flags)
{
	uintptr_t sk_user_data =
		(uintptr_t)rcu_dereference_check(__sk_user_data(sk),
						 lockdep_is_held(&sk->sk_callback_lock));

	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

	if ((sk_user_data & flags) == flags)
		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
	return NULL;
}
/**
 * __rcu_dereference_sk_user_data_with_flags - return the pointer only if
 * all of the given flag bits have been set in sk_user_data. Otherwise
 * return NULL
 *
 * @sk: socket
 * @flags: flag bits
 */
static inline void *
__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
					  uintptr_t flags)
{
	uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));

	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

	if ((sk_user_data & flags) == flags)
		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
	return NULL;
}
/*
 * SK_CAN_REUSE and SK_NO_REUSE on a socket mean whether or not it is OK
 * for its port to be reused by someone else. SK_FORCE_REUSE on a socket
 * means that the socket will reuse everybody else's port without looking
 * at the other socket's sk_reuse value.
 */
#define SK_NO_REUSE	0
#define SK_CAN_REUSE	1
#define SK_FORCE_REUSE	2
/* NB: equivalent to hlist_del_init_rcu */
static inline bool __sk_del_node_init(struct sock *sk)
{
	if (sk_hashed(sk)) {
		__sk_del_node(sk);
		sk_node_init(&sk->sk_node);
		return true;
	}
	return false;
}
/* Grab socket reference count. This operation is valid only
 * when sk is ALREADY grabbed, e.g. it is found in a hash table
 * or a list and the lookup is made under a lock preventing hash
 * table modifications.
 */
static __always_inline void sock_hold(struct sock *sk)
{
	refcount_inc(&sk->sk_refcnt);
}
/* Ungrab socket in the context, which assumes that socket refcnt
 * cannot hit zero, e.g. it is true in context of any socketcall.
 */
static __always_inline void __sock_put(struct sock *sk)
{
	refcount_dec(&sk->sk_refcnt);
}
/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_node to use as a loop cursor.
 * @head:	the head for your list.
 * @offset:	offset of hlist_node within the struct.
 *
 */
#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)		       \
	for (pos = rcu_dereference(hlist_first_rcu(head));		       \
	     pos != NULL &&						       \
		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
	     pos = rcu_dereference(hlist_next_rcu(pos)))
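/*
 * Illustrative sketch of sk_for_each_entry_offset_rcu(): walk a hash
 * chain whose hlist_node lives at a non-zero offset inside struct sock.
 * The chain head and the counting body are hypothetical.
 */
#if 0	/* usage example only */
static unsigned int example_count_chain(struct hlist_head *head)
{
	struct hlist_node *node;
	struct sock *sk;
	unsigned int n = 0;

	sk_for_each_entry_offset_rcu(sk, node, head,
				     offsetof(struct sock, sk_node))
		if (sk)
			n++;
	return n;
}
#endif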
static inline struct user_namespace *sk_user_ns(const struct sock *sk)
{
	/* Careful only use this in a context where these parameters
	 * cannot change and must all be valid, such as recvmsg from
	 * userspace.
	 */
	return sk->sk_socket->file->f_cred->user_ns;
}
/* Sock flags */
enum sock_flags {
	SOCK_DEAD,
	SOCK_DONE,
	SOCK_URGINLINE,
	SOCK_KEEPOPEN,
	SOCK_LINGER,
	SOCK_DESTROY,
	SOCK_BROADCAST,
	SOCK_TIMESTAMP,
	SOCK_ZAPPED,
	SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
	SOCK_DBG, /* %SO_DEBUG setting */
	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
	SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
	SOCK_MEMALLOC, /* VM depends on this socket for swapping */
	SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
	SOCK_FASYNC, /* fasync() active */
	SOCK_RXQ_OVFL,
	SOCK_ZEROCOPY, /* buffers from userspace */
	SOCK_WIFI_STATUS, /* push wifi status to userspace */
	SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
		     * Will use last 4 bytes of packet sent from
		     * user-space instead.
		     */
	SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
	SOCK_TXTIME,
	SOCK_XDP, /* XDP is attached */
	SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
	SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
	SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */
	SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

/*
 * The highest bit of sk_tsflags is reserved for kernel-internal
 * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that
 * SOF_TIMESTAMPING* values do not reach this reserved area.
 */
#define SOCKCM_FLAG_TS_OPT_ID	BIT(31)
/* Note: If you think the test should be:
 *	return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * Then please take a look at commit 64a146513f8f
 * ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
	return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}
/*
 * Compute minimal free write space needed to queue new packets.
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
	return READ_ONCE(sk->sk_wmem_queued) >> 1;
}
static inline void sk_forward_alloc_add(struct sock *sk, int val)
{
	/* Paired with lockless reads of sk->sk_forward_alloc */
	WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val);
}
void sk_stream_write_space(struct sock *sk);
/* OOB backlog add */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	/* Don't let the skb dst go un-refcounted; we are going to leave the rcu lock */
	skb_dst_force(skb);

	if (!sk->sk_backlog.tail)
		WRITE_ONCE(sk->sk_backlog.head, skb);
	else
		sk->sk_backlog.tail->next = skb;

	WRITE_ONCE(sk->sk_backlog.tail, skb);
	skb->next = NULL;
}
/*
 * Take into account size of receive queue and backlog queue.
 * Do not take into account this skb truesize,
 * to allow even a single big packet to come.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

	return qsize > limit;
}
/* The per-socket spinlock must be held here. */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
					      unsigned int limit)
{
	if (sk_rcvqueues_full(sk, limit))
		return -ENOBUFS;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

	__sk_add_backlog(sk, skb);
	sk->sk_backlog.len += skb->truesize;
	return 0;
}
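/*
 * Illustrative sketch (loosely modeled on what real protocols do): the
 * canonical receive pattern pairing bh_lock_sock() with sk_add_backlog().
 * If a process owns the socket, the packet goes to the backlog and is
 * processed when the owner releases the lock.  sock_owned_by_user() and
 * sk_backlog_rcv() come from the full header; the function name and the
 * use of sk_rcvbuf as the limit are assumptions.
 */
#if 0	/* usage example only */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		ret = sk_backlog_rcv(sk, skb);	/* fast path */
	else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
		ret = -ENOBUFS;			/* queues full: drop */
	bh_unlock_sock(sk);
	return ret;
}
#endif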
static inline void sk_incoming_cpu_update(struct sock *sk)
{
	int cpu = raw_smp_processor_id();

	if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu))
		WRITE_ONCE(sk->sk_incoming_cpu, cpu);
}
static inline void sock_rps_save_rxhash(struct sock *sk,
					const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
	/* The following WRITE_ONCE() is paired with the READ_ONCE()
	 * here, and another one in sock_rps_record_flow().
	 */
	if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash))
		WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
	/* Paired with READ_ONCE() in sock_rps_record_flow() */
	WRITE_ONCE(sk->sk_rxhash, 0);
#endif
}
int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
void sk_stream_wait_close(struct sock *sk, long timeo_p);
int sk_stream_error(struct sock *sk, int flags, int err);
void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);
/*
 * Caches using SLAB_TYPESAFE_BY_RCU should leave the .next pointer
 * of nulls nodes unmodified. Special care is taken when initializing
 * the object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}
struct proto_accept_arg {
	int	flags;
	int	err;
	int	is_empty;
	bool	kern;
};
/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 */
struct proto {
	void			(*close)(struct sock *sk,
					long timeout);
	int			(*pre_connect)(struct sock *sk,
					struct sockaddr *uaddr,
					int addr_len);
	int			(*connect)(struct sock *sk,
					struct sockaddr *uaddr,
					int addr_len);
	int			(*disconnect)(struct sock *sk, int flags);

	int			(*ioctl)(struct sock *sk, int cmd, int *karg);
	int			(*init)(struct sock *sk);
	void			(*destroy)(struct sock *sk);
	void			(*shutdown)(struct sock *sk, int how);
	int			(*setsockopt)(struct sock *sk, int level,
					int optname, sockptr_t optval,
					unsigned int optlen);
	int			(*getsockopt)(struct sock *sk, int level,
					int optname, char __user *optval,
					int __user *option);
	void			(*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
	int			(*compat_ioctl)(struct sock *sk,
					unsigned int cmd, unsigned long arg);
#endif
	int			(*sendmsg)(struct sock *sk, struct msghdr *msg,
					   size_t len);
	int			(*recvmsg)(struct sock *sk, struct msghdr *msg,
					   size_t len, int flags, int *addr_len);
	void			(*splice_eof)(struct socket *sock);
	int			(*bind)(struct sock *sk,
					struct sockaddr *addr, int addr_len);
	int			(*bind_add)(struct sock *sk,
					struct sockaddr *addr, int addr_len);

	int			(*backlog_rcv)(struct sock *sk,
					       struct sk_buff *skb);
	bool			(*bpf_bypass_getsockopt)(int level,
							 int optname);

	void			(*release_cb)(struct sock *sk);

	/* Keeping track of sk's, looking them up, and port selection methods. */
	int			(*hash)(struct sock *sk);
	void			(*unhash)(struct sock *sk);
	void			(*rehash)(struct sock *sk);
	int			(*get_port)(struct sock *sk, unsigned short snum);
	void			(*put_port)(struct sock *sk);
#ifdef CONFIG_BPF_SYSCALL
	int			(*psock_update_sk_prot)(struct sock *sk,
							struct sk_psock *psock,
							bool restore);
#endif

	/* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
	unsigned int		inuse_idx;
#endif

	bool			(*stream_memory_free)(const struct sock *sk, int wake);
	bool			(*sock_is_readable)(struct sock *sk);
	/* Memory pressure */
	void			(*enter_memory_pressure)(struct sock *sk);
	void			(*leave_memory_pressure)(struct sock *sk);

	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
	int __percpu		*per_cpu_fw_alloc;
	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */

	/*
	 * Pressure flag: try to collapse.
	 * Technical note: it is used by multiple contexts non atomically.
	 * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
	 * All of __sk_mem_schedule() is of this nature: accounting
	 * is strict, actions are advisory and have some latency.
	 */
	unsigned long		*memory_pressure;
	long			*sysctl_mem;

	int			*sysctl_wmem;
	int			*sysctl_rmem;
	u32			sysctl_wmem_offset;
	u32			sysctl_rmem_offset;

	struct list_head	node;
	int			(*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);
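/*
 * Illustrative sketch: a protocol describes itself with a struct proto
 * and registers it at init time.  Everything here is hypothetical; the
 * .name/.owner/.obj_size fields exist in the full struct proto but are
 * elided from this excerpt (compare tcp_prot in net/ipv4/tcp_ipv4.c).
 */
#if 0	/* usage example only */
struct example_sock {
	struct sock	sk;		/* must be the first member */
	u32		example_state;
};

static struct proto example_prot = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_prot, 1);  /* 1: allocate a slab cache */
}
#endif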
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
		return false;

	return sk->sk_prot->stream_memory_free ?
		INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free,
				     tcp_stream_memory_free, sk, wake) : true;
}
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
	return percpu_counter_sum_positive(prot->sockets_allocated);
}
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int all;
	int val[PROTO_INUSE_NR];
};
static inline void sock_prot_inuse_add(const struct net *net,
				       const struct proto *prot, int val)
{
	this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
static inline void sock_inuse_add(const struct net *net, int val)
{
	this_cpu_add(net->core.prot_inuse->all, val);
}
int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
static inline void sock_prot_inuse_add(const struct net *net,
				       const struct proto *prot, int val)
{
}

static inline void sock_inuse_add(const struct net *net, int val)
{
}
#endif
/* With per-bucket locks this operation is not atomic, so this
 * version is no worse.
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
	sk->sk_prot->unhash(sk);
	return sk->sk_prot->hash(sk);
}
/* About 10 seconds */
#define SOCK_DESTROY_TIME (10*HZ)
/* Sockets 0-1023 can't be bound to unless you are superuser */
#define PROT_SOCK	1024
/*
 * Functions for memory accounting
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

#define SK_MEM_SEND	0
#define SK_MEM_RECV	1
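/*
 * Illustrative sketch of the accounting pair: reserve receive memory
 * through __sk_mem_schedule() before queueing, then consume the
 * reservation from sk_forward_alloc.  This is a simplified shape of
 * what the sk_rmem_schedule()/sk_mem_charge() wrappers in the full
 * header do; the function name is hypothetical.
 */
#if 0	/* usage example only */
static bool example_charge_rcv(struct sock *sk, int size)
{
	if (!__sk_mem_schedule(sk, size, SK_MEM_RECV))
		return false;			/* over limits: caller drops */
	sk_forward_alloc_add(sk, -size);	/* consume the reservation */
	return true;
}
#endif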
/* sysctl_mem values are in pages */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
	return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}
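/*
 * Illustrative sketch: checking a protocol's global memory usage against
 * the hard limit (index 2 of the {low, pressure, high} triplet, in
 * pages).  Hypothetical helper using only fields visible above.
 */
#if 0	/* usage example only */
static bool example_over_hard_limit(const struct sock *sk)
{
	return atomic_long_read(sk->sk_prot->memory_allocated) >
	       sk_prot_mem_limits(sk, 2);
}
#endif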
static inline void sk_owner_put(struct sock *sk)
{
}
#endif

/*
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)	\
do {									\
	sk_owner_set(sk, THIS_MODULE);					\
	sk->sk_lock.owned = 0;						\
	init_waitqueue_head(&sk->sk_lock.wq);				\
	spin_lock_init(&(sk)->sk_lock.slock);				\
	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
				   sizeof((sk)->sk_lock));		\
	lockdep_set_class_and_name(&(sk)->sk_lock.slock,		\
				   (skey), (sname));			\
	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
} while (0)
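/*
 * Illustrative sketch of sock_lock_init_class_and_name(): give each
 * address family its own lockdep class so lockdep can tell the locks
 * apart.  The keys and names below are hypothetical; compare
 * af_family_keys[] in net/core/sock.c.
 */
#if 0	/* usage example only */
static struct lock_class_key example_slock_key;
static struct lock_class_key example_sk_lock_key;

static void example_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
				      "slock-AF_EXAMPLE", &example_slock_key,
				      "sk_lock-AF_EXAMPLE", &example_sk_lock_key);
}
#endif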
/* BH context may only use the following locking interface. */
#define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
				spin_lock_nested(&((__sk)->sk_lock.slock), \
				SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Return true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
static inline bool lock_sock_fast(struct sock *sk)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);

	return __lock_sock_fast(sk);
}
/* fast socket lock variant for caller already holding a [different] socket lock */
static inline bool lock_sock_fast_nested(struct sock *sk)
{
	mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

	return __lock_sock_fast(sk);
}
/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: slow mode
 *
 * fast unlock socket for user context.
 * If slow mode is on, we call regular release_sock()
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
	__releases(&sk->sk_lock.slock)
{
	if (slow) {
		release_sock(sk);
		__release(&sk->sk_lock.slock);
	} else {
		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
		spin_unlock_bh(&sk->sk_lock.slock);
	}
}
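/*
 * Illustrative sketch: lock_sock_fast() reports whether the slow path
 * was taken, and that value must be handed back to unlock_sock_fast().
 * The tiny critical section below is hypothetical.
 */
#if 0	/* usage example only */
static int example_read_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err;		/* short, non-blocking work only */

	unlock_sock_fast(sk, slow);
	return err;
}
#endif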
/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue. This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also an exclusive sleep lock serializing
 * accesses from user process context.
 */
void lock_sock_nested(struct sock *sk, int subclass);

static inline void lock_sock(struct sock *sk)
{
	lock_sock_nested(sk, 0);
}

void release_sock(struct sock *sk);

static inline void sock_release_ownership(struct sock *sk)
{
	sk->sk_lock.owned = 0;

	/* The sk_lock has mutex_unlock() semantics: */
	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}
/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
	struct sock *sk = (struct sock *)csk;

	return !sock_owned_by_user_nocheck(sk) &&
		!spin_is_locked(&sk->sk_lock.slock);
}
int sk_setsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
		    sockptr_t optval, unsigned int optlen);
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
		       int optname, sockptr_t optval, int optlen);
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
		       int optname, sockptr_t optval, sockptr_t optlen);
int sk_getsockopt(struct sock *sk, int level, int optname,
		  sockptr_t optval, sockptr_t optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order);
static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
						  unsigned long size,
						  int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void *sock_kmemdup(struct sock *sk, const void *src, int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);
static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
{
	if (sk->sk_socket)
		clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);

	WRITE_ONCE(sk->sk_prot, proto);
}
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc);
/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma);
/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen);
void sk_common_release(struct sock *sk);
/*
 *	Default socket callbacks and setup code
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 */
void sock_init_data(struct socket *sock, struct sock *sk);
/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   by the time it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   while they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one cpu and unhashing is made by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so it does not have this race condition. UNIX sockets
 *   use a separate SMP lock, so they are protected as well.
 */
/* Ungrab socket and destroy it, if it was the last reference. */
static inline void sock_put(struct sock *sk)
{
	if (refcount_dec_and_test(&sk->sk_refcnt))
		sk_free(sk);
}

/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...)
 */
void sock_gen_put(struct sock *sk);
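/*
 * Illustrative sketch of the postulates above: a caller that already
 * found the socket through a protected lookup takes its own reference
 * with sock_hold(), works on the socket, then drops it with sock_put().
 * The function name is hypothetical.
 */
#if 0	/* usage example only */
static void example_use_socket(struct sock *sk)
{
	sock_hold(sk);	/* valid: sk is already referenced by the lookup */
	/* ... use the socket ... */
	sock_put(sk);	/* may free sk if this was the last reference */
}
#endif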
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
	/* sk_tx_queue_mapping accepts only up to a 16-bit value */
	if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
		return;

	/* Paired with READ_ONCE() in sk_tx_queue_get() and
	 * other WRITE_ONCE()s because the socket lock might not be held.
	 */
	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}
#define NO_QUEUE_MAPPING USHRT_MAX
static inline void sk_tx_queue_clear(struct sock *sk)
{
	/* Paired with READ_ONCE() in sk_tx_queue_get() and
	 * other WRITE_ONCE()s because the socket lock might not be held.
	 */
	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}
static inline int sk_tx_queue_get(const struct sock *sk)
{
	if (sk) {
		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
		 * and sk_tx_queue_set().
		 */
		int val = READ_ONCE(sk->sk_tx_queue_mapping);

		if (val != NO_QUEUE_MAPPING)
			return val;
	}
	return -1;
}
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
	return &rcu_dereference_raw(sk->sk_wq)->wait;
}

/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 */
static inline void sock_orphan(struct sock *sk)
{
	write_lock_bh(&sk->sk_callback_lock);
	sock_set_flag(sk, SOCK_DEAD);
	sk_set_socket(sk, NULL);
	sk->sk_wq = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

#endif	/* _SOCK_H */