// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *	Johann Baudy		:	Added TX RING.
 *	Chetan Loke		:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */
/*
Assumptions:
- If the device has no dev->header_ops->create, there is no LL header
  visible above the device. In this case, its hard_header_len should be 0.
  The device may prepend its own header internally. In this case, its
  needed_headroom should be set to the space needed for it to add its
  internal header.
  For example, a WiFi driver pretending to be an Ethernet driver should
  set its hard_header_len to be the Ethernet header length, and set its
  needed_headroom to be (the real WiFi header length - the fake Ethernet
  header length).
- A packet socket receives packets with the ll header pulled,
  so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Resume
  If dev_has_header(dev) == false we are unable to restore the ll header,
    because it is invisible to us.
 */
/*
 * Address data that packet sockets keep in the skb control block
 * (accessed through PACKET_SKB_CB(skb)->sa).
 *
 * 'pkt' is the view used on the SOCK_PACKET path; 'll' and 'origlen'
 * share storage and are used by the other socket types.
 */
struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};
/* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()).
*/ staticvoid __register_prot_hook(struct sock *sk)
{ struct packet_sock *po = pkt_sk(sk);
if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout)
__fanout_link(sk, po); else
dev_add_pack(&po->prot_hook);
/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 *
 * Must be called with po->bind_lock held (asserted below).  The lock is
 * held again on return, even when it was dropped for the sync.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	/* Clear the running flag before the hook is unlinked. */
	packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);

	/* A socket in a fanout group is unlinked from the group instead
	 * of removing its own prot_hook.
	 */
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		/* synchronize_net() is called with bind_lock released. */
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}
if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0;
h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds.
*/ switch (po->tp_version) { case TPACKET_V1:
h.h1->tp_sec = ts.tv_sec;
h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2:
h.h2->tp_sec = ts.tv_sec;
h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3:
h.h3->tp_sec = ts.tv_sec;
h.h3->tp_nsec = ts.tv_nsec; break; default:
WARN(1, "TPACKET version not supported.\n");
BUG();
}
/* one flush is safe, as both fields always lie on the same cacheline */
flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
smp_wmb();
return ts_status;
}
staticvoid *packet_lookup_frame(conststruct packet_sock *po, conststruct packet_ring_buffer *rb, unsignedint position, int status)
{ unsignedint pg_vec_pos, frame_offset; union tpacket_uhdr h;
pg_vec_pos = position / rb->frames_per_block;
frame_offset = position % rb->frames_per_block;
/* In the SOCK_DGRAM scenario, skb data starts at the network * protocol, which is after the VLAN headers. The outer VLAN * header is at the hard_header_len offset in non-variable * length link layer headers. If it's a VLAN device, the * min_header_len should be used to exclude the VLAN header * size.
*/ if (dev->min_header_len == dev->hard_header_len)
header_len = dev->hard_header_len; elseif (is_vlan_dev(dev))
header_len = dev->min_header_len; else return 0;
staticint prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes)
{ struct net_device *dev; unsignedint mbits, div; struct ethtool_link_ksettings ecmd; int err;
rtnl_lock();
dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) {
rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV;
}
err = __ethtool_get_link_ksettings(dev, &ecmd);
rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV;
/* If the link speed is so slow you don't really * need to worry about perf anyways
*/ if (ecmd.base.speed < SPEED_1000 ||
ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV;
/* Re-arm the retire-block timer so that it fires pkc->tov_in_jiffies
 * from now, then remember the currently active block number.
 *
 * Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
		  jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
*/ staticvoid prb_retire_rx_blk_timer_expired(struct timer_list *t)
{ struct packet_sock *po =
timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsignedint frozen; struct tpacket_block_desc *pbd;
/* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. *
*/ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */
write_lock(&pkc->blk_fill_in_prog_lock);
write_unlock(&pkc->blk_fill_in_prog_lock);
}
if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { if (!BLOCK_NUM_PKTS(pbd)) { /* An empty block. Just refresh the timer. */ goto refresh_timer;
}
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out;
} else { /* Case 1. Queue was frozen because user-space was * lagging behind.
*/ if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer.
*/ goto refresh_timer;
} else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect.
*/
prb_open_block(pkc, pbd); goto out;
}
}
}
/* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened.
*/ staticvoid prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsignedint stat)
{
__u32 status = TP_STATUS_USER | stat;
/* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) {
h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
} else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired().
*/ struct timespec64 ts;
ktime_get_real_ts64(&ts);
h1->ts_last_pkt.ts_sec = ts.tv_sec;
h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
}
smp_wmb();
/* Flush the block */
prb_flush_block(pkc1, pbd1, status);
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 *
 * This helper only records the freeze: it sets reset_pending_on_curr_blk
 * and bumps the tp_freeze_q_cnt statistic.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			     struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 *
 * Returns pkc->nxt_offset (as a pointer) after opening the block, or
 * NULL when the block still carries TP_STATUS_USER and the queue was
 * frozen instead.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
				     struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}
/* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case.
*/ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... */
write_lock(&pkc->blk_fill_in_prog_lock);
write_unlock(&pkc->blk_fill_in_prog_lock);
}
prb_close_block(pkc, pbd, po, status); return;
}
}
/* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space.
*/ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL;
} else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect.
*/
prb_open_block(pkc, pbd);
}
}
if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
ret = ROOM_NORMAL; elseif (__tpacket_v3_has_room(po, 0))
ret = ROOM_LOW;
} else { if (__tpacket_has_room(po, ROOM_POW_OFF))
ret = ROOM_NORMAL; elseif (__tpacket_has_room(po, 0))
ret = ROOM_LOW;
}
switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL;
}
if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) {
err = -EINVAL; goto out;
} if (!fanout_find_new_id(sk, &id)) {
err = -ENOMEM; goto out;
} /* ephemeral flag for the first socket in the group: drop it */
flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
}
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net())
*/ staticstruct packet_fanout *fanout_release(struct sock *sk)
{ struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f;
mutex_lock(&fanout_mutex);
f = po->fanout; if (f) {
po->fanout = NULL;
if (refcount_dec_and_test(&f->sk_ref))
list_del(&f->list); else
f = NULL;
}
mutex_unlock(&fanout_mutex);
return f;
}
staticbool packet_extra_vlan_len_allowed(conststruct net_device *dev, struct sk_buff *skb)
{ /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices.
*/ if (unlikely(dev->type != ARPHRD_ETHER)) returnfalse;
/* * When we registered the protocol we saved the socket in the data * field for just this event.
*/
sk = pt->af_packet_priv;
/* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop.
*/
if (skb->pkt_type == PACKET_LOOPBACK) goto out;
if (!net_eq(dev_net(dev), sock_net(sk))) goto out;
skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom;
/* drop any routing info */
skb_dst_drop(skb);
/* drop conntrack reference */
nf_reset_ct(skb);
spkt = &PACKET_SKB_CB(skb)->sa.pkt;
skb_push(skb, skb->data - skb_mac_header(skb));
/* * The SOCK_PACKET socket receives _all_ frames.
*/
/* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) &&
eth_type_vlan(skb->protocol) &&
vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
skb_set_network_header(skb, depth);
skb_probe_transport_header(skb);
}
/* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame
*/
if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
proto = saddr->spkt_protocol;
} else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
/* * Find the device first to size check it
*/
saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
rcu_read_lock();
dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
err = -ENODEV; if (dev == NULL) goto out_unlock;
err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock;
/* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level.
*/
if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) {
err = -EPROTONOSUPPORT; goto out_unlock;
}
extra_len = 4; /* We're doing our own CRC */
}
if (!skb) {
size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsignedint hhlen = dev->header_ops ? dev->hard_header_len : 0;
rcu_read_unlock();
skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level.
*/
skb_reserve(skb, reserved);
skb_reset_network_header(skb);
/* Try to align data part correctly */ if (hhlen) {
skb->data -= hhlen;
skb->tail -= hhlen; if (len < hhlen)
skb_reset_network_header(skb);
}
err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry;
}
/*
 * This function clones the skb lazily, in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on
 * exit, we will not harm anyone.
 */
if (!net_eq(dev_net(dev), sock_net(sk))) goto drop;
skb->dev = dev;
if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user.
*/ if (sk->sk_type != SOCK_DGRAM)
skb_push(skb, skb->data - skb_mac_header(skb)); elseif (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */
skb_pull(skb, skb_network_offset(skb));
}
}
/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length.
*/
PACKET_SKB_CB(skb)->sa.origlen = skb->len;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
*/
BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
if (skb->pkt_type == PACKET_LOOPBACK) goto drop;
sk = pt->af_packet_priv;
po = pkt_sk(sk);
if (!net_eq(dev_net(dev), sock_net(sk))) goto drop;
if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM)
skb_push(skb, skb->data - skb_mac_header(skb)); elseif (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */
skb_pull(skb, skb_network_offset(skb));
}
}
res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore;
/* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
atomic_inc(&po->tp_drops); goto drop_n_restore;
}
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY; elseif (skb->pkt_type != PACKET_OUTGOING &&
skb_csum_unnecessary(skb))
status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
status |= TP_STATUS_GSO_TCP;
if (po->tp_version <= TPACKET_V2) {
packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level.
*/ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
}
po->stats.stats1.tp_packets++; if (copy_skb) {
status |= TP_STATUS_COPY;
skb_clear_delivery_time(copy_skb);
__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
}
spin_unlock(&sk->sk_receive_queue.lock);
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
/* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture.
*/
ts_status = tpacket_get_timestamp(skb, &ts,
READ_ONCE(po->tp_tstamp) |
SOF_TIMESTAMPING_SOFTWARE); if (!ts_status)
ktime_get_real_ts64(&ts);
if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL;
return 0;
}
/*
 * Pull the virtio-net header off the front of an outgoing message.
 *
 * @msg:         message whose iterator currently points at the header
 * @len:         in: total payload length; out: length with vnet_hdr_sz
 *               subtracted
 * @vnet_hdr:    destination for the copied header
 * @vnet_hdr_sz: header size in use on this socket
 *
 * Returns 0 on success, -EINVAL if *len is smaller than vnet_hdr_sz,
 * -EFAULT if the header cannot be copied from the iterator, or the error
 * returned by __packet_snd_vnet_parse().
 */
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
				 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
	int ret;

	if (*len < vnet_hdr_sz)
		return -EINVAL;
	*len -= vnet_hdr_sz;

	/* Only sizeof(*vnet_hdr) bytes are copied, whatever vnet_hdr_sz is. */
	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
		return -EFAULT;

	ret = __packet_snd_vnet_parse(vnet_hdr, *len);
	if (ret)
		return ret;

	/* move iter to point to the start of mac header */
	if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
		iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));

	return 0;
}
staticint tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len,
__be16 proto, unsignedchar *addr, int hlen, int copylen, conststruct sockcm_cookie *sockc)
{ union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err;
/*
 * Read the length and data offset of a TX ring frame from its header.
 *
 * @po:       packet socket owning the ring (supplies tp_version, tp_hdrlen,
 *            tx_ring.frame_size and the PACKET_SOCK_TX_HAS_OFF flag)
 * @frame:    start of the frame in the ring
 * @size_max: largest tp_len accepted
 * @data:     out: set to frame + offset of the packet data
 *
 * Returns tp_len on success.  Returns -EINVAL for a TPACKET_V3 frame with
 * a non-zero tp_next_offset or for an offset outside the allowed window,
 * and -EMSGSIZE when tp_len exceeds size_max.
 */
static int tpacket_parse_header(struct packet_sock *po, void *frame,
				int size_max, void **data)
{
	union tpacket_uhdr ph;
	int tp_len, off;

	ph.raw = frame;

	switch (po->tp_version) {
	case TPACKET_V3:
		if (ph.h3->tp_next_offset != 0) {
			pr_warn_once("variable sized slot not supported");
			return -EINVAL;
		}
		tp_len = ph.h3->tp_len;
		break;
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
		int off_min, off_max;

		/* The offset comes from the frame header and must keep the
		 * data after the header area and inside the frame.
		 */
		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (po->sk.sk_type == SOCK_DGRAM) {
			/* SOCK_DGRAM: use the tp_net offset */
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_net;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			/* other socket types: use the tp_mac offset */
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_mac;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
	} else {
		/* Fixed layout: data follows the header area directly. */
		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}

	*data = frame + off;
	return tp_len;
}
staticint tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{ struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc;
__be16 proto; int err, reserve = 0; void *ph;
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsignedchar *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo;
mutex_lock(&po->pg_vec_lock);
/* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock.
*/ if (unlikely(!po->tx_ring.pg_vec)) {
err = -EBUSY; goto out;
} if (likely(saddr == NULL)) {
dev = packet_cached_dev_get(po);
proto = READ_ONCE(po->num);
} else {
err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen
+ offsetof(struct sockaddr_ll,
sll_addr))) goto out;
proto = saddr->sll_protocol;
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len +
offsetof(struct sockaddr_ll, sll_addr)) goto out_put;
addr = saddr->sll_addr;
}
}
err = -ENXIO; if (unlikely(dev == NULL)) goto out;
err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put;
sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) {
err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put;
}
do {
ph = packet_current_frame(po, &po->tx_ring,
TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { /* Note: packet_read_pending() might be slow if we * have to call it as it's per_cpu variable, but in * fast-path we don't have to call it, only when ph * is NULL, we need to check the pending_refcnt.
*/ if (need_wait && packet_read_pending(&po->tx_ring)) {
timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) {
err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put;
} /* check for additional frames */ continue;
} else break;
}
/* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely.
*/ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg);
return packet_snd(sock, msg, len);
}
/* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list.
*/
kfree(po->rollover); if (f) {
fanout_release_data(f);
kvfree(f);
} /* * Now the socket is dead. No more input will appear.
*/
sock_orphan(sk);
sock->sk = NULL;
lock_sock(sk);
spin_lock(&po->bind_lock); if (!proto)
proto = po->num;
rcu_read_lock();
if (po->fanout) {
ret = -EINVAL; goto out_unlock;
}
if (name) {
dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) {
ret = -ENODEV; goto out_unlock;
}
} elseif (ifindex) {
dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) {
ret = -ENODEV; goto out_unlock;
}
}
need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
if (need_rehook) {
dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook()
*/
WRITE_ONCE(po->num, 0);
__unregister_prot_hook(sk, true);
rcu_read_lock(); if (dev)
unlisted = !dev_get_by_index_rcu(sock_net(sk),
dev->ifindex);
}
if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated.
*/
memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
name[sizeof(uaddr->sa_data_min)] = 0;
staticint packet_create(struct net *net, struct socket *sock, int protocol, int kern)
{ struct sock *sk; struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */ int err;
if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT;
sock->state = SS_UNCONNECTED;
err = -ENOBUFS;
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out;
sock->ops = &packet_ops; if (sock->type == SOCK_PACKET)
sock->ops = &packet_ops_spkt;
po = pkt_sk(sk);
err = packet_alloc_pending(po); if (err) goto out_sk_free;
/*
 * Call the generic datagram receiver. This handles all sorts
 * of horrible races and re-entrancy so we can forget about it
 * in the protocol layers.
 *
 * Now it will return ENETDOWN if the device has just gone down,
 * but after that it will block.
 */
skb = skb_recv_datagram(sk, flags, &err);
/* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries.
*/
if (skb == NULL) goto out;
packet_rcv_try_clear_pressure(pkt_sk(sk));
if (vnet_hdr_len) {
err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free;
}
/* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway.
*/
copied = skb->len; if (copied > len) {
copied = len;
msg->msg_flags |= MSG_TRUNC;
}
err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free;
if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
/* Original length was stored in sockaddr_ll fields */
origlen = PACKET_SKB_CB(skb)->sa.origlen;
sll->sll_family = AF_PACKET;
sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
vlan_get_protocol_dgram(skb) : skb->protocol;
}
sock_recv_cmsgs(msg, sk, skb);
if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len;
/* If the address length field is there to be filled * in, we fill it in now.
*/ if (sock->type == SOCK_PACKET) {
__sockaddr_check_size(sizeof(struct sockaddr_pkt));
msg->msg_namelen = sizeof(struct sockaddr_pkt);
copy_len = msg->msg_namelen;
} else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
/* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us.
*/
err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT;
return fanout_add(sk, &args);
} case PACKET_FANOUT_DATA:
{ /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL;
return fanout_set_data(po, optval, optlen);
} case PACKET_IGNORE_OUTGOING:
{ int val;
if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL;
WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0;
} case PACKET_TX_HAS_OFF:
{ unsignedint val;
if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT;
lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);
release_sock(sk); return 0;
} case PACKET_QDISC_BYPASS:
{ int val;
if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT;
break; case PACKET_AUXDATA:
val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV:
val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR:
val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ:
val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_COPY_THRESH:
val = READ_ONCE(pkt_sk(sk)->copy_thresh); break; case PACKET_VERSION:
val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int))
len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1:
val = sizeof(struct tpacket_hdr); break; case TPACKET_V2:
val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3:
val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL;
} break; case PACKET_RESERVE:
val = po->tp_reserve; break; case PACKET_LOSS:
val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP:
val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT:
val = (po->fanout ?
((u32)po->fanout->id |
((u32)po->fanout->type << 16) |
((u32)po->fanout->flags << 24)) :
0); break; case PACKET_IGNORE_OUTGOING:
val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL;
rstats.tp_all = atomic_long_read(&po->rollover->num);
rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
data = &rstats;
lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF:
val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS:
val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT;
}
if (len > lv)
len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0;
}
switch (cmd) { case SIOCOUTQ:
{ int amount = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg);
} case SIOCINQ:
{ struct sk_buff *skb; int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue); if (skb)
amount = skb->len;
spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg);
} #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif
for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer))
vfree(pg_vec[i].buffer); else
free_pages((unsignedlong)pg_vec[i].buffer,
order);
pg_vec[i].buffer = NULL;
}
}
kfree(pg_vec);
}
spin_lock(&po->bind_lock);
WRITE_ONCE(po->num, num); if (was_running)
register_prot_hook(sk);
spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring)
prb_shutdown_retire_blk_timer(po, rb_queue);
}
/* Module entry and exit points. */
module_init(packet_init);
module_exit(packet_exit);
/* Module metadata: description, license and the PF_PACKET protocol alias. */
MODULE_DESCRIPTION("Packet socket support (AF_PACKET)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);
Messung V0.5 in Prozent
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.72Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-28)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.