// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */
/* Assumptions:
 * - If the device has no dev->header_ops->create, there is no LL header
 *   visible above the device. In this case, its hard_header_len should be 0.
 *   The device may prepend its own header internally. In this case, its
 *   needed_headroom should be set to the space needed for it to add its
 *   internal header.
 *   For example, a WiFi driver pretending to be an Ethernet driver should
 *   set its hard_header_len to be the Ethernet header length, and set its
 *   needed_headroom to be (the real WiFi header length - the fake Ethernet
 *   header length).
 * - packet socket receives packets with pulled ll header, so that SOCK_RAW
 *   should push it back.
 *
 * On receive:
 * -----------
 *
 * Incoming, dev_has_header(dev) == true
 *   mac_header -> ll header
 *   data       -> data
 *
 * Incoming, dev_has_header(dev) == false
 *   mac_header -> data
 *     However drivers often make it point to the ll header.
 *     This is incorrect because the ll header should be invisible to us.
 *   data       -> data
 *
 * Outgoing, dev_has_header(dev) == false
 *   mac_header -> data. ll header is invisible to us.
 *   data       -> data
 *
 * Resume
 *   If dev_has_header(dev) == false we are unable to restore the ll header,
 *   because it is invisible to us.
 */
struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};
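/* For reference, the accessor used throughout this file to reach the cb
 * overlay above; this macro does exist in af_packet.c.
 */
#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))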
/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
	}
}
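/* A minimal sketch of the register_prot_hook() wrapper referenced in the
 * comment above __register_prot_hook(): it only asserts that the caller
 * holds po->bind_lock before delegating.
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	__register_prot_hook(sk);
}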
/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec64 ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
		return 0;

	h.raw = frame;
	/*
	 * versions 1 through 3 overflow the timestamps in y2106, since they
	 * all store the seconds in a 32-bit unsigned integer.
	 * If we create a version 4, that should have a 64-bit timestamp,
	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
	 * nanoseconds.
	 */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
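/* Userspace counterpart (a minimal sketch, not part of this file): a
 * TPACKET_V2 ring consumer reading the tp_sec/tp_nsec fields written above.
 */
#include <linux/if_packet.h>
#include <stdio.h>

static void print_frame_timestamp(void *frame)
{
	struct tpacket2_hdr *hdr = frame;

	/* the kernel flips TP_STATUS_USER once the frame (and its
	 * timestamp) is complete
	 */
	if (hdr->tp_status & TP_STATUS_USER)
		printf("frame received at %u.%09u\n",
		       hdr->tp_sec, hdr->tp_nsec);
}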
static void *packet_lookup_frame(const struct packet_sock *po,
				 const struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}
	/* In the SOCK_DGRAM scenario, skb data starts at the network
	 * protocol, which is after the VLAN headers. The outer VLAN
	 * header is at the hard_header_len offset in non-variable
	 * length link layer headers. If it's a VLAN device, the
	 * min_header_len should be used to exclude the VLAN header
	 * size.
	 */
	if (dev->min_header_len == dev->hard_header_len)
		header_len = dev->hard_header_len;
	else if (is_vlan_dev(dev))
		header_len = dev->min_header_len;
	else
		return 0;
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				   int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits, div;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (err)
		return DEFAULT_PRB_RETIRE_TOV;
	/* If the link speed is so slow you don't really
	 * need to worry about perf anyway
	 */
	if (ecmd.base.speed < SPEED_1000 ||
	    ecmd.base.speed == SPEED_UNKNOWN)
		return DEFAULT_PRB_RETIRE_TOV;
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
		  jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		/* Waiting for skb_copy_bits to finish... */
		write_lock(&pkc->blk_fill_in_prog_lock);
		write_unlock(&pkc->blk_fill_in_prog_lock);
	}
	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/* Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue, restarts timer.
				 * Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
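/* A back-of-the-envelope check of the ~8 ms figure in the timer-logic
 * comment above, as a hedged sketch; blk_fill_time_msec() is a hypothetical
 * helper for illustration, not part of af_packet.c.
 */
static inline unsigned int blk_fill_time_msec(unsigned int blk_size_bytes,
					      unsigned int speed_mbps)
{
	/* bits to send / bits per msec: (1 MiB * 8) / (1000 * 1000)
	 * is ~8 ms at 1 Gbit/s, so a user 'tmo' of 10 ms never fires
	 * while such a block is still filling.
	 */
	return (blk_size_bytes * 8u) / (speed_mbps * 1000u);
}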
/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	 Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1,
			    struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;
	struct tpacket3_hdr *last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec64 ts;

		ktime_get_real_ts64(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			     struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
				     struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}
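/* Userspace counterpart (a minimal sketch, not part of this file):
 * configuring a TPACKET_V3 RX ring whose blocks are retired either by the
 * timer discussed above (tp_retire_blk_tov) or by filling up. The sizes
 * mirror the 8-block example in the queue-freeze comment; error handling
 * is reduced to returning -1.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int setup_v3_rx_ring(int fd)
{
	int ver = TPACKET_V3;
	struct tpacket_req3 req;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 20;	/* 1 MiB per block */
	req.tp_block_nr = 8;		/* blocks 0 .. 7, as in the comment */
	req.tp_frame_size = 2048;	/* upper bound per packet slot */
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) *
			  req.tp_block_nr;
	req.tp_retire_blk_tov = 10;	/* msec; 0 lets the kernel derive it */

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return -1;
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}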
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
				     struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			/* Waiting for skb_copy_bits to finish... */
			write_lock(&pkc->blk_fill_in_prog_lock);
			write_unlock(&pkc->blk_fill_in_prog_lock);
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}
	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}
	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}
	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
		break;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}
	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}
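/* Userspace counterpart (a minimal sketch, not part of this file): joining
 * a fanout group. The 16-bit group id sits in the low word of the argument;
 * the mode plus flags (e.g. PACKET_FANOUT_FLAG_UNIQUEID) sit in the high
 * word, matching the type/type_flags split parsed above.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int join_fanout_group(int fd, unsigned short id)
{
	int fanout_arg = id | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
			  &fanout_arg, sizeof(fanout_arg));
}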
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net()).
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (refcount_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;
	}
	mutex_unlock(&fanout_mutex);

	return f;
}
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}
	/*
	 * When we registered the protocol we saved the socket in the data
	 * field for just this event.
	 */
	sk = pt->af_packet_priv;
	/*
	 * Yank back the headers [hope the device set this
	 * right or kerboom...]
	 *
	 * Incoming packets have ll header pulled,
	 * push it back.
	 *
	 * For outgoing ones skb->data == skb_mac_header(skb)
	 * so that this procedure is a no-op.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset_ct(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));
	/*
	 * The SOCK_PACKET socket receives _all_ frames.
	 */
	/* Move network header to the right position for VLAN tagged packets */
	if (likely(skb->dev->type == ARPHRD_ETHER) &&
	    eth_type_vlan(skb->protocol) &&
	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

	skb_probe_transport_header(skb);
}
/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 */
	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 * Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;
	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */
	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}
	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();

		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}
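/* Userspace counterpart (a minimal sketch, not part of this file): sending
 * a complete frame through the legacy SOCK_PACKET path above. The device
 * name "eth0" and ETH_P_IP are example values only; the buffer must already
 * contain the full link-layer header.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t spkt_send_frame(int fd, const void *frame, size_t len)
{
	struct sockaddr_pkt spkt;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0",
		sizeof(spkt.spkt_device) - 1);
	spkt.spkt_protocol = htons(ETH_P_IP);

	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&spkt, sizeof(spkt));
}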
/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */
	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev_has_header(dev)) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}
	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
	 * Use their space for storing the original skb length.
	 */
	PACKET_SKB_CB(skb)->sa.origlen = skb->len;
	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them until current aligned size without forcing
	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
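/* Userspace counterpart (a minimal sketch, not part of this file): rather
 * than hardcoding the sizes checked above, userspace queries the header
 * length for a given TPACKET version via getsockopt(PACKET_HDRLEN).
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int tpacket_hdrlen(int fd, int version)
{
	int val = version;	/* in: TPACKET_V{1,2,3}; out: header length */
	socklen_t len = sizeof(val);

	if (getsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &val, &len) < 0)
		return -1;
	return val;
}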
	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev_has_header(dev)) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}
	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;

	/* If we are flooded, just give up */
	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
		atomic_inc(&po->tp_drops);
		goto drop_n_restore;
	}
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;
	else if (skb->pkt_type != PACKET_OUTGOING &&
		 skb_csum_unnecessary(skb))
		status |= TP_STATUS_CSUM_VALID;
	if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
		status |= TP_STATUS_GSO_TCP;
	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
		/*
		 * LOSING will be reported till you read the stats,
		 * because it's COR - Clear On Read.
		 * Anyways, moving it for V1/V2 only as V3 doesn't need this
		 * at packet level.
		 */
		if (atomic_read(&po->tp_drops))
			status |= TP_STATUS_LOSING;
	}
	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		skb_clear_delivery_time(copy_skb);
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
	/* Always timestamp; prefer an existing software timestamp taken
	 * closer to the time of capture.
	 */
	ts_status = tpacket_get_timestamp(skb, &ts,
					  READ_ONCE(po->tp_tstamp) |
					  SOF_TIMESTAMPING_SOFTWARE);
	if (!ts_status)
		ktime_get_real_ts64(&ts);

	end = (u8 *)PAGE_ALIGN((unsigned long)h.raw +
			       macoff + snaplen);