// SPDX-License-Identifier: GPL-2.0-or-later /* * Routines having to do with the 'struct sk_buff' memory handlers. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * * Fixes: * Alan Cox : Fixed the worst of the load * balancer bugs. * Dave Platt : Interrupt stacking fix. * Richard Kooijman : Timestamp fixes. * Alan Cox : Changed buffer format. * Alan Cox : destructor hook for AF_UNIX etc. * Linus Torvalds : Better skb_clone. * Alan Cox : Added skb_copy. * Alan Cox : Added all the changed routines Linus * only put in the headers * Ray VanTassle : Fixed --skb->lock in free * Alan Cox : skb_copy copy arp field * Andi Kleen : slabified it. * Robert Olsson : Removed skb_head_pool * * NOTE: * The __skb_ routines should be called with interrupts * disabled, or you better be *real* sure that the operation is atomic * with respect to whatever list is being frobbed (e.g. via lock_sock() * or via disabling bottom half handlers, etc).
*/
/* * The functions in this file will not compile correctly with gcc 2.4.x
*/
/*
 * SKB_SMALL_HEAD_CACHE_SIZE is deliberately kept from being a power of
 * two. This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique size,
 * so heads coming from skb_small_head_cache can be told apart from
 * system slab heads by looking at their size (skb_end_offset()).
 *
 * When SKB_SMALL_HEAD_SIZE is itself a power of two it is padded by
 * L1_CACHE_BYTES; otherwise it is used unchanged.
 */
#define SKB_SMALL_HEAD_CACHE_SIZE					\
	(SKB_SMALL_HEAD_SIZE +						\
	 (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? L1_CACHE_BYTES : 0))
/*
 * kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long
 * as the netmem is a page: the first checks that bv_page and netmem sit
 * at the same offset, the second that they have the same size.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));
/** * drop_reasons_register_subsys - register another drop reason subsystem * @subsys: the subsystem to register, must not be the core * @list: the list of drop reasons within the subsystem, must point to * a statically initialized list
*/ void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, conststruct drop_reason_list *list)
{ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return;
/* must point to statically allocated memory, so INIT is OK */
RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
/** * drop_reasons_unregister_subsys - unregister a drop reason subsystem * @subsys: the subsystem to remove, must not be the core * * Note: This will synchronize_rcu() to ensure no users when it returns.
*/ void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
subsys >= ARRAY_SIZE(drop_reasons_by_subsys), "invalid subsystem %d\n", subsys)) return;
/** * skb_panic - private function for out-of-line support * @skb: buffer * @sz: size * @addr: address * @msg: skb_over_panic or skb_under_panic * * Out-of-line support for skb_put() and skb_push(). * Called via the wrapper skb_over_panic() or skb_under_panic(). * Keep out of line to prevent kernel bloat. * __builtin_return_address is not used because it is not always reliable.
*/ staticvoid skb_panic(struct sk_buff *skb, unsignedint sz, void *addr, constchar msg[])
{
pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
msg, addr, skb->len, sz, skb->head, skb->data,
(unsignedlong)skb->tail, (unsignedlong)skb->end,
skb->dev ? skb->dev->name : "");
BUG();
}
/** * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache * @skbs: pointer to an at least @n-sized array to fill with skb pointers * @n: number of entries to provide * * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes * the pointers into the provided array @skbs. If there are less entries * available, tries to replenish the cache and bulk-allocates the diff from * the MM layer if needed. * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are * ready for {,__}build_skb_around() and don't have any data buffers attached. * Must be called *only* from the BH context. * * Return: number of successfully allocated skbs (@n if no actual allocation * needed or kmem_cache_alloc_bulk() didn't fail).
*/
u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
u32 bulk, total = n;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (nc->skb_count >= n) goto get;
/* Not enough cached skbs. Try refilling the cache first */
bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
GFP_ATOMIC | __GFP_NOWARN, bulk,
&nc->skb_cache[nc->skb_count]); if (likely(nc->skb_count >= n)) goto get;
/* Still not enough. Bulk-allocate the missing part directly, zeroed */
n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
n - nc->skb_count, &skbs[nc->skb_count]); if (likely(nc->skb_count >= n)) goto get;
/* kmem_cache didn't allocate the number we need, limit the output */
total -= n - nc->skb_count;
n = nc->skb_count;
get: for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
/* Must find the allocation size (and grow it to match). */
*size = ksize(data); /* krealloc() will immediately return "data" when * "ksize(data)" is requested: it is the existing upper * bounds. As a result, GFP_ATOMIC will be ignored. Note * that this "new" pointer needs to be passed back to the * caller for use so the __alloc_size hinting will be * tracked correctly.
*/
resized = krealloc(data, *size, GFP_ATOMIC);
WARN_ON_ONCE(resized != data); return resized;
}
/* build_skb() variant which can operate on slab buffers. * Note that this should be used sparingly as slab buffers * cannot be combined efficiently by GRO!
*/ struct sk_buff *slab_build_skb(void *data)
{ struct sk_buff *skb; unsignedint size;
skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL;
/*
 * Attach the buffer @data of @frag_size bytes to @skb.
 * Caller must provide SKB that is memset cleared.
 *
 * A @frag_size of 0 triggers a one-time warning; in that case the buffer
 * and its size are obtained from __slab_build_skb() before the skb is
 * finalized.
 */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(data, &size);

	__finalize_skb_around(skb, data, size);
}
/** * __build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated from the page * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() * allocation is deprecated, and callers should use slab_build_skb() * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : * Before IO, driver allocates only data buffer where NIC put incoming frame * Driver should add room at head (NET_SKB_PAD) and * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) * After IO, driver calls build_skb(), to allocate sk_buff and populate it * before giving packet to stack. * RX rings only contains data buffers, not full skbs.
*/ struct sk_buff *__build_skb(void *data, unsignedint frag_size)
{ struct sk_buff *skb;
skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL;
/* build_skb() is wrapper over __build_skb(), that specifically * takes care of skb->head and skb->pfmemalloc
*/ struct sk_buff *build_skb(void *data, unsignedint frag_size)
{ struct sk_buff *skb = __build_skb(data, frag_size);
/** * build_skb_around - build a network buffer around provided skb * @skb: sk_buff provide by caller, must be memset cleared * @data: data buffer provided by caller * @frag_size: size of data
*/ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsignedint frag_size)
{ if (unlikely(!skb)) return NULL;
/** * __napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __build_skb() that uses NAPI percpu caches to obtain * skbuff_head instead of inplace allocation. * * Returns a new &sk_buff on success, %NULL on allocation failure.
*/ staticstruct sk_buff *__napi_build_skb(void *data, unsignedint frag_size)
{ struct sk_buff *skb;
skb = napi_skb_cache_get(); if (unlikely(!skb)) return NULL;
/** * napi_build_skb - build a network buffer * @data: data buffer provided by caller * @frag_size: size of data * * Version of __napi_build_skb() that takes care of skb->head_frag * and skb->pfmemalloc when the data is a page or page fragment. * * Returns a new &sk_buff on success, %NULL on allocation failure.
*/ struct sk_buff *napi_build_skb(void *data, unsignedint frag_size)
{ struct sk_buff *skb = __napi_build_skb(data, frag_size);
/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 *
 * @size:	in: requested size; out: size actually reserved for the head
 *		(SKB_SMALL_HEAD_CACHE_SIZE on the small-head path, the
 *		kmalloc_size_roundup() result otherwise)
 * @flags:	allocation flags
 * @node:	NUMA node to allocate on
 * @pfmemalloc:	optional (may be NULL); set to true when the returned
 *		object came from the second, reserve-enabled attempt
 *
 * Both paths first try with __GFP_NOMEMALLOC | __GFP_NOWARN added and only
 * retry with the caller's plain @flags when gfp_pfmemalloc_allowed(@flags)
 * permits it.
 *
 * Returns the allocated object, or NULL on failure.
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
				node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	/* The out-parameter is optional; callers may pass NULL. */
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}
/* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. *
*/
/** * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache * instead of head cache and allocate a cloned (child) skb. * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for * allocations in case the data is required for writeback * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of at least size bytes. The object has a reference count * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC.
*/ struct sk_buff *__alloc_skb(unsignedint size, gfp_t gfp_mask, int flags, int node)
{ struct kmem_cache *cache; struct sk_buff *skb; bool pfmemalloc;
u8 *data;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
likely(node == NUMA_NO_NODE || node == numa_mem_id()))
skb = napi_skb_cache_get(); else
skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); if (unlikely(!skb)) return NULL;
prefetchw(skb);
/* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned.
*/
data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation.
*/
prefetchw(data + SKB_WITH_OVERHEAD(size));
/* * Only clear those fields we need to clear, not those that we will * actually initialise below. Hence, don't put any more fields after * the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
__build_skb_around(skb, data, size);
skb->pfmemalloc = pfmemalloc;
if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones;
/** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory.
*/ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsignedint len,
gfp_t gfp_mask)
{ struct page_frag_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data;
len += NET_SKB_PAD;
/* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation.
*/ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success;
}
len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
/** * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used * only for NAPI Rx allocation. By doing this we can save several * CPU cycles by avoiding having to disable and re-enable IRQs. * * %NULL is returned if there is no free memory.
*/ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsignedint len)
{
gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; void *data;
DEBUG_NET_WARN_ON_ONCE(!in_softirq());
len += NET_SKB_PAD + NET_IP_ALIGN;
/* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation.
*/ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
NUMA_NO_NODE); if (!skb) goto skb_fail; goto skb_success;
}
len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsignedint truesize)
{
DEBUG_NET_WARN_ON_ONCE(size > truesize);
/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb:	page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 *
 * Return: 0 on success, -EINVAL when skb->pp_recycle is not set.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	netmem_ref head_netmem;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		/* References are taken on the compound head of each frag. */
		head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
		if (likely(netmem_is_pp(head_netmem)))
			page_pool_ref_netmem(head_netmem);
		else
			page_ref_inc(netmem_to_page(head_netmem));
	}
	return 0;
}
if (skb_zcopy(skb)) { bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
skb_zcopy_clear(skb, true); if (skip_unref) goto free_head;
}
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
free_head: if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling
*/
skb->pp_recycle = 0;
}
/* * Free an skbuff by memory without cleaning the state.
*/ staticvoid kfree_skbmem(struct sk_buff *skb)
{ struct sk_buff_fclones *fclones;
switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(net_hotdata.skbuff_cache, skb); return;
case SKB_FCLONE_ORIG:
fclones = container_of(skb, struct sk_buff_fclones, skb1);
/* We usually free the clone (TX completion) before original skb * This test would have no chance to be true for the clone, * while here, branch prediction will be good.
*/ if (refcount_read(&fclones->fclone_ref) == 1) goto fastpath; break;
/*
 * Free everything but the sk_buff shell.
 *
 * The head state is always released; the data is released with @reason
 * only when skb->head is set.
 */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}
/** * __kfree_skb - private function * @skb: buffer * * Free an sk_buff. Release anything attached to the buffer. * Clean the state. This is an internal helper function. Users should * always call kfree_skb
*/
/**
 *	sk_skb_reason_drop - free an sk_buff with special reason
 *	@sk: the socket to receive @skb, or NULL if not applicable
 *	@skb: buffer to free
 *	@reason: reason why this skb is dropped
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero. Meanwhile, pass the receiving socket and drop reason to
 *	'kfree_skb' tracepoint.
 */
void __fix_address
sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
		   enum skb_drop_reason reason)
{
	/* The helper decides whether the buffer has to be freed here. */
	if (!__sk_skb_reason_drop(sk, skb, reason))
		return;

	__kfree_skb(skb);
}
EXPORT_SYMBOL(sk_skb_reason_drop);
staticvoid kfree_skb_add_bulk(struct sk_buff *skb, struct skb_free_array *sa, enum skb_drop_reason reason)
{ /* if SKB is a clone, don't handle this case */ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
__kfree_skb(skb); return;
}
/**
 *	skb_tx_error - report an sk_buff xmit error
 *	@skb: buffer that triggered an error
 *
 *	Report xmit error if a device callback is tracking this skb.
 *	skb must be freed afterwards.
 *
 *	A NULL @skb is accepted and ignored.
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (!skb)
		return;

	skb_zcopy_downgrade_managed(skb);
	skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);
#ifdef CONFIG_TRACEPOINTS /** * consume_skb - free an skbuff * @skb: buffer to free * * Drop a ref to the buffer and free it if the usage count has hit zero * Functions identically to kfree_skb, but kfree_skb assumes that the frame * is being dropped after a failure and notes that
*/ void consume_skb(struct sk_buff *skb)
{ if (!skb_unref(skb)) return;
/**
 *	__consume_stateless_skb - free an skbuff, assuming it is stateless
 *	@skb: buffer to free
 *
 *	Alike consume_skb(), but this variant assumes that this is the last
 *	skb reference and all the head states have been already dropped
 *
 *	Fires the consume_skb tracepoint with the caller's return address,
 *	releases the data with SKB_CONSUMED and then frees the skb memory.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb, __builtin_return_address(0));
	skb_release_data(skb, SKB_CONSUMED);
	kfree_skbmem(skb);
}
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
kasan_mempool_unpoison_object(nc->skb_cache[i],
kmem_cache_size(net_hotdata.skbuff_cache));
void napi_consume_skb(struct sk_buff *skb, int budget)
{ /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget)) {
dev_consume_skb_any(skb); return;
}
DEBUG_NET_WARN_ON_ONCE(!in_softirq());
if (!skb_unref(skb)) return;
/* if reaching here SKB is ready to free */
trace_consume_skb(skb, __builtin_return_address(0));
/* if SKB is a clone, don't handle this case */ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
__kfree_skb(skb); return;
}
/* Make sure a field is contained by headers group */ #define CHECK_SKB_FIELD(field) \
BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
offsetof(struct sk_buff, headers.field)); \
/* Note : this field could be in the headers group. * It is not yet because we do not want to have a 16 bit hole
*/
new->queue_mapping = old->queue_mapping;
/* * You should not add any new code to this function. Add it to * __copy_skb_header above instead.
*/ staticstruct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{ #define C(x) n->x = skb->x
/** * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg * @first: first sk_buff of the msg
*/ struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{ struct sk_buff *n;
n = alloc_skb(0, GFP_ATOMIC); if (!n) return NULL;
/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	Everything @dst currently holds is released (as SKB_CONSUMED)
 *	before @src is cloned into it.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	struct sk_buff *morphed;

	skb_release_all(dst, SKB_CONSUMED);
	morphed = __skb_clone(dst, src);

	return morphed;
}
EXPORT_SYMBOL_GPL(skb_morph);
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg, bool devmem)
{ if (uarg) { struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
/* there might be non MSG_ZEROCOPY users */ if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL;
/* realloc only when socket is locked (TCP, UDP cork), * so uarg->len and sk_zckey access is serialized
*/ if (!sock_owned_by_user(sk)) {
WARN_ON_ONCE(1); return NULL;
}
uarg_zc = uarg_to_msgzc(uarg);
bytelen = uarg_zc->bytelen + size; if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; return NULL;
}
next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { if (likely(!devmem) &&
mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
/* no extra ref when appending to datagram (MSG_MORE) */ if (sk->sk_type == SOCK_STREAM)
net_zcopy_get(uarg);
/* if !len, there was only 1 call, and it was aborted * so do not queue a completion notification
*/ if (!uarg->len || sock_flag(sk, SOCK_DEAD)) goto release;
len = uarg->len;
lo = uarg->id;
hi = uarg->id + len - 1;
is_zerocopy = uarg->zerocopy;
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg, struct net_devmem_dmabuf_binding *binding)
{ int err, orig_len = skb->len;
if (uarg->ops->link_skb) {
err = uarg->ops->link_skb(skb, uarg); if (err) return err;
} else { struct ubuf_info *orig_uarg = skb_zcopy(skb);
/* An skb can only point to one uarg. This edge case happens * when TCP appends to an skb, but zerocopy_realloc triggered * a new alloc.
*/ if (orig_uarg && uarg != orig_uarg) return -EEXIST;
}
void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{ int i;
skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
/*
 * Propagate the zerocopy state of @orig to @nskb.
 *
 * Nothing is done when @orig carries no zerocopy state. When @nskb
 * already has zerocopy state of its own:
 *  - a zero @gfp_mask is treated as a caller bug (warns once, -ENOMEM);
 *  - if both skbs already share the same uarg there is nothing to do;
 *  - otherwise the user frags of @nskb are copied with skb_copy_ubufs()
 *    before the uarg of @orig is attached.
 *
 * Returns 0 on success, -ENOMEM or -EIO on failure.
 */
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
	if (!skb_zcopy(orig))
		return 0;

	if (skb_zcopy(nskb)) {
		/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
		if (!gfp_mask) {
			WARN_ON_ONCE(1);
			return -ENOMEM;
		}
		if (skb_uarg(nskb) == skb_uarg(orig))
			return 0;
		if (skb_copy_ubufs(nskb, GFP_ATOMIC))
			return -EIO;
	}

	skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	return 0;
}
/** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. * It will copy all frags into kernel and drop the reference * to userspace pages. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC. * * Returns 0 on success or a negative error code on failure * to allocate kernel memory to copy to.
*/ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{ int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, order, psize, new_frags;
u32 d_off;
if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL;
if (!skb_frags_readable(skb)) return -EFAULT;
if (!num_frags) goto release;
/* We might have to allocate high order pages, so compute what minimum * page order is needed.
*/
order = 0; while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
order++;
psize = (PAGE_SIZE << order);
new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); for (i = 0; i < new_frags; i++) {
page = alloc_pages(gfp_mask | __GFP_COMP, order); if (!page) { while (head) { struct page *next = (struct page *)page_private(head);
put_page(head);
head = next;
} return -ENOMEM;
}
set_page_private(page, (unsignedlong)head);
head = page;
}
page = head;
d_off = 0; for (i = 0; i < num_frags; i++) {
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
u32 p_off, p_len, copied; struct page *p;
u8 *vaddr;
/** * skb_clone - duplicate an sk_buff * @skb: buffer to clone * @gfp_mask: allocation priority * * Duplicate an &sk_buff. The new one is not owned by a socket. Both * copies share the same packet data but not structure. The new * buffer has a reference count of 1. If the allocation fails the * function returns %NULL otherwise the new buffer is returned. * * If this function is called from an interrupt gfp_mask() must be * %GFP_ATOMIC.
*/
/*
 * Shift the header offsets stored in @skb by @off.
 *
 * csum_start is only shifted for CHECKSUM_PARTIAL, and mac_header only
 * when skb_mac_header_was_set() reports it as set; the remaining outer
 * and inner header offsets are always shifted.
 */
void skb_headers_offset_update(struct sk_buff *skb, int off)
{
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += off;

	/* {transport,network,mac}_header and tail are relative to skb->head */
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	skb->network_header += off;
	skb->transport_header += off;

	/* Inner (encapsulated) headers move by the same amount. */
	skb->inner_mac_header += off;
	skb->inner_network_header += off;
	skb->inner_transport_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);
/** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data. This is used when the * caller wishes to modify the data and needs a private copy of the * data to alter. Returns %NULL on failure or the pointer to the buffer * on success. The returned buffer has a reference count of 1. * * As by-product this function converts non-linear &sk_buff to linear * one, so that &sk_buff becomes completely private and caller is allowed * to modify all the data of returned buffer. This means that this * function is not recommended for use in circumstances when only * header is going to be modified. Use pskb_copy() instead.
*/
/** * __pskb_copy_fclone - create copy of an sk_buff with private head. * @skb: buffer to copy * @headroom: headroom of new skb * @gfp_mask: allocation priority * @fclone: if true allocate the copy of the skb from the fclone * cache instead of the head cache; it is recommended to set this * to true for the cases where the copy will likely be cloned * * Make a copy of both an &sk_buff and part of its data, located * in header. Fragmented data remain shared. This is used when * the caller wishes to modify only header of &sk_buff and needs * private copy of the header to alter. Returns %NULL on failure * or the pointer to the buffer on success. * The returned buffer has a reference count of 1.
*/
/* Set the data pointer */
skb_reserve(n, headroom); /* Set the tail pointer and length */
skb_put(n, skb_headlen(skb)); /* Copy the bytes */
skb_copy_from_linear_data(skb, n->data, n->len);
/** * pskb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @nhead: room to add at head * @ntail: room to add at tail * @gfp_mask: allocation priority * * Expands (or creates identical copy, if @nhead and @ntail are zero) * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * * All the pointers pointing into skb header may change and must be * reloaded after call to this function.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{ unsignedint osize = skb_end_offset(skb); unsignedint size = osize + nhead + ntail; long off;
u8 *data; int i;
BUG_ON(nhead < 0);
BUG_ON(skb_shared(skb));
skb_zcopy_downgrade_managed(skb);
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata;
size = SKB_WITH_OVERHEAD(size);
/* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void.
*/
memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
/* * if shinfo is shared we must drop the old head gracefully, but if it * is not we can just drop the old head and let the existing refcount * be since all we did is relocate the values
*/ if (skb_cloned(skb)) { if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; if (skb_zcopy(skb))
refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
/* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket).
*/ if (!skb->sk || skb->destructor == sock_edemux)
skb->truesize += size - osize;
/*
 * Unclone @skb through pskb_expand_head() while keeping skb->truesize and,
 * where possible, skb_end_offset() at the values they had on entry.
 *
 * Note: We plan to rework this in linux-6.4
 *
 * Returns 0 on success, or the error reported by pskb_expand_head().
 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
	unsigned int saved_end_offset, saved_truesize;
	struct skb_shared_info *shinfo;
	int res;

	/* Snapshot the values that must survive the reallocation; they are
	 * compared against and restored below.
	 */
	saved_end_offset = skb_end_offset(skb);
	saved_truesize = skb->truesize;

	res = pskb_expand_head(skb, 0, 0, pri);
	if (res)
		return res;

	skb->truesize = saved_truesize;

	if (likely(skb_end_offset(skb) == saved_end_offset))
		return 0;

	/* We can not change skb->end if the original or new value
	 * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
	 */
	if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM ||
	    skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) {
		/* We think this path should not be taken.
		 * Add a temporary trace to warn us just in case.
		 */
		pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
			    saved_end_offset, skb_end_offset(skb));
		WARN_ON_ONCE(1);
		return 0;
	}

	shinfo = skb_shinfo(skb);

	/* We are about to change back skb->end,
	 * we need to move skb_shinfo() to its new location.
	 */
	memmove(skb->head + saved_end_offset,
		shinfo,
		offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));

	skb_set_end_offset(skb, saved_end_offset);

	return 0;
}
/** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate * @headroom: needed headroom * * Unlike skb_realloc_headroom, this one does not allocate a new skb * if possible; copies skb->sk to new skb as needed * and frees original skb in case of failures. * * It expect increased headroom and generates warning otherwise.
*/
/** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head * @newtailroom: new free bytes at tail * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and its data and while doing so
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.26 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.