// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include "linux/filter.h"
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include "range_tree.h"
/*
 * bpf_arena is a sparsely populated shared memory region between bpf program and
 * user space process.
 *
 * For example on x86-64 the values could be:
 * user_vm_start 7f7d26200000     // picked by mmap()
 * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
 * For user space all pointers within the arena are normal 8-byte addresses.
 * In this example 7f7d26200000 is the address of the first page (pgoff=0).
 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
 * (u32)7f7d26200000 -> 26200000
 * hence
 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
 * kernel memory region.
 *
 * BPF JITs generate the following code to access arena:
 *   mov eax, eax  // eax has lower 32-bit of user pointer
 *   mov word ptr [rax + r12 + off], bx
 * where r12 == kern_vm_start and off is s16.
 * Hence allocate 4Gb + GUARD_SZ/2 on each side.
 *
 * Initially kernel vm_area and user vma are not populated.
 * User space can fault-in any address which will insert the page
 * into kernel and user vma.
 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
 * which will insert it into kernel vm_area.
 * The later fault-in from user space will populate that page into user vma.
 */
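/*
 * Illustrative sketch (not part of the arena implementation; the helper name
 * is made up): the user->kernel address translation described above.
 * With the example values: ffffc90001e69000 + (u32)7f7d26200000 ==
 * ffffc90001e69000 + 26200000 == ffffc90028069000.
 */
static inline u64 __maybe_unused arena_user_to_kern_sketch(u64 kern_vm_start, u64 user_ptr)
{
	/* keep only the lower 32 bits of the user pointer and rebase it
	 * onto the 4Gb kernel vm_area
	 */
	return kern_vm_start + (u32)user_ptr;
}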
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
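/*
 * For reference (assuming x86-64 with 4Kb pages): 'off' is an s16, so
 * 1ull << 16 == 64Kb, which is already a multiple of PAGE_SIZE << 1, hence
 * GUARD_SZ == 64Kb and KERN_VM_SZ == 4Gb + 64Kb.
 */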
pte = ptep_get(ptep);
if (!pte_present(pte)) /* sanity check */
	return 0;
page = pte_page(pte);
/*
 * We do not update pte here:
 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
 * 2. TLB flushing is batched or deferred. Even if we clear pte,
 * the TLB entries can stick around and continue to permit access to
 * the freed page. So it all relies on 1.
 */
__free_page(page);
return 0;
}
/*
 * Check that user vma-s are not around when bpf map is freed.
 * mmap() holds vm_file which holds bpf_map refcnt.
 * munmap() must have happened on vma followed by arena_vm_close()
 * which would clear arena->vma_list.
 */
if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
	return;
/*
 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
 * It unmaps everything from vmalloc area and clears pgtables.
 * Call apply_to_existing_page_range() first to find populated ptes and
 * free those pages.
 */
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
}
if (!refcount_dec_and_test(&vml->mmap_count))
	return;
guard(mutex)(&arena->lock);
/* update link list under lock */
list_del(&vml->head);
vma->vm_private_data = NULL;
kfree(vml);
}
guard(mutex)(&arena->lock);
page = vmalloc_to_page((void *)kaddr);
if (page)
	/* already have a page vmap-ed */
	goto out;
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
	/* User space requested to segfault when page is not allocated by bpf prog */
	return VM_FAULT_SIGSEGV;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
	return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
	range_tree_set(&arena->rt, vmf->pgoff, 1);
	return VM_FAULT_SIGSEGV;
}
if (pgoff)
	return -EINVAL;
if (len > SZ_4G)
	return -E2BIG;
/* if user_vm_start was specified at arena creation time */
if (arena->user_vm_start) {
	if (len > arena->user_vm_end - arena->user_vm_start)
		return -E2BIG;
	if (len != arena->user_vm_end - arena->user_vm_start)
		return -EINVAL;
	if (addr != arena->user_vm_start)
		return -EINVAL;
}
ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
if (IS_ERR_VALUE(ret))
	return ret;
if ((ret >> 32) == ((ret + len - 1) >> 32))
	return ret;
if (WARN_ON_ONCE(arena->user_vm_start))
	/* checks at map creation time should prevent this */
	return -EFAULT;
return round_up(ret, SZ_4G);
}
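/*
 * Example for the upper-32-bit check in the helper above (illustrative
 * numbers): ret == 7f7d26200000 and len == 100000 give the same upper 32 bits
 * (7f7d) for the first and last byte of the range, so the range does not
 * straddle a 4Gb boundary and ret can be used as-is. Otherwise the doubled
 * allocation leaves room to round ret up to the next 4Gb boundary.
 */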
guard(mutex)(&arena->lock);
if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
	/*
	 * If map_extra was not specified at arena creation time then
	 * 1st user process can do mmap(NULL, ...) to pick user_vm_start
	 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
	 * or
	 * specify addr in map_extra and
	 * use the same addr later with mmap(addr, MAP_FIXED..);
	 * (see the usage sketch after this function)
	 */
	return -EBUSY;
if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
	/* all user processes must have the same size of mmap-ed region */
	return -EBUSY;
/* Earlier checks should prevent this */
if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
	return -EFAULT;
if (remember_vma(arena, vma))
	return -ENOMEM;
arena->user_vm_start = vma->vm_start;
arena->user_vm_end = vma->vm_end;
/*
 * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
 * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
 * potential change of user_vm_start.
 */
vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_ops = &arena_vm_ops;
return 0;
}
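/*
 * Hypothetical user space usage sketch for the mmap rules above (not part of
 * this file; arena_fd and arena_sz are illustrative, and arena_sz must match
 * the arena size fixed at map creation time):
 *
 *	// 1st process: let the kernel pick user_vm_start
 *	void *base = mmap(NULL, arena_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, arena_fd, 0);
 *	// 2nd process: must map the same size at the same address
 *	void *same = mmap(base, arena_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED | MAP_FIXED, arena_fd, 0);
 */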
/* clear the lower 32 bits, keeping only the upper half of the address */
static u64 clear_lo32(u64 val)
{
	return val & ~(u64)~0U;
}
/*
 * Allocate pages and vmap them into kernel vmalloc area.
 * Later the pages will be mmaped into user space vma.
 */
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
{
	/* user_vm_end/start are fixed before bpf prog runs */
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
	struct page **pages;
	long pgoff = 0;
	u32 uaddr32;
	int ret, i;
if (page_cnt > page_cnt_max)
	return 0;
if (uaddr) {
	if (uaddr & ~PAGE_MASK)
		return 0;
	pgoff = compute_pgoff(arena, uaddr);
	if (pgoff > page_cnt_max - page_cnt)
		/* requested address will be outside of user VMA */
		return 0;
}
/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
if (!pages)
	return 0;
guard(mutex)(&arena->lock);
if (uaddr) {
	ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
	if (ret)
		goto out_free_pages;
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
} else {
	ret = pgoff = range_tree_find(&arena->rt, page_cnt);
	if (pgoff >= 0)
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
}
if (ret)
	goto out_free_pages;
ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
	goto out;
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
/* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
 * will not overflow 32-bit. Lower 32-bit need to represent
 * contiguous user address range.
 * Map these pages at kern_vm_start base.
 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
 * lower 32-bit and it's ok.
 */
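/* Worked example of the note above (illustrative values): uaddr32 ==
 * fff00000 and page_cnt == 2 keep uaddr32 + 2 * PAGE_SIZE - 1 == fff01fff
 * within 32 bits, while kern_vm_start + fff00000 + 2 * PAGE_SIZE may carry
 * past the low 32 bits of the kernel address, which is fine since the
 * mapping below uses full 64-bit kernel addresses.
 */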
ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
			kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
if (ret) {
	for (i = 0; i < page_cnt; i++)
		__free_page(pages[i]);
	goto out;
}
kvfree(pages);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
range_tree_set(&arena->rt, pgoff, page_cnt);
out_free_pages:
kvfree(pages);
return 0;
}
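/*
 * Example of the return value of arena_alloc_pages() above, using the layout
 * from the header comment (illustrative values): user_vm_start == 7f7d26200000
 * and pgoff == 0 give uaddr32 == 26200000, so the function returns
 * clear_lo32(7f7d26200000) + 26200000 == 7f7d26200000, i.e. a full 64-bit user
 * address whose lower 32 bits index into the 4Gb kernel vm_area.
 */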
/* Zap the given range from all user space vma-s that mmap this arena. */
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
	struct vma_list *vml;

	list_for_each_entry(vml, &arena->vma_list, head)
		zap_page_range_single(vml->vma, uaddr,
				      PAGE_SIZE * page_cnt, NULL);
}

/*
 * If page is present in vmalloc area, unmap it from vmalloc area,
 * unmap it from all user space vma-s,
 * and free it.
 */
if (page_cnt > 1)
	/* bulk zap if multiple pages being freed */
	zap_pages(arena, full_uaddr, page_cnt);

kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
	page = vmalloc_to_page((void *)kaddr);
	if (!page)
		continue;
	if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
		/* Optimization for the common case of page_cnt==1:
		 * If page wasn't mapped into some user vma there
		 * is no need to call zap_pages which is slow. When
		 * page_cnt is big it's faster to do the batched zap.
		 */
		zap_pages(arena, full_uaddr, 1);
vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
__free_page(page);
}
}
/*
 * Reserve an arena virtual address range without populating it. This call stops
 * bpf_arena_alloc_pages from adding pages to this range.
 */
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
	long pgoff;
	int ret;