/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)	(0)
#endif
/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement its own version of
 * mm_zero_struct_page, it should wrap the defines below in an #ifndef and
 * define its own version of this macro in <asm/pgtable.h>.
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 96
 * or shrinks below 56. The idea is that the compiler optimizes out the
 * switch() statement and only leaves move/store instructions. Also, the
 * compiler can combine write statements if they are both assignments and can
 * be reordered; this can result in several of the writes here being dropped.
 */
#define mm_zero_struct_page(pp)	__mm_zero_struct_page(pp)
static inline void __mm_zero_struct_page(struct page *page)
{
	unsigned long *_pp = (void *)page;

	/* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
	BUILD_BUG_ON(sizeof(struct page) & 7);
	BUILD_BUG_ON(sizeof(struct page) < 56);
	BUILD_BUG_ON(sizeof(struct page) > 96);
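	/*
	 * Zero the structure word by word. This is the standard form of the
	 * helper, sketched here for the sizes allowed by the checks above:
	 * the compiler collapses the switch into straight-line stores.
	 */
	switch (sizeof(struct page)) {
	case 96:
		_pp[11] = 0;
		fallthrough;
	case 88:
		_pp[10] = 0;
		fallthrough;
	case 80:
		_pp[9] = 0;
		fallthrough;
	case 72:
		_pp[8] = 0;
		fallthrough;
	case 64:
		_pp[7] = 0;
		fallthrough;
	case 56:
		_pp[6] = 0;
		_pp[5] = 0;
		_pp[4] = 0;
		_pp[3] = 0;
		_pp[2] = 0;
		_pp[1] = 0;
		_pp[0] = 0;
	}
}
#else
#define mm_zero_struct_page(pp)	((void)memset((pp), 0, sizeof(struct page)))
#endif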
/*
 * Default maximum number of active map areas; this limits the number of vmas
 * per mm struct. Users can overwrite this number via sysctl, but there is a
 * problem.
 *
 * When a program's coredump is generated in ELF format, a section is created
 * per vma. In ELF, the number of sections is represented by an unsigned short.
 * This means the number of sections should be smaller than 65535 at coredump.
 * Because the kernel adds some informative sections to the program image when
 * generating a coredump, we need some margin. The number of extra sections is
 * currently 1-3 and depends on the arch. We use "5" as a safe margin here.
 *
 * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
 * is not a hard limit any more, although some userspace tools can be surprised
 * by that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN	(5)
#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr)	ALIGN(addr, PAGE_SIZE)

/* to align the pointer to the (prev) page boundary */
#define PAGE_ALIGN_DOWN(addr)	ALIGN_DOWN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
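/*
 * Worked example (illustrative, not from the upstream header): with 4 KiB
 * pages, PAGE_ALIGN(0x1234) == 0x2000, PAGE_ALIGN_DOWN(0x1234) == 0x1000,
 * and PAGE_ALIGNED(0x1234) is false while PAGE_ALIGNED(0x2000) is true.
 */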
/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE		0x00000000
#define VM_READ		0x00000001	/* currently active flags */
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
#define VM_MAYWRITE	0x00000020
#define VM_MAYEXEC	0x00000040
#define VM_MAYSHARE	0x00000080

#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
#ifdef CONFIG_MMU
#define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */
#else /* CONFIG_MMU */
#define VM_MAYOVERLAY	0x00000200	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
#define VM_UFFD_MISSING	0
#endif /* CONFIG_MMU */
#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */

#define VM_LOCKED	0x00002000
#define VM_IO		0x00004000	/* Memory mapped I/O or similar */

					/* Used by sys_madvise() */
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

#define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
#define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
#define VM_SYNC		0x00800000	/* Synchronous page faults */
#define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
#define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
#define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY	0
#endif

#define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE	BIT(31)		/* KSM may merge identical pages */

#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2	34	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3	35	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4	36	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5	37	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_6	38	/* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0	BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1	BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
#define VM_HIGH_ARCH_6	BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
 * core mm support.
 *
 * These VMAs will get a single end guard page. This helps userspace protect
 * itself from attacks. A single page is enough for current shadow stack archs
 * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
 * for more details on the guard size.
 */
# define VM_SHADOW_STACK	VM_HIGH_ARCH_5
#endif

#if defined(CONFIG_ARM64_GCS)
/*
 * arm64's Guarded Control Stack implements similar functionality and
 * has similar constraints to shadow stacks.
 */
# define VM_SHADOW_STACK	VM_HIGH_ARCH_6
#endif

/*
 * This flag is used to connect VFIO to arch specific KVM code. It
 * indicates that the memory under this VMA is safe for use with any
 * non-cachable memory type inside KVM. Some VFIO devices, on some
 * platforms, are thought to be unsafe and can cause machine crashes
 * if KVM does not lock down the memory type.
 */
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED_BIT	39
#define VM_ALLOW_ANY_UNCACHED		BIT(VM_ALLOW_ANY_UNCACHED_BIT)
#else
#define VM_ALLOW_ANY_UNCACHED		VM_NONE
#endif
/* Special vmas that are non-mergeable, non-mlock()able. */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED	(VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK	VM_NOHUGEPAGE

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR	VM_NONE
#endif
#define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */
/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers.
 */
#define FAULT_FLAG_DEFAULT	(FAULT_FLAG_ALLOW_RETRY | \
				 FAULT_FLAG_KILLABLE | \
				 FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
 * @flags: Fault flags.
 *
 * This is mostly used for places where we want to try to avoid taking
 * the mmap_lock for too long a time when waiting for another condition
 * to change, in which case we can try to be polite and release the
 * mmap_lock in the first round to avoid potential starvation of other
 * processes that would also want the mmap_lock.
 *
 * Return: true if the page fault allows retry and this is the first
 * attempt of the fault handling; false otherwise.
 */
static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
	return (flags & FAULT_FLAG_ALLOW_RETRY) &&
	    (!(flags & FAULT_FLAG_TRIED));
}
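/*
 * Illustrative sketch (not part of the upstream header), written as it would
 * appear in a caller's .c file: a fault path that would have to sleep can use
 * fault_flag_allow_retry_first() to decide to drop mmap_lock and ask for a
 * retry instead of blocking with the lock held. The function name is
 * hypothetical; the calls it makes are existing APIs.
 */
static inline vm_fault_t example_wait_or_retry(struct mm_struct *mm,
					       enum fault_flag flags)
{
	if (fault_flag_allow_retry_first(flags) &&
	    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
		/* Be polite: give up mmap_lock and request a retry. */
		mmap_read_unlock(mm);
		return VM_FAULT_RETRY;
	}
	/* Second attempt (or retry disallowed): keep the lock and wait here. */
	return 0;
}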
/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of virtual_address, if possible.
 */
struct vm_fault {
	const struct {
		struct vm_area_struct *vma;	/* Target VMA */
		gfp_t gfp_mask;			/* gfp mask to be used for allocations */
		pgoff_t pgoff;			/* Logical page offset based on vma */
		unsigned long address;		/* Faulting virtual address - masked */
		unsigned long real_address;	/* Faulting virtual address - unmasked */
	};
	enum fault_flag flags;		/* FAULT_FLAG_xxx flags
					 * XXX: should really be 'const' */
	pmd_t *pmd;			/* Pointer to pmd entry matching
					 * the 'address' */
	pud_t *pud;			/* Pointer to pud entry matching
					 * the 'address'
					 */
	union {
		pte_t orig_pte;		/* Value of PTE at the time of fault */
		pmd_t orig_pmd;		/* Value of PMD at the time of fault,
					 * used by PMD fault only.
					 */
	};
	struct page *cow_page;		/* Page handler may use for COW fault */
	struct page *page;		/* ->fault handlers should return a
					 * page here, unless VM_FAULT_NOPAGE
					 * is set (which is also implied by
					 * VM_FAULT_ERROR).
					 */
	/* These three entries are valid only while holding ptl lock */
	pte_t *pte;			/* Pointer to pte entry matching
					 * the 'address'. NULL if the page
					 * table hasn't been allocated.
					 */
	spinlock_t *ptl;		/* Page table lock.
					 * Protects pte page table if 'pte'
					 * is not NULL, otherwise pmd.
					 */
	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
					 * vm_ops->map_pages() sets up a page
					 * table from atomic context.
					 * do_fault_around() pre-allocates
					 * page table to avoid allocation from
					 * atomic context.
					 */
};
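/*
 * Illustrative sketch (not part of the upstream header), written as it would
 * appear in a driver's .c file: a minimal ->fault handler consuming struct
 * vm_fault. It looks up a backing page for vmf->pgoff, takes a reference and
 * hands it back via vmf->page. All names prefixed "example_" are hypothetical.
 */
struct example_buffer {
	struct page **pages;		/* hypothetical pre-allocated backing pages */
	unsigned long nr_pages;
};

static vm_fault_t example_vm_fault(struct vm_fault *vmf)
{
	struct example_buffer *buf = vmf->vma->vm_private_data;

	if (vmf->pgoff >= buf->nr_pages)
		return VM_FAULT_SIGBUS;

	/* The reference is handed over to the core MM along with vmf->page. */
	get_page(buf->pages[vmf->pgoff]);
	vmf->page = buf->pages[vmf->pgoff];
	return 0;
}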
/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	/**
	 * @close: Called when the VMA is being removed from the MM.
	 * Context: User context.  May sleep.  Caller holds mmap_lock.
	 */
	void (*close)(struct vm_area_struct * area);
	/* Called any time before splitting to check if it's allowed */
	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
	int (*mremap)(struct vm_area_struct *area);
	/*
	 * Called by mprotect() to make driver-specific permission
	 * checks before mprotect() is finalised. The VMA must not
	 * be modified.  Returns 0 if mprotect() can proceed.
	 */
	int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, unsigned long newflags);
	vm_fault_t (*fault)(struct vm_fault *vmf);
	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
	vm_fault_t (*map_pages)(struct vm_fault *vmf,
			pgoff_t start_pgoff, pgoff_t end_pgoff);
	unsigned long (*pagesize)(struct vm_area_struct * area);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs. See also generic_access_phys() for a generic
	 * implementation useful for any iomem mapping.
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);

	/* Called by the /proc/PID/maps code to ask the vma whether it
	 * has a special name.  Returning non-NULL will also cause this
	 * vma to be dumped unconditionally. */
	const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr, pgoff_t *ilx);
#endif
	/*
	 * Called by vm_normal_page() for special PTEs to find the
	 * page for @addr.  This is useful if the default behavior
	 * (using pte_page()) would not find the correct page.
	 */
	struct page *(*find_special_page)(struct vm_area_struct *vma,
					  unsigned long addr);
};
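/*
 * Illustrative sketch (not part of the upstream header), written as it would
 * appear in a driver's .c file (which also includes <linux/fs.h>): wiring the
 * hypothetical example_vm_fault() handler above into a vm_operations_struct
 * and installing it from the file's ->mmap().
 */
static const struct vm_operations_struct example_vm_ops = {
	.fault = example_vm_fault,
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_vm_ops;
	vma->vm_private_data = file->private_data;	/* our example_buffer */
	return 0;
}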
/*
 * These must be here rather than mmap_lock.h as dependent on vm_fault type,
 * declared in this header.
 */
#ifdef CONFIG_PER_VMA_LOCK
static inline void release_fault_lock(struct vm_fault *vmf)
{
	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
		vma_end_read(vmf->vma);
	else
		mmap_read_unlock(vmf->vma->vm_mm);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
static inline void vm_flags_init(struct vm_area_struct *vma,
				 vm_flags_t flags)
{
	ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Use when VMA is part of the VMA tree and modifications need coordination.
 * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
 * it should be locked explicitly beforehand.
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
				  vm_flags_t flags)
{
	vma_assert_write_locked(vma);
	vm_flags_init(vma, flags);
}

/*
 * Use only if VMA is not part of the VMA tree or has no other users and
 * therefore needs no locking.
 */
static inline void __vm_flags_mod(struct vm_area_struct *vma,
				  vm_flags_t set, vm_flags_t clear)
{
	vm_flags_init(vma, (vma->vm_flags | set) & ~clear);
}

/*
 * Use only when the order of set/clear operations is unimportant, otherwise
 * use vm_flags_{set|clear} explicitly.
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
				vm_flags_t set, vm_flags_t clear)
{
	vma_start_write(vma);
	__vm_flags_mod(vma, set, clear);
}
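/*
 * Illustrative sketch (not part of the upstream header), written as a
 * caller's .c file might use the helpers above: vm_flags_init() is for a
 * brand-new VMA nobody else can see yet, while vm_flags_mod() write-locks a
 * VMA that is already in the tree. The function name is hypothetical.
 */
static inline void example_mark_vma_locked(struct vm_area_struct *vma)
{
	/* VMA already linked into the tree: the helper takes the VMA write lock. */
	vm_flags_mod(vma, VM_LOCKED, VM_LOCKONFAULT);
}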
/*
 * Indicate if the VMA is a heap for the given task; for
 * /proc/PID/maps that is the heap of the main task.
 */
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
	return vma->vm_start < vma->vm_mm->brk &&
		vma->vm_end > vma->vm_mm->start_brk;
}

/*
 * Indicate if the VMA is a stack for the given task; for
 * /proc/PID/maps that is the stack of the main task.
 */
static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
{
	/*
	 * We make no effort to guess what a given thread considers to be
	 * its "stack". It's not even well-defined for programs written
	 * in languages like Go.
	 */
	return vma->vm_start <= vma->vm_mm->start_stack &&
	       vma->vm_end >= vma->vm_mm->start_stack;
}
static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
	/*
	 * Uses mas_find() to get the first VMA when the iterator starts.
	 * Calling mas_next() could skip the first entry.
	 */
	return mas_find(&vmi->mas, ULONG_MAX);
}

#define for_each_vma(__vmi, __vma)					\
	while (((__vma) = vma_next(&(__vmi))) != NULL)

/* The MM code likes to work with exclusive end addresses */
#define for_each_vma_range(__vmi, __vma, __end)				\
	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
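/*
 * Illustrative sketch (not part of the upstream header), written as it would
 * appear in a caller's .c file: walk every VMA of an mm under the read side
 * of mmap_lock and count how many are mlocked. The function name is
 * hypothetical.
 */
static inline unsigned long example_count_locked_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;
	unsigned long nr = 0;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_flags & VM_LOCKED)
			nr++;
	}
	mmap_read_unlock(mm);
	return nr;
}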
#ifdef CONFIG_SHMEM
/*
 * The vma_is_shmem is not inline because it is used only by slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
bool vma_is_anon_shmem(struct vm_area_struct *vma);
#else
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
#endif

int vma_is_stack_for_current(struct vm_area_struct *vma);

/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
/*
 * compound_order() can be called without holding a reference, which means
 * that niceties like page_folio() don't work.  These callers should be
 * prepared to handle wild return values.  For example, PG_head may be
 * set before the order is initialised, or this may be a tail page.
 * See compaction.c for some good examples.
 */
static inline unsigned int compound_order(struct page *page)
{
	struct folio *folio = (struct folio *)page;

	if (!test_bit(PG_head, &folio->flags))
		return 0;
	return folio_large_order(folio);
}

/**
 * folio_order - The allocation order of a folio.
 * @folio: The folio.
 *
 * A folio is composed of 2^order pages.  See get_order() for the definition
 * of order.
 *
 * Return: The order of the folio.
 */
static inline unsigned int folio_order(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return 0;
	return folio_large_order(folio);
}

/**
 * folio_reset_order - Reset the folio order and derived _nr_pages
 * @folio: The folio.
 *
 * Reset the order and derived _nr_pages to 0.  Must only be used in the
 * process of splitting large folios.
 */
static inline void folio_reset_order(struct folio *folio)
{
	if (WARN_ON_ONCE(!folio_test_large(folio)))
		return;
	folio->_flags_1 &= ~0xffUL;
#ifdef NR_PAGES_IN_LARGE_FOLIO
	folio->_nr_pages = 0;
#endif
}
#include <linux/huge_mm.h>
/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 */
static inline int put_page_testzero(struct page *page)
{
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
	return page_ref_dec_and_test(page);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 */
static inline bool get_page_unless_zero(struct page *page)
{
	return page_ref_add_unless(page, 1, 0);
}
int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
		      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#else
static inline bool is_vmalloc_addr(const void *x)
{
	return false;
}
static inline int is_vmalloc_or_module_addr(const void *x)
{
	return 0;
}
#endif
/*
 * How many times the entire folio is mapped as a single unit (eg by a
 * PMD or PUD entry).  This is probably not what you want, except for
 * debugging purposes or implementation of other core folio_*() primitives.
 */
static inline int folio_entire_mapcount(const struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1))
		return 0;
	return atomic_read(&folio->_entire_mapcount) + 1;
}
/**
 * folio_mapcount() - Number of mappings of this folio.
 * @folio: The folio.
 *
 * The folio mapcount corresponds to the number of present user page table
 * entries that reference any part of a folio. Each such present user page
 * table entry must be paired with exactly one folio reference.
 *
 * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
 * exactly once.
 *
 * For hugetlb folios, each abstracted "hugetlb" user page table entry that
 * references the entire folio counts exactly once, even when such special
 * page table entries are comprised of multiple ordinary page table entries.
 *
 * Will report 0 for pages which cannot be mapped into userspace, such as
 * slab, page tables and similar.
 *
 * Return: The number of times this folio is mapped.
 */
static inline int folio_mapcount(const struct folio *folio)
{
	int mapcount;

	if (likely(!folio_test_large(folio))) {
		mapcount = atomic_read(&folio->_mapcount) + 1;
		if (page_mapcount_is_type(mapcount))
			mapcount = 0;
		return mapcount;
	}
	return folio_large_mapcount(folio);
}
/**
 * folio_mapped - Is this folio mapped into userspace?
 * @folio: The folio.
 *
 * Return: True if any page in this folio is referenced by user page tables.
 */
static inline bool folio_mapped(const struct folio *folio)
{
	return folio_mapcount(folio) >= 1;
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any sub-page of compound page is mapped,
 * even if this particular sub-page is not itself mapped by any PTE or PMD.
 */
static inline bool page_mapped(const struct page *page)
{
	return folio_mapped(page_folio(page));
}

/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
	return PAGE_SIZE << compound_order(page);
}

/* Returns the number of bits needed for the number of bytes in a page */
static inline unsigned int page_shift(struct page *page)
{
	return PAGE_SHIFT + compound_order(page);
}
/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 */
static inline unsigned int thp_order(struct page *page)
{
	VM_BUG_ON_PGFLAGS(PageTail(page), page);
	return compound_order(page);
}

/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Return: Number of bytes in this page.
 */
static inline unsigned long thp_size(struct page *page)
{
	return PAGE_SIZE << thp_order(page);
}
#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we do always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte, vma);
	return pte;
}
/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A folio may belong to an inode's memory mapping. In this case, * folio->mapping points to the inode, and folio->index is the file * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory.
*/
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
	((unsigned int) folio_ref_count(folio) + 127u <= 127u)

/**
 * folio_get - Increment the reference count on a folio.
 * @folio: The folio.
 *
 * Context: May be called in any context, as long as you know that
 * you have a refcount on the folio.  If you do not already have one,
 * folio_try_get() may be the right interface for you to use.
 */
static inline void folio_get(struct folio *folio)
{
	VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
	folio_ref_inc(folio);
}

static inline void get_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return;
	if (WARN_ON_ONCE(folio_test_large_kmalloc(folio)))
		return;
	folio_get(folio);
}
/**
 * folio_put - Decrement the reference count on a folio.
 * @folio: The folio.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put() unless you can be sure that it wasn't the
 * last reference.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put(struct folio *folio)
{
	if (folio_put_testzero(folio))
		__folio_put(folio);
}

/**
 * folio_put_refs - Reduce the reference count on a folio.
 * @folio: The folio.
 * @refs: The amount to subtract from the folio's reference count.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put_refs() unless you can be sure that these weren't
 * the last references.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put_refs(struct folio *folio, int refs)
{
	if (folio_ref_sub_and_test(folio, refs))
		__folio_put(folio);
}

/*
 * union release_pages_arg - an array of pages or folios
 *
 * release_pages() releases a simple array of multiple pages, and
 * accepts various different forms of said page array: either
 * a regular old boring array of pages, an array of folios, or
 * an array of encoded page pointers.
 *
 * The transparent union syntax for this kind of "any of these
 * argument types" is all kinds of ugly, so look away.
 */
typedef union {
	struct page **pages;
	struct folio **folios;
	struct encoded_page **encoded_pages;
} release_pages_arg __attribute__ ((__transparent_union__));

void release_pages(release_pages_arg, int nr);

/**
 * folios_put - Decrement the reference count on an array of folios.
 * @folios: The folios.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need to
 * reinitialise it.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folios_put(struct folio_batch *folios)
{
	folios_put_refs(folios, NULL);
}
static inline void put_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	/* Slab and large-kmalloc folios are managed by the slab allocator. */
	if (folio_test_slab(folio) || folio_test_large_kmalloc(folio))
		return;
	folio_put(folio);
}
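/*
 * Illustrative sketch (not part of the upstream header), written as a
 * caller's .c file would pair the helpers above: take an extra reference
 * while a page is handed to another context, and drop it when done. The
 * function names are hypothetical.
 */
static inline struct folio *example_grab_folio(struct page *page)
{
	struct folio *folio = page_folio(page);

	/* Caller must already hold a reference on the page. */
	folio_get(folio);
	return folio;
}

static inline void example_release_folio(struct folio *folio)
{
	folio_put(folio);	/* may free the folio if this was the last reference */
}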
/* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries.
 */
#define GUP_PIN_COUNTING_BIAS (1U << 10)
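/*
 * Worked example (illustrative, not from the upstream header): a page with
 * three ordinary references and two pin_user_pages() pins has a refcount of
 * 3 + 2 * GUP_PIN_COUNTING_BIAS = 3 + 2048 = 2051. folio_maybe_dma_pinned()
 * below treats any refcount >= GUP_PIN_COUNTING_BIAS as "probably pinned".
 */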
#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
	/*
	 * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
	 * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
	 * a file mapping. R/O MAP_PRIVATE mappings might still modify
	 * underlying memory if ptrace is active, so this is only possible if
	 * ptrace does not apply. Note that there is no mprotect() to upgrade
	 * write permissions later.
	 */
	return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
}
#endif
/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

static inline u8 page_kasan_tag(const struct page *page)
{
	u8 tag = KASAN_TAG_KERNEL;

	if (kasan_enabled()) {
		tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
		tag ^= 0xff;
	}

	return tag;
}
/** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio.
 */
static inline unsigned long folio_pfn(const struct folio *folio)
{
	return page_to_pfn(&folio->page);
}
/** * folio_mk_pte - Create a PTE for this folio * @folio: The folio to create a PTE for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_ptes(). * * Return: A page table entry suitable for mapping this folio.
 */
static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
{
	return pfn_pte(folio_pfn(folio), pgprot);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/**
 * folio_mk_pmd - Create a PMD for this folio
 * @folio: The folio to create a PMD for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_pmd_at().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
{
	return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/**
 * folio_mk_pud - Create a PUD for this folio
 * @folio: The folio to create a PUD for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_pud_at().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot)
{
	return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_MMU */
/** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. * * For small folios, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". * False, if the folio is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
	if (folio_has_pincount(folio))
		return atomic_read(&folio->_pincount) > 0;

	/*
	 * folio_ref_count() is signed. If that refcount overflows, then
	 * folio_ref_count() returns a negative value, and callers will avoid
	 * further incrementing the refcount.
	 *
	 * Here, for that overflow case, use the sign bit to count a little
	 * bit higher via unsigned math, and thus still get an accurate result.
	 */
	return ((unsigned int)folio_ref_count(folio)) >=
		GUP_PIN_COUNTING_BIAS;
}
/*
 * This should most likely only be called during fork() to see whether we
 * should break the cow immediately for an anon page on the src mm.
 *
 * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
					   struct folio *folio)
{
	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
		return false;

	return folio_maybe_dma_pinned(folio);
}
/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
	return is_zero_pfn(page_to_pfn(page));
}

/**
 * is_zero_folio - Query if a folio is a zero page
 * @folio: The folio to query
 *
 * This returns true if @folio is one of the permanent zero pages.
 */
static inline bool is_zero_folio(const struct folio *folio)
{
	return is_zero_page(&folio->page);
}
/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */
#ifdef CONFIG_MIGRATION
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
#ifdef CONFIG_CMA
	int mt = folio_migratetype(folio);

	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
		return false;
#endif
	/* The zero page can be "pinned" but gets special handling. */
	if (is_zero_folio(folio))
		return true;

	/* Coherent device memory must always allow eviction. */
	if (folio_is_device_coherent(folio))
		return false;

	/*
	 * Filesystems can only tolerate transient delays to truncate and
	 * hole-punch operations
	 */
	if (folio_is_fsdax(folio))
		return false;

	/* Otherwise, non-movable zone folios can be pinned. */
	return !folio_is_zone_movable(folio);
}
#else
/* Without migration there are no movability constraints on pinning. */
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
	return true;
}
#endif

/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline long folio_nr_pages(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return 1;
	return folio_large_nr_pages(folio);
}

/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#define MAX_FOLIO_NR_PAGES	(1UL << PUD_ORDER)
#else
#define MAX_FOLIO_NR_PAGES	MAX_ORDER_NR_PAGES
#endif

/*
 * compound_nr() returns the number of pages in this potentially compound
 * page.  compound_nr() can be called on a tail page, and is defined to
 * return 1 in that case.
 */
static inline long compound_nr(struct page *page)
{
	struct folio *folio = (struct folio *)page;

	if (!test_bit(PG_head, &folio->flags))
		return 1;
	return folio_large_nr_pages(folio);
}
/** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next. Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
	return (struct folio *)folio_page(folio, folio_nr_pages(folio));
}
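/*
 * Illustrative sketch (not part of the upstream header), written as a
 * caller's .c file might walk physically contiguous memory folio by folio,
 * e.g. the memory backing a bio_vec. The function name is hypothetical and
 * the span is assumed to start at a folio boundary.
 */
static inline unsigned long example_span_nr_pages(struct folio *first,
						  size_t bytes)
{
	struct folio *folio = first;
	unsigned long nr = 0;
	size_t seen = 0;

	while (seen < bytes) {
		nr += folio_nr_pages(folio);
		seen += folio_size(folio);
		folio = folio_next(folio);	/* physically adjacent folio */
	}
	return nr;
}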
/** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio.
 */
static inline unsigned int folio_shift(const struct folio *folio)
{
	return PAGE_SHIFT + folio_order(folio);
}
/** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio.
 */
static inline size_t folio_size(const struct folio *folio)
{
	return PAGE_SIZE << folio_order(folio);
}
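/*
 * Worked example (illustrative, not from the upstream header): an order-2
 * folio on a 4 KiB-page system has folio_nr_pages() == 4,
 * folio_shift() == PAGE_SHIFT + 2 == 14, and folio_size() == 16 KiB.
 */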
/** * folio_maybe_mapped_shared - Whether the folio is mapped into the page * tables of more than one MM * @folio: The folio. * * This function checks if the folio maybe currently mapped into more than one * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly * indicate "mapped shared" (false positive) if a folio was mapped by * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). * * This function does not consider: * #. If the folio might get mapped in the (near) future (e.g., swapcache, * pagecache, temporary unmapping for migration). * #. If the folio is mapped differently (VM_PFNMAP). * #. If hugetlb page table sharing applies. Callers might want to check * hugetlb_pmd_shared(). * * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
	int mapcount = folio_mapcount(folio);

	/* Only partially-mappable folios require more care. */
	if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
		return mapcount > 1;

	/*
	 * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
	 * simply assume "mapped shared", nobody should really care
	 * about this for arbitrary kernel allocations.
	 */
	if (!IS_ENABLED(CONFIG_MM_ID))
		return true;

	/*
	 * A single mapping implies "mapped exclusively", even if the
	 * folio flag says something different: it's easier to handle this
	 * case here instead of on the RMAP hot path.
	 */
	if (mapcount <= 1)
		return false;
	return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
}