#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif
/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING	0UL
#endif
/*
 * This defines the first usable user address. Platforms
 * can override its value with a custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS	0UL
#endif
/*
 * This defines the generic helper for accessing the PMD page
 * table page. Platforms can still override this via their
 * respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif
/* Resolve a PMD entry to its folio: page_folio() applied to pmd_page(). */
#define pmd_folio(pmd) page_folio(pmd_page(pmd))
/*
 * A page table page can be thought of as an array like this:
 * pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address.
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of the
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */
#ifndef pgd_index
/*
 * Index of the PGD entry covering address @a.
 * Must be a compile-time constant, so implement it as a macro.
 */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif
/*
 * A shortcut to get a pgd_t in a given mm: looks up @address in the
 * mm's own pgd via pgd_offset_pgd().
 */
#ifndef pgd_offset
#define pgd_offset(mm, address)		pgd_offset_pgd((mm)->pgd, (address))
#endif
/*
 * A shortcut which implies the use of the kernel's pgd (init_mm),
 * instead of a process's.
 */
#define pgd_offset_k(address)		pgd_offset(&init_mm, (address))
/* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers.
*/ staticinline pmd_t *pmd_off(struct mm_struct *mm, unsignedlong va)
{ return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}
/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.
 *
 * In the general case, no lock is guaranteed to be held between entry and exit
 * of the lazy mode. So the implementation must assume preemption may be enabled
 * and cpu migration is possible; it must take steps to be robust against this.
 * (In practice, for user PTE updates, the appropriate page table lock(s) are
 * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
 * and the mode cannot be used in interrupt context.
 *
 * The generic fallbacks below are empty no-ops.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void) {}
static inline void arch_leave_lazy_mmu_mode(void) {}
static inline void arch_flush_lazy_mmu_mode(void) {}
#endif
#ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. Further, * the dirty state must apply atomically to all the PTEs described by the hint. * * May be overridden by the architecture, else pte_batch_hint is always 1.
*/ staticinlineunsignedint pte_batch_hint(pte_t *ptep, pte_t pte)
{ return 1;
} #endif
#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * When nr==1, initial state of pte may be present or not present, and new state * may be present or not present. When nr>1, initial state of all ptes must be * not present, and new state must be present. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD.
*/ staticinlinevoid set_ptes(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, pte_t pte, unsignedint nr)
{
page_table_check_ptes_set(mm, ptep, pte, nr);
#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE externint pmdp_clear_flush_young(struct vm_area_struct *vma, unsignedlong address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP
*/ staticinlineint pmdp_clear_flush_young(struct vm_area_struct *vma, unsignedlong address, pmd_t *pmdp)
{
BUILD_BUG(); return 0;
} #endif/* CONFIG_TRANSPARENT_HUGEPAGE */ #endif
#ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU.
*/ staticinlinebool arch_has_hw_nonleaf_pmd_young(void)
{ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
} #endif
#ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it.
*/ staticinlinebool arch_has_hw_pte_young(void)
{ return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
} #endif
#ifndef exec_folio_order
/*
 * Returns preferred minimum folio order for executable file-backed memory. Must
 * be in range [0, PMD_ORDER). Default to order-0.
 */
static inline unsigned int exec_folio_order(void)
{
	return 0;
}
#endif
#ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD.
*/ staticinlinevoid clear_young_dirty_ptes(struct vm_area_struct *vma, unsignedlong addr, pte_t *ptep, unsignedint nr, cydp_t flags)
{
pte_t pte;
for (;;) { if (flags == CYDP_CLEAR_YOUNG)
ptep_test_and_clear_young(vma, addr, ptep); else {
pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG)
pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY)
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, addr, ptep, pte);
} if (--nr == 0) break;
ptep++;
addr += PAGE_SIZE;
}
} #endif
pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently.
*/
page_table_check_pte_clear(mm, pte);
}
#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe.
*/ staticinline pte_t ptep_get_lockless(pte_t *ptep)
{
pte_t pte;
do {
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
} while (unlikely(pte.pte_low != ptep->pte_low));
/* * We require that the PTE can be read atomically.
*/ #ifndef ptep_get_lockless staticinline pte_t ptep_get_lockless(pte_t *ptep)
{ return ptep_get(ptep);
} #endif
#ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD.
*/ staticinline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsignedlong addr, pte_t *ptep, unsignedint nr, int full)
{
pte_t pte, tmp_pte;
/** * get_and_clear_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of get_and_clear_full_ptes() if it is known that we don't * need to clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD.
*/ staticinline pte_t get_and_clear_ptes(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, unsignedint nr)
{ return get_and_clear_full_ptes(mm, addr, ptep, nr, 0);
}
#ifndef clear_full_ptes
/**
 * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
 *		     folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_get_and_clear_full().
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, unsigned int nr, int full)
{
	for (;;) {
		ptep_get_and_clear_full(mm, addr, ptep, full);
		if (--nr == 0)
			break;
		ptep++;
		addr += PAGE_SIZE;
	}
}
#endif
/**
 * clear_ptes - Clear present PTEs that map consecutive pages of the same folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 *
 * Use this instead of clear_full_ptes() if it is known that we don't need to
 * clear the full mm, which is mostly the case.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, unsigned int nr)
{
	/* full == 0: not tearing down the whole mm. */
	clear_full_ptes(mm, addr, ptep, nr, 0);
}
/* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache.
*/ #ifndef update_mmu_tlb_range staticinlinevoid update_mmu_tlb_range(struct vm_area_struct *vma, unsignedlong address, pte_t *ptep, unsignedint nr)
{
} #endif
/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 *
 * The generic fallback ignores @full and simply calls pte_clear().
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
static inline void pte_clear_not_present_full(struct mm_struct *mm,
					      unsigned long address,
					      pte_t *ptep,
					      int full)
{
	pte_clear(mm, address, ptep);
}
#endif
#ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD.
*/ staticinlinevoid clear_not_present_full_ptes(struct mm_struct *mm, unsignedlong addr, pte_t *ptep, unsignedint nr, int full)
{ for (;;) {
pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break;
ptep++;
addr += PAGE_SIZE;
}
} #endif
#ifndef wrprotect_ptes
/**
 * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
 *		    folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to write-protect.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_set_wrprotect().
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, unsigned int nr)
{
	for (;;) {
		ptep_set_wrprotect(mm, addr, ptep);
		if (--nr == 0)
			break;
		ptep++;
		addr += PAGE_SIZE;
	}
}
#endif
/* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit.
*/ #ifndef pte_sw_mkyoung staticinline pte_t pte_sw_mkyoung(pte_t pte)
{ return pte;
} #define pte_sw_mkyoung pte_sw_mkyoung #endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is an implementation of pmdp_establish() that is only suitable for an
 * architecture that doesn't have hardware dirty/accessed bits. In this case we
 * can't race with CPU which sets these bits and non-atomic approach is fine.
 *
 * Reads the current value of *@pmdp, installs @pmd with set_pmd_at(), and
 * returns the value that was read.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
	pmd_t old_pmd = *pmdp;

	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
	return old_pmd;
}
#endif
/* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed.
*/ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsignedlong address, pmd_t *pmdp); #endif
#ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused.
*/ staticinlineint pte_unused(pte_t pte)
{ return 0;
} #endif
} #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in.
*/ staticinlinevoid arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsignedlong addr,
pte_t pte, pte_t oldpte, int nr)
{ for (int i = 0; i < nr; i++) {
arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE,
pte_advance_pfn(pte, i),
pte_advance_pfn(oldpte, i));
}
} #endif
#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 *
 * The generic fallback saves nothing and returns 0.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long addr,
				  pte_t orig_pte)
{
	return 0;
}
#endif
/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 *
 * The generic fallback does nothing and returns 0.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
static inline int arch_prepare_to_swap(struct folio *folio)
{
	return 0;
}
#endif
/* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
*/
/* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c.
*/ void pgd_clear_bad(pgd_t *);
#ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif
#ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif
void pmd_clear_bad(pmd_t *);
/*
 * Returns 1 when the PGD entry should be skipped: either it is none, or it
 * is bad (in which case pgd_clear_bad() is called first). Returns 0
 * otherwise.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
	if (pgd_none(*pgd))
		return 1;
	if (unlikely(pgd_bad(*pgd))) {
		pgd_clear_bad(pgd);
		return 1;
	}
	return 0;
}
/*
 * Returns 1 when the P4D entry should be skipped: either it is none, or it
 * is bad (in which case p4d_clear_bad() is called first). Returns 0
 * otherwise.
 */
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
	if (p4d_none(*p4d))
		return 1;
	if (unlikely(p4d_bad(*p4d))) {
		p4d_clear_bad(p4d);
		return 1;
	}
	return 0;
}
/*
 * Returns 1 when the PUD entry should be skipped: either it is none, or it
 * is bad (in which case pud_clear_bad() is called first). Returns 0
 * otherwise.
 */
static inline int pud_none_or_clear_bad(pud_t *pud)
{
	if (pud_none(*pud))
		return 1;
	if (unlikely(pud_bad(*pud))) {
		pud_clear_bad(pud);
		return 1;
	}
	return 0;
}
/*
 * Returns 1 when the PMD entry should be skipped: either it is none, or it
 * is bad (in which case pmd_clear_bad() is called first). Returns 0
 * otherwise.
 */
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
	if (pmd_none(*pmd))
		return 1;
	if (unlikely(pmd_bad(*pmd))) {
		pmd_clear_bad(pmd);
		return 1;
	}
	return 0;
}
staticinline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsignedlong addr,
pte_t *ptep)
{ /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it.
*/ return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
/*
 * Generic commit of a pte protection transaction: installs @pte with
 * set_pte_at().
 */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
					     unsigned long addr,
					     pte_t *ptep, pte_t pte)
{
	/*
	 * The pte is non-present, so there's no hardware state to
	 * preserve.
	 */
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}
#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
					   unsigned long addr,
					   pte_t *ptep)
{
	return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified. The pte returned from ptep_modify_prot_start() may
 * additionally have young and/or dirty bits set where previously they were not,
 * so the updated pte may have these additional changes.
 *
 * The generic version does not use @old_pte.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
					   unsigned long addr,
					   pte_t *ptep, pte_t old_pte, pte_t pte)
{
	__ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
/** * modify_prot_start_ptes - Start a pte protection read-modify-write transaction * over a batch of ptes, which protects against asynchronous hardware * modifications to the ptes. The intention is not to prevent the hardware from * making pte updates, but to prevent any updates it may make from being lost. * Please see the comment above ptep_modify_prot_start() for full description. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte * in the batch. * * Note that PTE bits in the PTE batch besides the PFN can differ. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. All other PTE bits must be identical for * all PTEs in the batch except for young and dirty bits. The PTEs are all in * the same PMD.
*/ #ifndef modify_prot_start_ptes staticinline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsignedlong addr, pte_t *ptep, unsignedint nr)
{
pte_t pte, tmp_pte;
/**
 * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any
 * hardware-controlled bits in the PTE unmodified.
 *
 * @vma: The virtual memory area the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @old_pte: Old page table entry (for the first entry) which is now cleared.
 * @pte: New page table entry to be set.
 * @nr: Number of entries.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_modify_prot_commit().
 *
 * Context: The caller holds the page table lock. The PTEs are all in the same
 * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by
 * ptep_modify_prot_start() may additionally have young and/or dirty bits set
 * where previously they were not, so the updated ptes may have these
 * additional changes.
 */
#ifndef modify_prot_commit_ptes
static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
		pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr)
{
	/* Unsigned to match @nr and avoid a signed/unsigned comparison. */
	unsigned int i;

	for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) {
		ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);

		/* Advance PFN only, set same prot */
		old_pte = pte_next_pfn(old_pte);
		pte = pte_next_pfn(pte);
	}
}
#endif
/*
 * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
 * and let generic vmalloc, ioremap and page table update code know when
 * arch_sync_kernel_mappings() needs to be called.
 */
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
#define ARCH_PAGE_TABLE_SYNC_MASK 0
#endif
/*
 * There is no default implementation for arch_sync_kernel_mappings(). It is
 * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
 * is 0.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
#endif/* CONFIG_MMU */
/* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined.
*/
/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 *
 * The generic fallback expands to an empty statement.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
#define arch_start_context_switch(prev)	do {} while (0)
#endif
#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 *
 * Without an architecture implementation these are no-ops: @prot is left
 * untouched and the int-returning helpers return 0. All three are defined
 * here so that callers (e.g. pfnmap_setup_cachemode_pfn()) resolve in
 * both configurations.
 */
static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
		pgprot_t *prot)
{
	return 0;
}

static inline int pfnmap_track(unsigned long pfn, unsigned long size,
		pgprot_t *prot)
{
	return 0;
}

static inline void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
}
#else
/**
 * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 * @prot: the pgprot to modify
 *
 * Lookup the cachemode for the pfn range starting at @pfn with the size
 * @size and store it in @prot, leaving other data in @prot unchanged.
 *
 * This allows for a hardware implementation to have fine-grained control of
 * memory cache behavior at page level granularity. Without a hardware
 * implementation, this function does nothing.
 *
 * Currently there is only one implementation for this - x86 Page Attribute
 * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
 *
 * This function can fail if the pfn range spans pfns that require differing
 * cachemodes. If the pfn range was previously verified to have a single
 * cachemode, it is sufficient to query only a single pfn. The assumption is
 * that this is the case for drivers using the vmf_insert_pfn*() interface.
 *
 * Returns 0 on success and -EINVAL on error.
 */
int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
		pgprot_t *prot);

/**
 * pfnmap_track - track a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 * @prot: the pgprot to track
 *
 * Requested the pfn range to be 'tracked' by a hardware implementation and
 * setup the cachemode in @prot similar to pfnmap_setup_cachemode().
 *
 * This allows for fine-grained control of memory cache behaviour at page
 * level granularity. Tracking memory this way is persisted across VMA splits
 * (VMA merging does not apply for VM_PFNMAP).
 *
 * Currently, there is only one implementation for this - x86 Page Attribute
 * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
 *
 * Returns 0 on success and -EINVAL on error.
 */
int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot);

/**
 * pfnmap_untrack - untrack a pfn range
 * @pfn: the start of the pfn range
 * @size: the size of the pfn range in bytes
 *
 * Untrack a pfn range previously tracked through pfnmap_track().
 */
void pfnmap_untrack(unsigned long pfn, unsigned long size);
#endif
/**
 * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn
 * @pfn: the pfn
 * @prot: the pgprot to modify
 *
 * Lookup the cachemode for @pfn and store it in @prot, leaving other
 * data in @prot unchanged.
 *
 * Calls pfnmap_setup_cachemode() for a single PAGE_SIZE range and discards
 * its return value. See pfnmap_setup_cachemode() for details.
 */
static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
{
	pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
}
if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) {
pud_clear_bad(pud); return 1;
} #endif return 0;
}
#ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient.
*/ staticinlineint pte_protnone(pte_t pte)
{ return 0;
}
#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Otherwise also, it can help optimize normal TLB flush in
 * THP regime. Stock flush_tlb_range() typically has optimization to nuke the
 * entire TLB if flush span is greater than a threshold, which will
 * likely be true for a single huge page. Thus a single THP flush will
 * invalidate the entire TLB which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
#else
/* Without THP any use of these helpers is a build error. */
#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
#endif
#endif
/* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet.
*/
/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
 * functions in the generic vmalloc, ioremap and page table update code
 * to track at which page-table levels entries have been modified.
 * Based on that the code can better decide when page table changes need
 * to be synchronized to other page-tables in the system.
 */
#define		__PGTBL_PGD_MODIFIED	0
#define		__PGTBL_P4D_MODIFIED	1
#define		__PGTBL_PUD_MODIFIED	2
#define		__PGTBL_PMD_MODIFIED	3
#define		__PGTBL_PTE_MODIFIED	4
#ifndef has_transparent_pud_hugepage
/* Build-time answer taken from CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD. */
#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
#endif

/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)	__is_defined(__PAGETABLE_P4D_FOLDED)
#endif
/*
 * pXd_leaf() is the API to check whether a pgtable entry is a huge page
 * mapping.  It should work globally across all archs, without any
 * dependency on CONFIG_* options.  For architectures that do not support
 * huge mappings on specific levels, below fallbacks will be used.
 *
 * A leaf pgtable entry should always imply the following:
 *
 * - It is a "present" entry.  IOW, before using this API, please check it
 *   with pXd_present() first. NOTE: it may not always mean the "present
 *   bit" is set.  For example, PROT_NONE entries are always "present".
 *
 * - It should _never_ be a swap entry of any type.  Above "present" check
 *   should have guarded this, but let's be crystal clear on this.
 *
 * - It should contain a huge PFN, which points to a huge page larger than
 *   PAGE_SIZE of the platform.  The PFN format isn't important here.
 *
 * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge()
 *   or hugetlb mappings).
 */
#ifndef pgd_leaf
#define pgd_leaf(x)	false
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)	false
#endif
#ifndef pud_leaf
#define pud_leaf(x)	false
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)	false
#endif
/*
 * We always define pmd_pfn for all archs as it's used in lots of generic
 * code.  Now it happens too for pud_pfn (and can happen for larger
 * mappings too in the future; we're not there yet).  Instead of defining
 * it for all archs (like pmd_pfn), provide a fallback.
 *
 * Note that returning 0 here means any arch that didn't define this can
 * get severely wrong when it hits a real pud leaf.  It's arch's
 * responsibility to properly define it when a huge pud is possible.
 */
#ifndef pud_pfn
#define pud_pfn(x) 0
#endif
/* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value.
*/
/*
 * Notice carried over from the web page this listing was taken from:
 *
 * The information on this website was carefully compiled to the best of
 * our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 *
 * Note: The colored syntax highlighting is still experimental.
 */