static inline void emit_pte_barriers(void)
{
	/*
	 * These barriers are emitted under certain conditions after a pte entry
	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
	 * visible to the table walker. The isb ensures that any previous
	 * speculative "invalid translation" marker that is in the CPU's
	 * pipeline gets cleared, so that any access to that address after
	 * setting the pte to valid won't cause a spurious fault. If the thread
	 * gets preempted after storing to the pgtable but before emitting these
	 * barriers, __switch_to() emits a dsb which ensures the walker gets to
	 * see the store. There is no guarantee of an isb being issued though.
	 * This is safe because it will still get issued (albeit on a
	 * potentially different CPU) when the thread starts running again,
	 * before any access to the address.
	 */
	dsb(ishst);
	isb();
}
if (in_interrupt()) {
emit_pte_barriers(); return;
}
flags = read_thread_flags();
if (flags & BIT(TIF_LAZY_MMU)) { /* Avoid the atomic op if already set. */ if (!(flags & BIT(TIF_LAZY_MMU_PENDING)))
set_thread_flag(TIF_LAZY_MMU_PENDING);
} else {
emit_pte_barriers();
}
}
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE staticinlinevoid arch_enter_lazy_mmu_mode(void)
{ /* * lazy_mmu_mode is not supposed to permit nesting. But in practice this * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation * inside a lazy_mmu_mode section (such as zap_pte_range()) will change * permissions on the linear map with apply_to_page_range(), which * re-enters lazy_mmu_mode. So we tolerate nesting in our * implementation. The first call to arch_leave_lazy_mmu_mode() will * flush and clear the flag such that the remainder of the work in the * outer nest behaves as if outside of lazy mmu mode. This is safe and * keeps tracking simple.
*/
if (in_interrupt()) return;
set_thread_flag(TIF_LAZY_MMU);
}
static inline void arch_flush_lazy_mmu_mode(void)
{
	if (in_interrupt())
		return;

	/* Emit any pte barriers that were deferred while in lazy mmu mode. */
	if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
		emit_pte_barriers();
}
staticinlinevoid arch_leave_lazy_mmu_mode(void)
{ if (in_interrupt()) return;
/*
 * Outside of a few very special situations (e.g. hibernation), we always
 * use broadcast TLB invalidation instructions, therefore a spurious page
 * fault on one CPU which has been handled concurrently by another CPU
 * does not need to perform additional invalidation.
 */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr)	phys_to_page(__pa_symbol(empty_zero_page))

#define pte_ERROR(e)	\
	pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e))

#define pte_valid(pte)		(!!(pte_val(pte) & PTE_VALID))
#define pte_present_invalid(pte) \
	((pte_val(pte) & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID)

/*
 * Execute-only user mappings do not have the PTE_USER bit set. All valid
 * kernel mappings have the PTE_UXN bit set.
 */
#define pte_valid_not_user(pte) \
	((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN))

/*
 * Returns true if the pte is valid and has the contiguous bit set.
 */
#define pte_valid_cont(pte)	(pte_valid(pte) && pte_cont(pte))

/*
 * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
 * so that we don't erroneously return false for pages that have been
 * remapped as PROT_NONE but are yet to be flushed from the TLB.
 * Note that we can't make any assumptions based on the state of the access
 * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the
 * TLB.
 */
#define pte_accessible(mm, pte)	\
	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
if (pte_write(pte))
pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
return pte;
}
staticinline pte_t pte_wrprotect(pte_t pte)
{ /* * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY * clear), set the PTE_DIRTY bit.
*/ if (pte_hw_dirty(pte))
pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
static inline void __set_pte_complete(pte_t pte)
{
	/*
	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
	 * has the necessary barriers.
	 */
	if (pte_valid_not_user(pte))
		queue_pte_barriers();
}
/* * If the PTE would provide user space access to the tags associated * with it then ensure that the MTE tags are synchronised. Although * pte_access_permitted_no_overlay() returns false for exec only * mappings, they don't expose tags (instruction fetches don't check * tags).
*/ if (system_supports_mte() && pte_access_permitted_no_overlay(pte, false) &&
!pte_special(pte) && pte_tagged(pte))
mte_sync_tags(pte, nr_pages);
}
#ifdef CONFIG_NUMA_BALANCING
/*
 * See the comment in include/linux/pgtable.h
 */
static inline int pte_protnone(pte_t pte)
{
	/*
	 * pte_present_invalid() tells us that the pte is invalid from HW
	 * perspective but present from SW perspective, so the fields are to be
	 * interpreted as per the HW layout. The second 2 checks are the unique
	 * encoding that we use for PROT_NONE. It is insufficient to only use
	 * the first check because we share the same encoding scheme with pmds
	 * which support pmd_mkinvalid(), so can be present-invalid without
	 * being PROT_NONE.
	 */
	return pte_present_invalid(pte) && !pte_user(pte) && !pte_user_exec(pte);
}
staticinline pmd_t pmd_mkhuge(pmd_t pmd)
{ /* * It's possible that the pmd is present-invalid on entry * and in that case it needs to remain present-invalid on * exit. So ensure the VALID bit does not get modified.
*/
pmdval_t mask = PMD_TYPE_MASK & ~PTE_VALID;
pmdval_t val = PMD_TYPE_SECT & ~PTE_VALID;
staticinline pud_t pud_mkhuge(pud_t pud)
{ /* * It's possible that the pud is present-invalid on entry * and in that case it needs to remain present-invalid on * exit. So ensure the VALID bit does not get modified.
*/
pudval_t mask = PUD_TYPE_MASK & ~PTE_VALID;
pudval_t val = PUD_TYPE_SECT & ~PTE_VALID;
/*
 * Mark the prot value as uncacheable and unbufferable.
 */
#define pgprot_noncached(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN)
#define pgprot_writecombine(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
#define pgprot_device(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN)
#define pgprot_tagged(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED))
#define pgprot_mhp	pgprot_tagged

/*
 * DMA allocations for non-coherent devices use what the Arm architecture calls
 * "Normal non-cacheable" memory, which permits speculation, unaligned accesses
 * and merging of writes. This is different from "Device-nGnR[nE]" memory which
 * is intended for MMIO and thus forbids speculation, preserves access size,
 * requires strict alignment and can also force write responses to come from the
 * endpoint.
 */
#define pgprot_dmacoherent(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, \
			PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
	/*
	 * If pmd is present-invalid, pmd_table() won't detect it
	 * as a table, so force the valid bit for the comparison.
	 */
	return pmd_present(pmd) && !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/* Match p4d_offset folding in <asm/generic/pgtable-nop4d.h> */
#define p4d_set_fixmap(addr)			NULL
#define p4d_set_fixmap_offset(p4dp, addr)	((p4d_t *)p4dp)
#define p4d_clear_fixmap()

#define p4d_offset_kimg(dir,addr)	((p4d_t *)dir)
staticinline
p4d_t *p4d_offset_lockless_folded(pgd_t *pgdp, pgd_t pgd, unsignedlong addr)
{ /* * With runtime folding of the pud, pud_offset_lockless() passes * the 'pgd_t *' we return here to p4d_to_folded_pud(), which * will offset the pointer assuming that it points into * a page-table page. However, the fast GUP path passes us a * pgd_t allocated on the stack and so we must use the original * pointer in 'pgdp' to construct the p4d pointer instead of * using the generic p4d_offset_lockless() implementation. * * Note: reusing the original pointer means that we may * dereference the same (live) page-table entry multiple times. * This is safe because it is still only loaded once in the * context of each level and the CPU guarantees same-address * read-after-read ordering.
*/ return p4d_offset(pgdp, addr);
} #define p4d_offset_lockless p4d_offset_lockless_folded
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */

#define pgd_ERROR(e)	\
	pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e))
staticinline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{ /* * Normal and Normal-Tagged are two different memory types and indices * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK.
*/ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE |
PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK;
/* preserve the hardware dirty information */ if (pte_hw_dirty(pte))
pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); /* * If we end up clearing hw dirtiness for a sw-dirty PTE, set hardware * dirtiness again.
*/ if (pte_sw_dirty(pte))
pte = pte_mkdirty(pte); return pte;
}
static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
{
	int young = __ptep_test_and_clear_young(vma, address, ptep);

	if (young) {
		/*
		 * We can elide the trailing DSB here since the worst that can
		 * happen is that a CPU continues to use the young entry in its
		 * TLB and we mistakenly reclaim the associated page. The
		 * window for such an event is bounded by the next
		 * context-switch, which provides a DSB to complete the TLB
		 * invalidation.
		 */
		flush_tlb_page_nosync(vma, address);
	}

	/* The function returns int; the truncated original dropped this. */
	return young;
}
/*
 * Ensure that there are not more swap files than can be encoded in the kernel
 * PTEs.
 */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
/*
 * On AArch64, the cache coherency is handled via the __set_ptes() function.
 */
static inline void update_mmu_cache_range(struct vm_fault *vmf,
					  struct vm_area_struct *vma,
					  unsigned long addr, pte_t *ptep,
					  unsigned int nr)
{
	/*
	 * We don't do anything here, so there's a very small chance of
	 * us retaking a user fault which we just fixed up. The alternative
	 * is doing a dsb(ishst), but that penalises the fastpath.
	 */
}
#define update_mmu_cache(vma, addr, ptep) \
	update_mmu_cache_range(NULL, vma, addr, ptep, 1)
#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)

/*
 * On arm64 without hardware Access Flag, copying from user will fail because
 * the pte is old and cannot be marked young. So we always end up with zeroed
 * page after fork() + CoW for pfn mappings. We don't always have a
 * hardware-managed access flag on arm64.
 */
#define arch_has_hw_pte_young		cpu_has_hw_af

/*
 * Experimentally, it's cheap to set the access flag in hardware and we
 * benefit from prefaulting mappings as 'old' to start with.
 */
#define arch_wants_old_prefaulted_pte	cpu_has_hw_af

/*
 * Request exec memory is read into pagecache in at least 64K folios. This size
 * can be contpte-mapped when 4K base pages are in use (16 pages into 1 iTLB
 * entry), and HPA can coalesce it (4 pages into 1 TLB entry) when 16K base
 * pages are in use.
 */
#define exec_folio_order()	ilog2(SZ_64K >> PAGE_SHIFT)
/* * The contpte APIs are used to transparently manage the contiguous bit in ptes * where it is possible and makes sense to do so. The PTE_CONT bit is considered * a private implementation detail of the public ptep API (see below).
*/ externvoid __contpte_try_fold(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, pte_t pte); externvoid __contpte_try_unfold(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, pte_t pte); extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); externvoid contpte_set_ptes(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, pte_t pte, unsignedint nr); externvoid contpte_clear_full_ptes(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, unsignedint nr, int full); extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsignedlong addr, pte_t *ptep, unsignedint nr, int full); externint contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsignedlong addr, pte_t *ptep); externint contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsignedlong addr, pte_t *ptep); externvoid contpte_wrprotect_ptes(struct mm_struct *mm, unsignedlong addr,
pte_t *ptep, unsignedint nr); externint contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsignedlong addr, pte_t *ptep,
pte_t entry, int dirty); externvoid contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, unsignedlong addr, pte_t *ptep, unsignedint nr, cydp_t flags);
static __always_inline void contpte_try_fold(struct mm_struct *mm, unsignedlong addr, pte_t *ptep, pte_t pte)
{ /* * Only bother trying if both the virtual and physical addresses are * aligned and correspond to the last entry in a contig range. The core * code mostly modifies ranges from low to high, so this is the likely * the last modification in the contig range, so a good time to fold. * We can't fold special mappings, because there is no associated folio.
*/
/* * The below functions constitute the public API that arm64 presents to the * core-mm to manipulate PTE entries within their page tables (or at least this * is the subset of the API that arm64 needs to implement). These public * versions will automatically and transparently apply the contiguous bit where * it makes sense to do so. Therefore any users that are contig-aware (e.g. * hugetlb, kernel mapper) should NOT use these APIs, but instead use the * private versions, which are prefixed with double underscore. All of these * APIs except for ptep_get_lockless() are expected to be called with the PTL * held. Although the contiguous bit is considered private to the * implementation, it is deliberately allowed to leak through the getters (e.g. * ptep_get()), back to core code. This is required so that pte_leaf_size() can * provide an accurate size for perf_get_pgtable_size(). But this leakage means * its possible a pte will be passed to a setter with the contiguous bit set, so * we explicitly clear the contiguous bit in those cases to prevent accidentally * setting it in the pgtable.
*/
staticinlinevoid set_pte(pte_t *ptep, pte_t pte)
{ /* * We don't have the mm or vaddr so cannot unfold contig entries (since * it requires tlb maintenance). set_pte() is not used in core code, so * this should never even be called. Regardless do our best to service * any call and emit a warning if there is any attempt to set a pte on * top of an existing contig range.
*/
pte_t orig_pte = __ptep_get(ptep);
#define wrprotect_ptes wrprotect_ptes static __always_inline void wrprotect_ptes(struct mm_struct *mm, unsignedlong addr, pte_t *ptep, unsignedint nr)
{ if (likely(nr == 1)) { /* * Optimization: wrprotect_ptes() can only be called for present * ptes so we only need to check contig bit as condition for * unfold, and we can remove the contig bit from the pte we read * to avoid re-reading. This speeds up fork() which is sensitive * for order-0 folios. Equivalent to contpte_try_unfold().
*/
pte_t orig_pte = __ptep_get(ptep);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.