/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma))
supported_orders = THP_ORDERS_ALL_ANON; elseif (vma_is_special_huge(vma))
supported_orders = THP_ORDERS_ALL_SPECIAL; else
supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
orders &= supported_orders; if (!orders) return 0;
if (!vma->vm_mm) /* vdso */ return 0;
if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) return in_pf ? orders : 0;
/* * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set.
*/ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return 0;
/* * Check alignment for file vma and size for both file and anon vma by * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault * handlers.
*/ if (!in_pf) { int order = highest_order(orders); unsignedlong addr;
while (orders) {
addr = vma->vm_end - (PAGE_SIZE << order); if (thp_vma_suitable_order(vma, addr, order)) break;
order = next_order(&orders, order);
}
if (!orders) return 0;
}
/* * Enabled via shmem mount options or sysfs settings. * Must be done before hugepage flags check since shmem has its * own flags.
*/ if (!in_pf && shmem_file(vma->vm_file)) return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
!enforce_sysfs);
if (!vma_is_anonymous(vma)) { /* * Enforce sysfs THP requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders().
*/ if (enforce_sysfs &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always()))) return 0;
/* * Trust that ->huge_fault() handlers know what they are doing * in fault path.
*/ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) return orders; return 0;
}
if (vma_is_temporary_stack(vma)) return 0;
/* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. * * Allow page fault since anon_vma may be not initialized until * the first page fault.
*/ if (!vma->anon_vma) return (smaps || in_pf) ? orders : 0;
return orders;
}
staticbool get_huge_zero_page(void)
{ struct folio *zero_folio;
retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) returntrue;
zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER); if (!zero_folio) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); returnfalse;
} /* Ensure zero folio won't have large_rmappable flag set. */
folio_clear_large_rmappable(zero_folio);
preempt_disable(); if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
preempt_enable();
folio_put(zero_folio); goto retry;
}
WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
count_vm_event(THP_ZERO_PAGE_ALLOC); returntrue;
}
/* Drop one reference on the shared huge zero folio. */
static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
put_huge_zero_page();
return READ_ONCE(huge_zero_folio);
}
/*
 * Release the per-mm reference on the huge zero folio, but only if this
 * mm ever took one (recorded via the MMF_HUGE_ZERO_PAGE flag).
 */
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return;

	put_huge_zero_page();
}
/*
 * Shrinker "count" callback for the huge zero folio: report HPAGE_PMD_NR
 * reclaimable pages only when ours is the sole remaining reference.
 */
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
staticint sysfs_add_group(struct kobject *kobj, conststruct attribute_group *grp)
{ int ret = -ENOENT;
/* * If the group is named, try to merge first, assuming the subdirectory * was already created. This avoids the warning emitted by * sysfs_create_group() if the directory already exists.
*/ if (grp->name)
ret = sysfs_merge_group(kobj, grp); if (ret)
ret = sysfs_create_group(kobj, grp);
staticint __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{ int err; struct thpsize *thpsize; unsignedlong orders; int order;
/* * Default to setting PMD-sized THP to inherit the global setting and * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time * constant so we have to do this here.
*/ if (!anon_orders_configured)
huge_anon_orders_inherit = BIT(PMD_ORDER);
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) {
pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM;
}
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) {
pr_err("failed to register transparent hugepage group\n"); goto delete_obj;
}
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) {
pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group;
}
orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
order = highest_order(orders); while (orders) {
thpsize = thpsize_create(order, *hugepage_kobj); if (IS_ERR(thpsize)) {
pr_err("failed to create thpsize for order %d\n", order);
err = PTR_ERR(thpsize); goto remove_all;
}
list_add(&thpsize->node, &thpsize_list);
order = next_order(&orders, order);
}
/* * hugepages can't be allocated by the buddy allocator
*/
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs;
err = khugepaged_init(); if (err) goto err_slab;
err = thp_shrinker_init(); if (err) goto err_shrinker;
/* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys.
*/ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0; return 0;
}
err = start_stop_khugepaged(); if (err) goto err_khugepaged;
/* * When a folio is not zeroed during allocation (__GFP_ZERO not used) * or user folios require special handling, folio_zero_user() is used to * make sure that the page corresponding to the faulting address will be * hot in the cache after zeroing.
*/ if (user_alloc_needs_zeroing())
folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write.
*/
__folio_mark_uptodate(folio); return folio;
}
/* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise * fail if not immediately available * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately * available * never: never stall for any thp allocation
*/
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{ constbool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
/* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
/* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
/* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT |
(vma_madvised ? __GFP_DIRECT_RECLAIM :
__GFP_KSWAPD_RECLAIM);
/* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT |
(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK;
ret = vmf_anon_prepare(vmf); if (ret) return ret;
khugepaged_enter_vma(vma, vma->vm_flags);
/* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS;
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM;
}
/* * If we had pud_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS;
pmd = pmdp_get_lockless(src_pmd); if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); /* * No need to recheck the pmd, it can't change with write * mmap lock held here. * * Meanwhile, making sure it's not a CoW VMA with writable * mapping, otherwise it means either the anon page wrongly * applied special bit, or we made the PRIVATE mapping be * able to wrongly write to the backend MMIO.
*/
VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); goto set_pmd;
}
/* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0;
pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out;
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd)); if (!is_readable_migration_entry(entry)) {
entry = make_readable_migration_entry(
swp_offset(entry));
pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd))
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); if (!userfaultfd_wp(dst_vma))
pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0; goto out_unlock;
} #endif
if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable); goto out_unlock;
} /* * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table.
*/ if (is_huge_zero_pmd(pmd)) { /* * mm_get_huge_zero_folio() will never allocate a new * folio here, since we already have a zero page to * copy. It just takes a reference.
*/
mm_get_huge_zero_folio(dst_mm); goto out_zero_page;
}
/* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse;
if (!folio_trylock(folio)) {
folio_get(folio);
spin_unlock(vmf->ptl);
folio_lock(folio);
spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
folio_unlock(folio);
folio_put(folio); return 0;
}
folio_put(folio);
}
/* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) {
folio_unlock(folio); goto reuse;
}
/* * See do_wp_page(): we can only reuse the folio exclusively if * there are no additional references. Note that we always drain * the LRU cache immediately after adding a THP.
*/ if (folio_ref_count(folio) >
1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; if (folio_test_swapcache(folio))
folio_free_swap(folio); if (folio_ref_count(folio) == 1) {
pmd_t entry;
if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
spin_unlock(vmf->ptl); return 0;
}
pmd = pmd_modify(old_pmd, vma->vm_page_prot);
/* * Detect now whether the PMD could be writable; this information * is only valid while holding the PT lock.
*/
writable = pmd_write(pmd); if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
can_change_pmd_writable(vma, vmf->address, pmd))
writable = true;
folio = vm_normal_folio_pmd(vma, haddr, pmd); if (!folio) goto out_map;
nid = folio_nid(folio);
target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
&last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
flags |= TNF_MIGRATE_FAIL; goto out_map;
} /* The folio is isolated and isolation code holds a folio reference. */
spin_unlock(vmf->ptl);
writable = false;
if (!migrate_misplaced_folio(folio, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0;
}
/* * Return true if we do MADV_FREE successfully on entire pmd page. * Otherwise, return false.
*/ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsignedlong addr, unsignedlong next)
{
spinlock_t *ptl;
pmd_t orig_pmd; struct folio *folio; struct mm_struct *mm = tlb->mm; bool ret = false;
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked;
orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto out;
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(orig_pmd)); goto out;
}
folio = pmd_folio(orig_pmd); /* * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio.
*/ if (folio_maybe_mapped_shared(folio)) goto out;
if (!folio_trylock(folio)) goto out;
/* * If user want to discard part-pages of THP, split it so MADV_FREE * will deactivate only them.
*/ if (next - addr != HPAGE_PMD_SIZE) {
folio_get(folio);
spin_unlock(ptl);
split_folio(folio);
folio_unlock(folio);
folio_put(folio); goto out_unlocked;
}
if (folio_test_dirty(folio))
folio_clear_dirty(folio);
folio_unlock(folio);
if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else { if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, mm_counter_file(folio),
-HPAGE_PMD_NR);
/* * Use flush_needed to indicate whether the PMD entry * is present, instead of checking pmd_present() again.
*/ if (flush_needed && pmd_young(orig_pmd) &&
likely(vma_has_recency(vma)))
folio_mark_accessed(folio);
}
spin_unlock(ptl); if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
} return 1;
}
#ifndef pmd_move_must_withdraw
/*
 * Decide whether the deposited PTE page table must be withdrawn from the
 * old PMD and re-deposited under the new one when moving a huge PMD.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif
/* * The destination pmd shouldn't be established, free_pgtables() * should have released it; but move_page_tables() might have already * inserted a page table, if racing against shmem/file collapse.
*/ if (!pmd_none(*new_pmd)) {
VM_BUG_ON(pmd_trans_huge(*new_pmd)); returnfalse;
}
/* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma); if (old_ptl) {
new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); if (pmd_present(pmd))
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so * just be safe and disable write
*/ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(swp_offset(entry)); else
entry = make_readable_migration_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
} else {
newpmd = *pmd;
}
if (prot_numa) { struct folio *folio; bool toptier; /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and * local/remote hits to the zero page are not interesting.
*/
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.23 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.