/* default scan 8*512 pte (or vmas) every 30 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
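/*
 * These tunables are exposed through sysfs (typically under
 * /sys/kernel/mm/transparent_hugepage/khugepaged/); the *_show() and
 * *_store() handlers further below read and update them at runtime.
 */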
	/* Num pages scanned per node */
	u32 node_load[MAX_NUMNODES];

	/* nodemask for allocation fallback */
	nodemask_t alloc_nmask;
};
/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 */
struct khugepaged_mm_slot {
	struct mm_slot slot;
};
/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct khugepaged_mm_slot *mm_slot;
	unsigned long address;
};
/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
static ssize_t max_ptes_none_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}

static ssize_t max_ptes_none_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;
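	/*
	 * The rest of the store handler is not shown here. A minimal sketch of
	 * the usual sysfs store pattern (assuming the value is range-checked
	 * against the number of ptes in a PMD-sized range) would be:
	 *
	 *	err = kstrtoul(buf, 10, &max_ptes_none);
	 *	if (err || max_ptes_none > HPAGE_PMD_NR - 1)
	 *		return -EINVAL;
	 *
	 *	khugepaged_max_ptes_none = max_ptes_none;
	 *	return count;
	 */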
int hugepage_madvise(struct vm_area_struct *vma,
		     vm_flags_t *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes good for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		khugepaged_enter_vma(vma, *vm_flags);
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}
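/*
 * Userspace reaches hugepage_madvise() through the madvise(2) syscall, e.g.:
 *
 *	#include <sys/mman.h>
 *
 *	madvise(addr, length, MADV_HUGEPAGE);	// request THP for this range
 *	madvise(addr, length, MADV_NOHUGEPAGE);	// opt the range out again
 */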
int __init khugepaged_init(void)
{
	mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
	if (!mm_slot_cache)
		return -ENOMEM;
static bool hugepage_pmd_enabled(void)
{
	/*
	 * We cover the anon, shmem and the file-backed case here; file-backed
	 * hugepages, when configured in, are determined by the global control.
	 * Anon pmd-sized hugepages are determined by the pmd-size control.
	 * Shmem pmd-sized hugepages are also determined by its pmd-size control,
	 * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
	 */
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
	    hugepage_global_enabled())
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_always))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
		return true;
	if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
	    hugepage_global_enabled())
		return true;
	if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
		return true;
	return false;
}
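/*
 * The huge_anon_orders_* bitmaps tested above are driven by the per-size
 * sysfs controls, e.g. /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
 * set to "always", "madvise" or "inherit"; "inherit" defers to the global
 * enabled control, which is why it is combined with hugepage_global_enabled().
 */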
	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
		return;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return;

	slot = &mm_slot->slot;

	spin_lock(&khugepaged_mm_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);
}
	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * hpage_collapse_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we return all
		 * pagetables will be destroyed) until khugepaged has finished
		 * working on the pagetables under the mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}
		/* See hpage_collapse_scan_pmd(). */
		if (folio_maybe_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out;
			}
		}
		if (folio_test_large(folio)) {
			struct folio *f;

			/*
			 * Check if we have dealt with the compound page
			 * already
			 */
			list_for_each_entry(f, compound_pagelist, lru) {
				if (folio == f)
					goto next;
			}
		}
		/*
		 * We can do it before folio_isolate_lru because the
		 * folio can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!folio_trylock(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}
		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * The page table that maps the page has been already unlinked
		 * from the page table tree and this process cannot get
		 * an additional pin on the page.
		 *
		 * New pins can come later if the page is shared across fork,
		 * but not from this process. The other process cannot write to
		 * the page, only trigger CoW.
		 */
		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
			folio_unlock(folio);
			result = SCAN_PAGE_COUNT;
			goto out;
		}
		/*
		 * Isolate the page to avoid collapsing a hugepage
		 * currently in use by the VM.
		 */
		if (!folio_isolate_lru(folio)) {
			folio_unlock(folio);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		node_stat_mod_folio(folio,
				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
				    folio_nr_pages(folio));
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

		if (folio_test_large(folio))
			list_add_tail(&folio->lru, compound_pagelist);
next:
		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young pte to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) ||
		     mmu_notifier_test_young(vma->vm_mm, address)))
			referenced++;
	/*
	 * Re-establish the PMD to point to the original page table
	 * entry. Restoring PMD needs to be done prior to releasing
	 * pages. Since pages are still isolated and locked here,
	 * acquiring anon_vma_lock_write is unnecessary.
	 */
	pmd_ptl = pmd_lock(vma->vm_mm, pmd);
	pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
	spin_unlock(pmd_ptl);
	/*
	 * Release both raw and compound pages isolated
	 * in __collapse_huge_page_isolate.
	 */
	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}
/*
 * __collapse_huge_page_copy - attempts to copy memory contents from raw
 * pages to a hugepage. Cleans up the raw pages if copying succeeds;
 * otherwise restores the original page table and releases isolated raw pages.
 * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
 *
 * @pte: starting of the PTEs to copy from
 * @folio: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
				     pmd_t *pmd, pmd_t orig_pmd,
				     struct vm_area_struct *vma,
				     unsigned long address, spinlock_t *ptl,
				     struct list_head *compound_pagelist)
{
	unsigned int i;
	int result = SCAN_SUCCEED;

	/*
	 * Copying pages' contents is subject to memory poison at any iteration.
	 */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pte_t pteval = ptep_get(pte + i);
		struct page *page = folio_page(folio, i);
		unsigned long src_addr = address + i * PAGE_SIZE;
		struct page *src_page;

		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, src_addr);
			continue;
		}
		src_page = pte_page(pteval);
		if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
			result = SCAN_COPY_MC;
			break;
		}
	}
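	/*
	 * copy_mc_user_highpage() returns a non-zero (bytes not copied) count
	 * when the copy hits a machine check on a poisoned source page, in
	 * which case the loop above stops with SCAN_COPY_MC so the collapse
	 * can be unwound instead of completed.
	 */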
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}
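/*
 * khugepaged_defrag() reflects the khugepaged/defrag sysfs knob: GFP_TRANSHUGE
 * allows direct reclaim/compaction for the hugepage allocation, whereas
 * GFP_TRANSHUGE_LIGHT avoids it and fails faster under fragmentation.
 */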
	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (cc->node_load[nid] > max_value) {
			max_value = cc->node_load[nid];
			target_node = nid;
		}

	for_each_online_node(nid) {
		if (max_value == cc->node_load[nid])
			node_set(nid, cc->alloc_nmask);
	}
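	/*
	 * Example: with cc->node_load = { [0] = 3, [1] = 5, [2] = 5 }, the
	 * first loop picks target_node = 1 (the first node hitting the
	 * maximum), and the second loop sets nodes 1 and 2 in cc->alloc_nmask
	 * so that allocation fallback stays on the equally loaded nodes.
	 */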
	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		return SCAN_ANY_PROCESS;

	*vmap = vma = find_vma(mm, address);
	if (!vma)
		return SCAN_VMA_NULL;

	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
		return SCAN_ADDRESS_RANGE;
	if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
		return SCAN_VMA_CHECK;
	/*
	 * Anon VMA expected, the address may be unmapped then
	 * remapped to file after khugepaged reacquired the mmap_lock.
	 *
	 * thp_vma_allowable_order may return true for qualified file
	 * vmas.
	 */
	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
		return SCAN_PAGE_ANON;
	return SCAN_SUCCEED;
}
	/*
	 * The folio may be under migration when khugepaged is trying to
	 * collapse it. Migration success or failure will eventually end
	 * up with a present PMD mapping a folio again.
	 */
	if (is_pmd_migration_entry(pmde))
		return SCAN_PMD_MAPPED;
	if (!pmd_present(pmde))
		return SCAN_PMD_NULL;
	if (pmd_trans_huge(pmde))
		return SCAN_PMD_MAPPED;
	if (pmd_bad(pmde))
		return SCAN_PMD_NULL;
	return SCAN_SUCCEED;
}
static int check_pmd_still_valid(struct mm_struct *mm,
				 unsigned long address,
				 pmd_t *pmd)
{
	pmd_t *new_pmd;
	int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);

	if (result != SCAN_SUCCEED)
		return result;
	if (new_pmd != pmd)
		return SCAN_FAIL;
	return SCAN_SUCCEED;
}
/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
				       struct vm_area_struct *vma,
				       unsigned long haddr, pmd_t *pmd,
				       int referenced)
{
	int swapped_in = 0;
	vm_fault_t ret = 0;
	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
	int result;
	pte_t *pte = NULL;
	spinlock_t *ptl;

		if (!pte++) {
			/*
			 * Here the ptl is only used to check pte_same() in
			 * do_swap_page(), so readonly version is enough.
			 */
			pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
			if (!pte) {
				mmap_read_unlock(mm);
				result = SCAN_PMD_NULL;
				goto out;
			}
		}
		vmf.orig_pte = ptep_get_lockless(pte);
		if (!is_swap_pte(vmf.orig_pte))
			continue;

		vmf.pte = pte;
		vmf.ptl = ptl;
		ret = do_swap_page(&vmf);
		/* Which unmaps pte (after perhaps re-checking the entry) */
		pte = NULL;

		/*
		 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
		 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
		 * we do not retry here and swap entry will remain in pagetable
		 * resulting in later failure.
		 */
		if (ret & VM_FAULT_RETRY) {
			/* Likely, but not guaranteed, that page lock failed */
			result = SCAN_PAGE_LOCK;
			goto out;
		}
		if (ret & VM_FAULT_ERROR) {
			mmap_read_unlock(mm);
			result = SCAN_FAIL;
			goto out;
		}
		swapped_in++;
	}
	if (pte)
		pte_unmap(pte);

	/* Drain LRU cache to remove extra pin on the swapped in pages */
	if (swapped_in)
		lru_add_drain();
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
			      int referenced, int unmapped,
			      struct collapse_control *cc)
{
	LIST_HEAD(compound_pagelist);
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct folio *folio;
	spinlock_t *pmd_ptl, *pte_ptl;
	int result = SCAN_FAIL;
	struct vm_area_struct *vma;
	struct mmu_notifier_range range;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/*
	 * Before allocating the hugepage, release the mmap_lock read lock.
	 * The allocation can take potentially a long time if it involves
	 * sync compaction, and we do not need to hold the mmap_lock during
	 * that. We will recheck the vma after taking it again in write mode.
	 */
	mmap_read_unlock(mm);
	result = alloc_charge_folio(&folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out_nolock;

	mmap_read_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}

	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED) {
		mmap_read_unlock(mm);
		goto out_nolock;
	}
	if (unmapped) {
		/*
		 * __collapse_huge_page_swapin will return with mmap_lock
		 * released when it fails. So we jump out_nolock directly in
		 * that case. Continuing to collapse causes inconsistency.
		 */
		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
						     referenced);
		if (result != SCAN_SUCCEED)
			goto out_nolock;
	}
	mmap_read_unlock(mm);
	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 *
	 * UFFDIO_MOVE is prevented to race as well thanks to the
	 * mmap_lock.
	 */
	mmap_write_lock(mm);
	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto out_up_write;
	/* check if the pmd is still valid */
	vma_start_write(vma);
	result = check_pmd_still_valid(mm, address, pmd);
	if (result != SCAN_SUCCEED)
		goto out_up_write;

	anon_vma_lock_write(vma->anon_vma);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
				address + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
	/*
	 * This removes any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address to
	 * avoid the risk of CPU bugs in that area.
	 *
	 * Parallel GUP-fast is fine since GUP-fast will back off when
	 * it detects PMD is changed.
	 */
	_pmd = pmdp_collapse_flush(vma, address, pmd);
	spin_unlock(pmd_ptl);
	mmu_notifier_invalidate_range_end(&range);
	tlb_remove_table_sync_one();
	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
	if (pte) {
		result = __collapse_huge_page_isolate(vma, address, pte, cc,
						      &compound_pagelist);
		spin_unlock(pte_ptl);
	} else {
		result = SCAN_PMD_NULL;
	}
	if (unlikely(result != SCAN_SUCCEED)) {
		if (pte)
			pte_unmap(pte);
		spin_lock(pmd_ptl);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * points to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(pmd_ptl);
		anon_vma_unlock_write(vma->anon_vma);
		goto out_up_write;
	}
	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
					   vma, address, pte_ptl,
					   &compound_pagelist);
	pte_unmap(pte);
	if (unlikely(result != SCAN_SUCCEED))
		goto out_up_write;

	/*
	 * The smp_wmb() inside __folio_mark_uptodate() ensures the
	 * copy_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	pgtable = pmd_pgtable(_pmd);
	result = find_pmd_or_thp_or_none(mm, address, &pmd);
	if (result != SCAN_SUCCEED)
		goto out;

	memset(cc->node_load, 0, sizeof(cc->node_load));
	nodes_clear(cc->alloc_nmask);
	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte) {
		result = SCAN_PMD_NULL;
		goto out;
	}
	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, _address += PAGE_SIZE) {
		pte_t pteval = ptep_get(_pte);
		if (is_swap_pte(pteval)) {
			++unmapped;
			if (!cc->is_khugepaged ||
			    unmapped <= khugepaged_max_ptes_swap) {
				/*
				 * Always be strict with uffd-wp
				 * enabled swap entries. Please see
				 * comment below for pte_uffd_wp().
				 */
				if (pte_swp_uffd_wp_any(pteval)) {
					result = SCAN_PTE_UFFD_WP;
					goto out_unmap;
				}
				continue;
			} else {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				goto out_unmap;
			}
		}
		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			++none_or_zero;
			if (!userfaultfd_armed(vma) &&
			    (!cc->is_khugepaged ||
			     none_or_zero <= khugepaged_max_ptes_none)) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
				goto out_unmap;
			}
		}
		if (pte_uffd_wp(pteval)) {
			/*
			 * Don't collapse the page if any of the small
			 * PTEs are armed with uffd write protection.
			 * Here we can also mark the new huge pmd as
			 * write protected if any of the small ones is
			 * marked but that could bring unknown
			 * userfault messages that falls outside of
			 * the registered range. So, just be simple.
			 */
			result = SCAN_PTE_UFFD_WP;
			goto out_unmap;
		}
		if (pte_write(pteval))
			writable = true;
		page = vm_normal_page(vma, _address, pteval);
		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
			result = SCAN_PAGE_NULL;
			goto out_unmap;
		}
		folio = page_folio(page);

		if (!folio_test_anon(folio)) {
			result = SCAN_PAGE_ANON;
			goto out_unmap;
		}
		/*
		 * We treat a single page as shared if any part of the THP
		 * is shared.
		 */
		if (folio_maybe_mapped_shared(folio)) {
			++shared;
			if (cc->is_khugepaged &&
			    shared > khugepaged_max_ptes_shared) {
				result = SCAN_EXCEED_SHARED_PTE;
				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
				goto out_unmap;
			}
		}
		/*
		 * Record which node the original page is from and save this
		 * information to cc->node_load[].
		 * Khugepaged will allocate a hugepage from the node that has
		 * the max hit record.
		 */
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			goto out_unmap;
		}
		cc->node_load[node]++;
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			goto out_unmap;
		}
		if (folio_test_locked(folio)) {
			result = SCAN_PAGE_LOCK;
			goto out_unmap;
		}
		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * Here the check may be racy:
		 * it may see folio_mapcount() > folio_ref_count().
		 * But such a case is ephemeral; we could always retry collapse
		 * later. However it may report a false positive if the page
		 * has excessive GUP pins (i.e. 512). Anyway the same check
		 * will be done again later, so the risk seems low.
		 */
		if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
			result = SCAN_PAGE_COUNT;
			goto out_unmap;
		}
		/*
		 * If collapse was initiated by khugepaged, check that there is
		 * enough young pte to justify collapsing the page
		 */
		if (cc->is_khugepaged &&
		    (pte_young(pteval) || folio_test_young(folio) ||
		     folio_test_referenced(folio) ||
		     mmu_notifier_test_young(vma->vm_mm, _address)))
			referenced++;
	}
	if (!writable) {
		result = SCAN_PAGE_RO;
	} else if (cc->is_khugepaged &&
		   (!referenced ||
		    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
	}
out_unmap:
	pte_unmap_unlock(pte, ptl);
	if (result == SCAN_SUCCEED) {
		result = collapse_huge_page(mm, address, referenced,
					    unmapped, cc);
		/* collapse_huge_page will return with the mmap_lock released */
		*mmap_locked = false;
	}
out:
	trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
				     none_or_zero, result, unmapped);
	return result;
}
	if (hpage_collapse_test_exit(mm)) {
		/* free mm_slot */
		hash_del(&slot->hash);
		list_del(&slot->mm_node);

		/*
		 * Not strictly needed because the mm exited already.
		 *
		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		 */

		/* khugepaged_mm_lock actually not necessary for the below */
		mm_slot_free(mm_slot_cache, mm_slot);
		mmdrop(mm);
	}
}
/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmdp, struct folio *folio, struct page *page)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = addr,
		.flags = 0,
		.pmd = pmdp,
	};

	mmap_assert_locked(vma->vm_mm);

	if (do_set_pmd(&vmf, folio, page))
		return SCAN_FAIL;

	folio_get(folio);
	return SCAN_SUCCEED;
}
/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address haddr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in as
 * pmd-mapped. Possibly install a huge PMD mapping the THP.
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
			    bool install_pmd)
{
	int nr_mapped_ptes = 0, result = SCAN_FAIL;
	unsigned int nr_batch_ptes;
	struct mmu_notifier_range range;
	bool notified = false;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	unsigned long end = haddr + HPAGE_PMD_SIZE;
	struct vm_area_struct *vma = vma_lookup(mm, haddr);
	struct folio *folio;
	pte_t *start_pte, *pte;
	pmd_t *pmd, pgt_pmd;
	spinlock_t *pml = NULL, *ptl;
	int i;

	mmap_assert_locked(mm);
	/* First check VMA found, in case page tables are being torn down */
	if (!vma || !vma->vm_file ||
	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
		return SCAN_VMA_CHECK;

	/* Fast check before locking page if already PMD-mapped */
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	if (result == SCAN_PMD_MAPPED)
		return result;

	/*
	 * If we are here, we've succeeded in replacing all the native pages
	 * in the page cache with a single hugepage. If a mm were to fault-in
	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
	 * analogously elide sysfs THP settings here.
	 */
	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
		return SCAN_VMA_CHECK;

	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
	if (userfaultfd_wp(vma))
		return SCAN_PTE_UFFD_WP;
	folio = filemap_lock_folio(vma->vm_file->f_mapping,
				   linear_page_index(vma, haddr));
	if (IS_ERR(folio))
		return SCAN_PAGE_NULL;

	if (folio_order(folio) != HPAGE_PMD_ORDER) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_folio;
	}
	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
	switch (result) {
	case SCAN_SUCCEED:
		break;
	case SCAN_PMD_NONE:
		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
		 */
		goto maybe_install_pmd;
	default:
		goto drop_folio;
	}

	result = SCAN_FAIL;
	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto drop_folio;
	/* step 1: check all mapped PTEs are to the right huge page */
	for (i = 0, addr = haddr, pte = start_pte;
	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
		struct page *page;
		pte_t ptent = ptep_get(pte);

		/* empty pte, skip */
		if (pte_none(ptent))
			continue;

		/* page swapped out, abort */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
			page = NULL;
		/*
		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
		 * page table, but the new page will not be a subpage of hpage.
		 */
		if (folio_page(folio, i) != page)
			goto abort;
	}
	/*
	 * pmd_lock covers a wider range than ptl, and (if split from mm's
	 * page_table_lock) ptl nests inside pml. The less time we hold pml,
	 * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
	 * inserts a valid as-if-COWed PTE without even looking up page cache.
	 * So page lock of folio does not protect from it, so we must not drop
	 * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
	 */
	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
		pml = pmd_lock(mm, pmd);

	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;
	if (!pml)
		spin_lock(ptl);
	else if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
		goto abort;
	/* step 2: clear page table and adjust rmap */
	for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
	     i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
	     pte += nr_batch_ptes) {
		unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
		struct page *page;
		pte_t ptent = ptep_get(pte);

		nr_batch_ptes = 1;

		if (pte_none(ptent))
			continue;
		/*
		 * We dropped ptl after the first scan, to do the mmu_notifier:
		 * page lock stops more PTEs of the folio being faulted in, but
		 * does not stop write faults COWing anon copies from existing
		 * PTEs; and does not stop those being swapped out or migrated.
		 */
		if (!pte_present(ptent)) {
			result = SCAN_PTE_NON_PRESENT;
			goto abort;
		}
		page = vm_normal_page(vma, addr, ptent);

		/*
		 * Must clear entry, or a racing truncate may re-remove it.
		 * TLB flush can be left until pmdp_collapse_flush() does it.
		 * PTE dirty? Shmem page is already dirty; file is read-only.
		 */
		clear_ptes(mm, addr, pte, nr_batch_ptes);
		folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
		nr_mapped_ptes += nr_batch_ptes;
	}

	if (!pml)
		spin_unlock(ptl);
	/* step 3: set proper refcount and mm_counters. */
	if (nr_mapped_ptes) {
		folio_ref_sub(folio, nr_mapped_ptes);
		add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
	}
		/*
		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
		 * got written to. These VMAs are likely not worth removing
		 * page tables from, as PMD-mapping is likely to be split later.
		 */
		if (READ_ONCE(vma->anon_vma))
			continue;

		mm = vma->vm_mm;
		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
			continue;

		if (hpage_collapse_test_exit(mm))
			continue;
		/*
		 * When a vma is registered with uffd-wp, we cannot recycle
		 * the page table because there may be pte markers installed.
		 * Other vmas can still have the same file mapped hugely, but
		 * skip this one: it will always be mapped in small page size
		 * for uffd-wp registered ranges.
		 */
		if (userfaultfd_wp(vma))
			continue;

		/* PTEs were notified when unmapped; but now for the PMD? */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					addr, addr + HPAGE_PMD_SIZE);
		mmu_notifier_invalidate_range_start(&range);
		pml = pmd_lock(mm, pmd);
		/*
		 * The lock of new_folio is still held, we will be blocked in
		 * the page fault path, which prevents the pte entries from
		 * being set again. So even though the old empty PTE page may be
		 * concurrently freed and a new PTE page is filled into the pmd
		 * entry, it is still empty and can be removed.
		 *
		 * So here we only need to recheck if the state of pmd entry
		 * still meets our requirements, rather than checking pmd_same()
		 * like elsewhere.
		 */
		if (check_pmd_state(pmd) != SCAN_SUCCEED)
			goto drop_pml;
		ptl = pte_lockptr(mm, pmd);
		if (ptl != pml)
			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
		/*
		 * Huge page lock is still held, so normally the page table
		 * must remain empty; and we have already skipped anon_vma
		 * and userfaultfd_wp() vmas. But since the mmap_lock is not
		 * held, it is still possible for a racing userfaultfd_ioctl()
		 * to have inserted ptes or markers. Now that we hold ptlock,
		 * repeating the anon_vma check protects from one category,
		 * and repeating the userfaultfd_wp() check from another.
		 */
		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
			pmdp_get_lockless_sync();
			success = true;
		}
		if (ptl != pml)
			spin_unlock(ptl);
drop_pml:
		spin_unlock(pml);
	result = alloc_charge_folio(&new_folio, mm, cc);
	if (result != SCAN_SUCCEED)
		goto out;

	mapping_set_update(&xas, mapping);

	__folio_set_locked(new_folio);
	if (is_shmem)
		__folio_set_swapbacked(new_folio);
	new_folio->index = start;
	new_folio->mapping = mapping;
	/*
	 * Ensure we have slots for all the pages in the range. This is
	 * almost certainly a no-op because most of the pages must be present
	 */
	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (!xas_error(&xas))
			break;
		xas_unlock_irq(&xas);
		if (!xas_nomem(&xas, GFP_KERNEL)) {
			result = SCAN_FAIL;
			goto rollback;
		}
	} while (1);
	for (index = start; index < end;) {
		xas_set(&xas, index);
		folio = xas_load(&xas);

		VM_BUG_ON(index != xas.xa_index);
		if (is_shmem) {
			if (!folio) {
				/*
				 * Stop if extent has been truncated or
				 * hole-punched, and is now completely
				 * empty.
				 */
				if (index == start) {
					if (!xas_next_entry(&xas, end - 1)) {
						result = SCAN_TRUNCATED;
						goto xa_locked;
					}
				}
				nr_none++;
				index++;
				continue;
			}
			if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
				xas_unlock_irq(&xas);
				/* swap in or instantiate fallocated page */
				if (shmem_get_folio(mapping->host, index, 0,
						    &folio, SGP_NOALLOC)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		} else {	/* !is_shmem */
			if (!folio || xa_is_value(folio)) {
				xas_unlock_irq(&xas);
				page_cache_sync_readahead(mapping, &file->f_ra,
							  file, index,
							  end - index);
				/* drain lru cache to help folio_isolate_lru() */
				lru_add_drain();
				folio = filemap_lock_folio(mapping, index);
				if (IS_ERR(folio)) {
					result = SCAN_FAIL;
					goto xa_unlocked;
				}
			} else if (folio_test_dirty(folio)) {
				/*
				 * khugepaged only works on read-only fd,
				 * so this page is dirty because it hasn't
				 * been flushed since first write. There
				 * won't be new dirty pages.
				 *
				 * Trigger async flush here and hope the
				 * writeback is done when khugepaged
				 * revisits this page.
				 *
				 * This is a one-off situation. We are not
				 * forcing writeback in loop.
				 */
				xas_unlock_irq(&xas);
				filemap_flush(mapping);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (folio_test_writeback(folio)) {
				xas_unlock_irq(&xas);
				result = SCAN_FAIL;
				goto xa_unlocked;
			} else if (folio_trylock(folio)) {
				folio_get(folio);
				xas_unlock_irq(&xas);
			} else {
				result = SCAN_PAGE_LOCK;
				goto xa_locked;
			}
		}
		/*
		 * The folio must be locked, so we can drop the i_pages lock
		 * without racing with truncate.
		 */
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

		/* make sure the folio is up to date */
		if (unlikely(!folio_test_uptodate(folio))) {
			result = SCAN_FAIL;
			goto out_unlock;
		}
		/*
		 * If file was truncated then extended, or hole-punched, before
		 * we locked the first folio, then a THP might be there already.
		 * This will be discovered on the first iteration.
		 */
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			goto out_unlock;
		}

		if (folio_mapping(folio) != mapping) {
			result = SCAN_TRUNCATED;
			goto out_unlock;
		}
		if (!is_shmem && (folio_test_dirty(folio) ||
				  folio_test_writeback(folio))) {
			/*
			 * khugepaged only works on read-only fd, so this
			 * folio is dirty because it hasn't been flushed
			 * since first write.
			 */
			result = SCAN_FAIL;
			goto out_unlock;
		}
		if (!folio_isolate_lru(folio)) {
			result = SCAN_DEL_PAGE_LRU;
			goto out_unlock;
		}

		if (!filemap_release_folio(folio, GFP_KERNEL)) {
			result = SCAN_PAGE_HAS_PRIVATE;
			folio_putback_lru(folio);
			goto out_unlock;
		}
		if (folio_mapped(folio))
			try_to_unmap(folio,
				     TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

		/*
		 * We control 2 + nr_pages references to the folio:
		 * - we hold a pin on it;
		 * - nr_pages reference from page cache;
		 * - one from lru_isolate_folio;
		 * If those are the only references, then any new usage
		 * of the folio will have to fetch it from the page
		 * cache. That requires locking the folio to handle
		 * truncate, so any new usage will be blocked until we
		 * unlock folio after collapse/during rollback.
		 */
		if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
			result = SCAN_PAGE_COUNT;
			xas_unlock_irq(&xas);
			folio_putback_lru(folio);
			goto out_unlock;
		}
		/*
		 * Accumulate the folios that are being collapsed.
		 */
		list_add_tail(&folio->lru, &pagelist);
		index += folio_nr_pages(folio);
		continue;
out_unlock:
		folio_unlock(folio);
		folio_put(folio);
		goto xa_unlocked;
	}
	if (!is_shmem) {
		filemap_nr_thps_inc(mapping);
		/*
		 * Paired with the fence in do_dentry_open() -> get_write_access()
		 * to ensure i_writecount is up to date and the update to nr_thps
		 * is visible. Ensures the page cache will be truncated if the
		 * file is opened writable.
		 */
		smp_mb();
		if (inode_is_open_for_write(mapping->host)) {
			result = SCAN_FAIL;
			filemap_nr_thps_dec(mapping);
		}
	}
xa_locked:
	xas_unlock_irq(&xas);
xa_unlocked:

	/*
	 * If collapse is successful, flush must be done now before copying.
	 * If collapse is unsuccessful, does flush actually need to be done?
	 * Do it anyway, to clear the state.
	 */
	try_to_unmap_flush();
	if (result == SCAN_SUCCEED && nr_none &&
	    !shmem_charge(mapping->host, nr_none))
		result = SCAN_FAIL;
	if (result != SCAN_SUCCEED) {
		nr_none = 0;
		goto rollback;
	}

	/*
	 * The old folios are locked, so they won't change anymore.
	 */
	index = start;
	dst = folio_page(new_folio, 0);
	list_for_each_entry(folio, &pagelist, lru) {
		int i, nr_pages = folio_nr_pages(folio);

		while (index < folio->index) {
			clear_highpage(dst);
			index++;
			dst++;
		}

		for (i = 0; i < nr_pages; i++) {
			if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
				result = SCAN_COPY_MC;
				goto rollback;
			}
			index++;
			dst++;
		}
	}
	while (index < end) {
		clear_highpage(dst);
		index++;
		dst++;
	}
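	/*
	 * At this point every index in [start, end) has been filled in the
	 * new hugepage: folios from the pagelist were copied, and any holes
	 * (tracked in nr_none for shmem) plus the tail of the range were
	 * zero-filled with clear_highpage().
	 */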
	if (nr_none) {
		struct vm_area_struct *vma;
		int nr_none_check = 0;

		i_mmap_lock_read(mapping);
		xas_lock_irq(&xas);

		xas_set(&xas, start);
		for (index = start; index < end; index++) {
			if (!xas_next(&xas)) {
				xas_store(&xas, XA_RETRY_ENTRY);
				if (xas_error(&xas)) {
					result = SCAN_STORE_FAILED;
					goto immap_locked;
				}
				nr_none_check++;
			}
		}

		if (nr_none != nr_none_check) {
			result = SCAN_PAGE_FILLED;
			goto immap_locked;
		}
		/*
		 * If userspace observed a missing page in a VMA with
		 * a MODE_MISSING userfaultfd, then it might expect a
		 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
		 * roll back to avoid suppressing such an event. Wp/minor
		 * userfaultfds don't give userspace any guarantees that
		 * the kernel won't fill a missing page with a zero page,
		 * so they don't matter here.
		 *
		 * Any userfaultfds registered after this point will
		 * not be able to observe any missing pages due to the
		 * previously inserted retry entries.
		 */
		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
			if (userfaultfd_missing(vma)) {
				result = SCAN_EXCEED_NONE_PTE;
				goto immap_locked;
			}
		}
immap_locked:
		i_mmap_unlock_read(mapping);
		if (result != SCAN_SUCCEED) {
			xas_set(&xas, start);
			for (index = start; index < end; index++) {
				if (xas_next(&xas) == XA_RETRY_ENTRY)
					xas_store(&xas, NULL);
			}
	if (is_shmem)
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
	else
		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

	if (nr_none) {
		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
		/* nr_none is always 0 for non-shmem. */
		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
	}
	/*
	 * Mark new_folio as uptodate before inserting it into the
	 * page cache so that it isn't mistaken for a fallocated but
	 * unwritten page.
	 */
	folio_mark_uptodate(new_folio);
	folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

	if (is_shmem)
		folio_mark_dirty(new_folio);
	folio_add_lru(new_folio);

	/* Join all the small entries into a single multi-index entry. */
	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
	xas_store(&xas, new_folio);
	WARN_ON_ONCE(xas_error(&xas));
	xas_unlock_irq(&xas);
	/*
	 * Remove pte page tables, so we can re-fault the page as huge.
	 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
	 */
	retract_page_tables(mapping, start);
	if (cc && !cc->is_khugepaged)
		result = SCAN_PTE_MAPPED_HUGEPAGE;
	folio_unlock(new_folio);

	/*
	 * The collapse has succeeded, so free the old folios.
	 */
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio->mapping = NULL;
		folio_clear_active(folio);
		folio_clear_unevictable(folio);
		folio_unlock(folio);
		folio_put_refs(folio, 2 + folio_nr_pages(folio));
	}

	goto out;
rollback:
	/* Something went wrong: roll back page cache changes */
	if (nr_none) {
		xas_lock_irq(&xas);
		mapping->nrpages -= nr_none;
		xas_unlock_irq(&xas);
		shmem_uncharge(mapping->host, nr_none);
	}
	list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
		list_del(&folio->lru);
		folio_unlock(folio);
		folio_putback_lru(folio);
		folio_put(folio);
	}
	/*
	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
	 * file only. This undo is not needed unless failure is
	 * due to SCAN_COPY_MC.
	 */
	if (!is_shmem && result == SCAN_COPY_MC) {
		filemap_nr_thps_dec(mapping);
		/*
		 * Paired with the fence in do_dentry_open() -> get_write_access()
		 * to ensure the update to nr_thps is visible.
		 */
		smp_mb();
	}
		if (xa_is_value(folio)) {
			swap += 1 << xas_get_order(&xas);
			if (cc->is_khugepaged &&
			    swap > khugepaged_max_ptes_swap) {
				result = SCAN_EXCEED_SWAP_PTE;
				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
				break;
			}
			continue;
		}
		if (!folio_try_get(folio)) {
			xas_reset(&xas);
			continue;
		}

		if (unlikely(folio != xas_reload(&xas))) {
			folio_put(folio);
			xas_reset(&xas);
			continue;
		}
		if (folio_order(folio) == HPAGE_PMD_ORDER &&
		    folio->index == start) {
			/* Maybe PMD-mapped */
			result = SCAN_PTE_MAPPED_HUGEPAGE;
			/*
			 * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
			 * by the caller won't touch the page cache, and so
			 * it's safe to skip LRU and refcount checks before
			 * returning.
			 */
			folio_put(folio);
			break;
		}
		node = folio_nid(folio);
		if (hpage_collapse_scan_abort(node, cc)) {
			result = SCAN_SCAN_ABORT;
			folio_put(folio);
			break;
		}
		cc->node_load[node]++;
		if (!folio_test_lru(folio)) {
			result = SCAN_PAGE_LRU;
			folio_put(folio);
			break;
		}
		if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
			result = SCAN_PAGE_COUNT;
			folio_put(folio);
			break;
		}
		/*
		 * We probably should check if the folio is referenced
		 * here, but nobody would transfer pte_young() to
		 * folio_test_referenced() for us. And rmap walk here
		 * is just too costly...
		 */
	mm = slot->mm;
	/*
	 * Don't wait for semaphore (to avoid long wait times). Just move to
	 * the next mm on the list.
	 */
	vma = NULL;
	if (unlikely(!mmap_read_trylock(mm)))
		goto breakouterloop_mmap_lock;

	progress++;
	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
		goto breakouterloop;

	vma_iter_init(&vmi, mm, khugepaged_scan.address);
	for_each_vma(vmi, vma) {
		unsigned long hstart, hend;