// SPDX-License-Identifier: GPL-2.0 /* * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * * Page migration was first developed in the context of the memory hotplug * project. The main authors of the migration code are: * * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> * Hirokazu Takahashi <taka@valinux.co.jp> * Dave Hansen <haveblue@us.ibm.com> * Christoph Lameter
*/
/* * If we enable page migration for a page of a certain type by marking * it as movable, the page type must be sticky until the page gets freed * back to the buddy.
*/ if (PageOffline(page)) /* Only balloon compaction sets PageOffline pages movable. */ return offline_movable_ops; if (PageZsmalloc(page)) return zsmalloc_movable_ops;
return NULL;
}
/**
 * isolate_movable_ops_page - isolate a movable_ops page for migration
 * @page: The page.
 * @mode: The isolation mode.
 *
 * Try to isolate a movable_ops page for migration. Will fail if the page is
 * not a movable_ops page, if the page is already isolated for migration
 * or if the page was just released by its owner.
 *
 * Once isolated, the page cannot get freed until it is either putback
 * or migrated.
 *
 * Returns true if isolation succeeded, otherwise false.
*/ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
{ /* * TODO: these pages will not be folios in the future. All * folio dependencies will have to be removed.
*/ struct folio *folio = folio_get_nontail_page(page); conststruct movable_operations *mops;
/*
 * Avoid burning cycles with pages that are still under __free_pages(),
 * or just got freed under us.
 *
 * In case we 'win' a race for a movable page being freed under us and
 * raise its refcount, preventing __free_pages() from doing its job,
 * the put_page() at the end of this block will take care of
 * releasing this page, thus avoiding a nasty leakage.
*/ if (!folio) goto out;
/* * Check for movable_ops pages before taking the page lock because * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. * * Note that once a page has movable_ops, it will stay that way * until the page was freed.
*/ if (unlikely(!page_has_movable_ops(page))) goto out_putfolio;
/*
 * As movable pages are not isolated from LRU lists, concurrent
 * compaction threads can race against page migration functions
 * as well as against a page being released.
 *
 * In order to avoid having an already isolated movable page
 * being (wrongly) re-isolated while it is under migration,
 * or to avoid attempting to isolate pages being released,
 * let's be sure we have the page lock
 * before proceeding with the movable page isolation steps.
*/ if (unlikely(!folio_trylock(folio))) goto out_putfolio;
VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page); if (PageMovableOpsIsolated(page)) goto out_no_isolated;
mops = page_movable_ops(page); if (WARN_ON_ONCE(!mops)) goto out_no_isolated;
if (!mops->isolate_page(page, mode)) goto out_no_isolated;
/* Driver shouldn't use the isolated flag */
VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
SetPageMovableOpsIsolated(page);
folio_unlock(folio);
/** * putback_movable_ops_page - putback an isolated movable_ops page * @page: The isolated page. * * Putback an isolated movable_ops page. * * After the page was putback, it might get freed instantly.
*/ staticvoid putback_movable_ops_page(struct page *page)
{ /* * TODO: these pages will not be folios in the future. All * folio dependencies will have to be removed.
*/ struct folio *folio = page_folio(page);
/** * migrate_movable_ops_page - migrate an isolated movable_ops page * @dst: The destination page. * @src: The source page. * @mode: The migration mode. * * Migrate an isolated movable_ops page. * * If the src page was already released by its owner, the src page is * un-isolated (putback) and migration succeeds; the migration core will be the * owner of both pages. * * If the src page was not released by its owner and the migration was * successful, the owner of the src page and the dst page are swapped and * the src page is un-isolated. * * If migration fails, the ownership stays unmodified and the src page * remains isolated: migration may be retried later or the page can be putback. * * TODO: migration core will treat both pages as folios and lock them before * this call to unlock them after this call. Further, the folio refcounts on * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * * Returns 0 on success, otherwise a negative error code.
*/ staticint migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode)
{ int rc;
/* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() * and folio_isolate_hugetlb().
*/ void putback_movable_pages(struct list_head *l)
{ struct folio *folio; struct folio *folio2;
/* Must be called with an elevated refcount on the non-hugetlb folio */ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
{ if (folio_test_hugetlb(folio)) return folio_isolate_hugetlb(folio, list);
if (page_has_movable_ops(&folio->page)) { if (!isolate_movable_ops_page(&folio->page,
ISOLATE_UNEVICTABLE)) returnfalse;
} else { if (!folio_isolate_lru(folio)) returnfalse;
node_stat_add_folio(folio, NR_ISOLATED_ANON +
folio_is_file_lru(folio));
}
list_add(&folio->lru, list); returntrue;
}
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm)) returnfalse;
/* * The pmd entry mapping the old thp was flushed and the pte mapping * this subpage has been non present. If the subpage is only zero-filled * then map it to the shared zeropage.
*/ if (!pages_identical(page, ZERO_PAGE(0))) returnfalse;
/* pgoff is invalid for ksm pages, but they are never large */ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; new = folio_page(folio, idx);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
}
returntrue;
}
/* * Get rid of all migration entries and replace them by * references to the indicated page.
*/ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
{ struct rmap_walk_arg rmap_walk_arg = {
.folio = src,
.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
};
if (flags & RMP_LOCKED)
rmap_walk_locked(dst, &rwc); else
rmap_walk(dst, &rwc);
}
/* * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. * When we return from this function the fault will be retried.
*/ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsignedlong address)
{
spinlock_t *ptl;
pte_t *ptep;
pte_t pte;
swp_entry_t entry;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return;
pte = ptep_get(ptep);
pte_unmap(ptep);
if (!is_swap_pte(pte)) goto out;
entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto out;
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Wait for the migration of the hugetlb page mapped at @addr to finish.
 *
 * The vma read lock must be held upon entry. Holding that lock prevents either
 * the pte or the ptl from being freed.
 *
 * This function will release the vma lock before returning.
 */
void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
	pte_t pte;

	/*
	 * Sample the entry under the page table lock: both branches below
	 * expect ptl to be held and pte to hold the current entry.
	 */
	spin_lock(ptl);
	pte = huge_ptep_get(vma->vm_mm, addr, ptep);

	if (unlikely(!is_hugetlb_entry_migration(pte))) {
		/* Not a migration entry: nothing to wait for. */
		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
	} else {
		/*
		 * If migration entry existed, safe to release vma lock
		 * here because the pgtable page won't be freed without the
		 * pgtable lock released. See comment right above pgtable
		 * lock release in migration_entry_wait_on_locked().
		 */
		hugetlb_vma_unlock_read(vma);
		migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
	}
}
#endif
/* * Replace the folio in the mapping. * * The number of remaining references must be: * 1 for anonymous folios without a mapping * 2 for folios with a mapping * 3 for folios with a mapping and the private flag set.
*/ staticint __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count)
{
XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); long entries, i;
if (!mapping) { /* Take off deferred split queue while frozen and memcg set */ if (folio_test_large(folio) &&
folio_test_large_rmappable(folio)) { if (!folio_ref_freeze(folio, expected_count)) return -EAGAIN;
folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
/* No turning back from here */
newfolio->index = folio->index;
newfolio->mapping = folio->mapping; if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
xas_lock_irq(&xas); if (!folio_ref_freeze(folio, expected_count)) {
xas_unlock_irq(&xas); return -EAGAIN;
}
/* Take off deferred split queue while frozen and memcg set */
folio_unqueue_deferred_split(folio);
/* * Now we know that no one else is looking at the folio: * no turning back from here.
*/
newfolio->index = folio->index;
newfolio->mapping = folio->mapping; if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio); if (folio_test_swapcache(folio)) {
folio_set_swapcache(newfolio);
newfolio->private = folio_get_private(folio);
entries = nr;
} else {
entries = 1;
}
/* Move dirty while folio refs frozen and newfolio not yet exposed */
dirty = folio_test_dirty(folio); if (dirty) {
folio_clear_dirty(folio);
folio_set_dirty(newfolio);
}
/* Swap cache still stores N entries instead of a high-order entry */ for (i = 0; i < entries; i++) {
xas_store(&xas, newfolio);
xas_next(&xas);
}
/* * Drop cache reference from old folio by unfreezing * to one less reference. * We know this isn't the last reference.
*/
folio_ref_unfreeze(folio, expected_count - nr);
xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */
/* * If moved to a different zone then also account * the folio for that zone. Other VM counters will be * taken care of when we establish references to the * new folio and drop references to the old folio. * * Note that anonymous folios are accounted for * via NR_FILE_PAGES and NR_ANON_MAPPED if they * are mapped to swap space.
*/ if (newzone != oldzone) { struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg;
/* * The expected number of remaining references is the same as that * of folio_migrate_mapping().
*/ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src)); int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count) return -EAGAIN;
rc = folio_mc_copy(dst, src); if (unlikely(rc)) return rc;
xas_lock_irq(&xas); if (!folio_ref_freeze(src, expected_count)) {
xas_unlock_irq(&xas); return -EAGAIN;
}
/* * Copy the flags and some other ancillary information
*/ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{ int cpupid;
if (folio_test_referenced(folio))
folio_set_referenced(newfolio); if (folio_test_uptodate(folio))
folio_mark_uptodate(newfolio); if (folio_test_clear_active(folio)) {
VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
folio_set_active(newfolio);
} elseif (folio_test_clear_unevictable(folio))
folio_set_unevictable(newfolio); if (folio_test_workingset(folio))
folio_set_workingset(newfolio); if (folio_test_checked(folio))
folio_set_checked(newfolio); /* * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via * migration entries. We can still have PG_anon_exclusive set on an * effectively unmapped and unreferenced first sub-pages of an * anonymous THP: we can simply copy it here via PG_mappedtodisk.
*/ if (folio_test_mappedtodisk(folio))
folio_set_mappedtodisk(newfolio);
/* Move dirty on pages not done by folio_migrate_mapping() */ if (folio_test_dirty(folio))
folio_set_dirty(newfolio);
if (folio_test_young(folio))
folio_set_young(newfolio); if (folio_test_idle(folio))
folio_set_idle(newfolio);
folio_migrate_refs(newfolio, folio); /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page.
*/
cpupid = folio_xchg_last_cpupid(folio, -1); /* * For memory tiering mode, when migrate between slow and fast * memory node, reset cpupid, because that is used to record * page access time in slow memory node.
*/ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { bool f_toptier = node_is_toptier(folio_nid(folio)); bool t_toptier = node_is_toptier(folio_nid(newfolio));
if (f_toptier != t_toptier)
cpupid = -1;
}
folio_xchg_last_cpupid(newfolio, cpupid);
folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's * ksm_get_folio() depends upon ksm_migrate_page() and the * swapcache flag.
*/ if (folio_test_swapcache(folio))
folio_clear_swapcache(folio);
folio_clear_private(folio);
/* page->private contains hugetlb specific flags */ if (!folio_test_hugetlb(folio))
folio->private = NULL;
/* * If any waiters have accumulated on the new page then * wake them up.
*/ if (folio_test_writeback(newfolio))
folio_end_writeback(newfolio);
/*
 * PG_readahead shares the same bit with PG_reclaim. The
 * folio_end_writeback() call above may mistakenly clear PG_readahead,
 * so set the bit only after that.
*/ if (folio_test_readahead(folio))
folio_set_readahead(newfolio);
/* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) return -EAGAIN;
rc = folio_mc_copy(dst, src); if (unlikely(rc)) return rc;
rc = __folio_migrate_mapping(mapping, dst, src, expected_count); if (rc) return rc;
if (src_private)
folio_attach_private(dst, folio_detach_private(src));
folio_migrate_flags(dst, src); return 0;
}
/**
 * migrate_folio() - Simple folio migration.
 * @mapping: The address_space containing the folio.
 * @dst: The folio to migrate the data to.
 * @src: The folio containing the current data.
 * @mode: How to migrate the page.
 *
 * Common logic to directly migrate a single LRU folio suitable for
 * folios that do not have private data: the work is delegated to
 * __migrate_folio() with a NULL private pointer.
 *
 * Folios are locked upon entry and exit.
 *
 * Return: whatever __migrate_folio() returns.
 */
int migrate_folio(struct address_space *mapping, struct folio *dst,
		  struct folio *src, enum migrate_mode mode)
{
	/* Writeback must be complete before the folio may be moved. */
	BUG_ON(folio_test_writeback(src));

	return __migrate_folio(mapping, dst, src, NULL, mode);
}
EXPORT_SYMBOL(migrate_folio);
#ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ staticbool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode)
{ struct buffer_head *bh = head; struct buffer_head *failed_bh;
do { if (!trylock_buffer(bh)) { if (mode == MIGRATE_ASYNC) goto unlock; if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh)) goto unlock;
lock_buffer(bh);
}
bh = bh->b_this_page;
} while (bh != head);
returntrue;
unlock: /* We failed to lock the buffer and cannot stall. */
failed_bh = bh;
bh = head; while (bh != failed_bh) {
unlock_buffer(bh);
bh = bh->b_this_page;
}
head = folio_buffers(src); if (!head) return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
expected_count = folio_expected_ref_count(src) + 1; if (folio_ref_count(src) != expected_count) return -EAGAIN;
if (!buffer_migrate_lock_buffers(head, mode)) return -EAGAIN;
if (check_refs) { bool busy, migrating; bool invalidated = false;
rc = filemap_migrate_folio(mapping, dst, src, mode); if (rc) goto unlock_buffers;
bh = head; do {
folio_set_bh(bh, dst, bh_offset(bh));
bh = bh->b_this_page;
} while (bh != head);
unlock_buffers: if (check_refs)
clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head; do {
unlock_buffer(bh);
bh = bh->b_this_page;
} while (bh != head);
return rc;
}
/** * buffer_migrate_folio() - Migration function for folios with buffers. * @mapping: The address space containing @src. * @dst: The folio to migrate to. * @src: The folio to migrate from. * @mode: How to migrate the folio. * * This function can only be used if the underlying filesystem guarantees * that no other references to @src exist. For example attached buffer * heads are accessed only under the folio lock. If your filesystem cannot * provide this guarantee, buffer_migrate_folio_norefs() may be more * appropriate. * * Return: 0 on success or a negative errno on failure.
*/ int buffer_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode)
{ return __buffer_migrate_folio(mapping, dst, src, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_folio);
/** * buffer_migrate_folio_norefs() - Migration function for folios with buffers. * @mapping: The address space containing @src. * @dst: The folio to migrate to. * @src: The folio to migrate from. * @mode: How to migrate the folio. * * Like buffer_migrate_folio() except that this variant is more careful * and checks that there are also no buffer head references. This function * is the right one for mappings where buffer heads are directly looked * up and referenced (such as block device mappings). * * Return: 0 on success or a negative errno on failure.
*/ int buffer_migrate_folio_norefs(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode)
{ return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); #endif/* CONFIG_BUFFER_HEAD */
/*
 * Default handling if a filesystem does not provide a migration function.
 *
 * Warns once if the mapping implements ->writepages, refuses dirty folios
 * with -EBUSY, and otherwise tries to release the folio's private data
 * before handing over to migrate_folio().
 */
static int fallback_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ONCE(mapping->a_ops->writepages,
		  "%ps does not implement migrate_folio\n",
		  mapping->a_ops);

	if (folio_test_dirty(src))
		return -EBUSY;

	/*
	 * Filesystem may have private data at folio->private that we
	 * can't migrate automatically. Once it has been released, the
	 * simple migration path can be used.
	 */
	if (filemap_release_folio(src, GFP_KERNEL))
		return migrate_folio(mapping, dst, src, mode);

	/* Release failed: -EAGAIN only for full sync mode, else -EBUSY. */
	if (mode == MIGRATE_SYNC)
		return -EAGAIN;
	return -EBUSY;
}
/* * Move a src folio to a newly allocated dst folio. * * The src and dst folios are locked and the src folios was unmapped from * the page tables. * * On success, the src folio was replaced by the dst folio. * * Return value: * < 0 - error code * 0 - success
*/ staticint move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode)
{ struct address_space *mapping = folio_mapping(src); int rc = -EAGAIN;
if (!mapping)
rc = migrate_folio(mapping, dst, src, mode); elseif (mapping_inaccessible(mapping))
rc = -EOPNOTSUPP; elseif (mapping->a_ops->migrate_folio) /* * Most folios have a mapping and most filesystems * provide a migrate_folio callback. Anonymous folios * are part of swap space which also has its own * migrate_folio callback. This is the most common path * for page migration.
*/
rc = mapping->a_ops->migrate_folio(mapping, dst, src,
mode); else
rc = fallback_migrate_folio(mapping, dst, src, mode);
if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed.
*/ if (!folio_test_anon(src))
src->mapping = NULL;
if (likely(!folio_is_zone_device(dst)))
flush_dcache_folio(dst);
} return rc;
}
/*
 * To record some information during migration, we use unused private
 * field of struct folio of the newly allocated destination folio.
 * This is safe because nobody is using it except us.
 */
enum {
	/* The source folio was mapped and migration entries were installed. */
	PAGE_WAS_MAPPED = BIT(0),
	/* The source folio tested as mlocked when it was locked. */
	PAGE_WAS_MLOCKED = BIT(1),
	/* Mask covering every state bit above. */
	PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
};
/*
 * Restore the source folio to the original state upon failure.
 *
 * @src:             the source folio.
 * @page_was_mapped: non-zero if migration entries must be removed again.
 * @anon_vma:        anon_vma reference to drop, or NULL if none was taken.
 * @locked:          true if @src is locked and must be unlocked here.
 * @ret:             list to move @src to, or NULL to leave it where it is.
 */
static void migrate_folio_undo_src(struct folio *src, int page_was_mapped,
				   struct anon_vma *anon_vma, bool locked,
				   struct list_head *ret)
{
	/* Point the migration entries back at the source folio itself. */
	if (page_was_mapped)
		remove_migration_ptes(src, src, 0);

	/* Drop an anon_vma reference if we took one */
	if (anon_vma)
		put_anon_vma(anon_vma);

	if (locked)
		folio_unlock(src);

	if (ret)
		list_move_tail(&src->lru, ret);
}
/*
 * Restore the destination folio to the original state upon failure.
 *
 * @dst:           the destination folio.
 * @locked:        true if @dst is locked and must be unlocked here.
 * @put_new_folio: optional callback that takes @dst back; when NULL the
 *                 folio reference is simply dropped.
 * @private:       opaque value passed through to @put_new_folio.
 */
static void migrate_folio_undo_dst(struct folio *dst, bool locked,
				   free_folio_t put_new_folio,
				   unsigned long private)
{
	if (locked)
		folio_unlock(dst);

	if (!put_new_folio) {
		folio_put(dst);
		return;
	}

	put_new_folio(dst, private);
}
if (!folio_trylock(src)) { if (mode == MIGRATE_ASYNC) goto out;
/* * It's not safe for direct compaction to call lock_page. * For example, during page readahead pages are added locked * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, * avoid the use of lock_page for direct compaction * altogether.
*/ if (current->flags & PF_MEMALLOC) goto out;
/* * In "light" mode, we can wait for transient locks (eg * inserting a page into the page table), but it's not * worth waiting for I/O.
*/ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src)) goto out;
folio_lock(src);
}
locked = true; if (folio_test_mlocked(src))
old_page_state |= PAGE_WAS_MLOCKED;
if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much
*/ switch (mode) { case MIGRATE_SYNC: break; default:
rc = -EBUSY; goto out;
}
folio_wait_writeback(src);
}
/* * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock).
*/ if (folio_test_anon(src) && !folio_test_ksm(src))
anon_vma = folio_get_anon_vma(src);
/* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one * holding a reference to dst at this point. We used to have a BUG * here if folio_trylock(dst) fails, but would like to allow for * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG().
*/ if (unlikely(!folio_trylock(dst))) goto out;
dst_locked = true;
if (unlikely(page_has_movable_ops(&src->page))) {
__migrate_folio_record(dst, old_page_state, anon_vma); return 0;
}
/* * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory * offlining. Everywhere else except page reclaim, the page is * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed.
*/ if (!src->mapping) { if (folio_test_private(src)) {
try_to_free_buffers(src); goto out;
}
} elseif (folio_mapped(src)) { /* Establish migration ptes */
VM_BUG_ON_FOLIO(folio_test_anon(src) &&
!folio_test_ksm(src) && !anon_vma, src);
try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
old_page_state |= PAGE_WAS_MAPPED;
}
if (!folio_mapped(src)) {
__migrate_folio_record(dst, old_page_state, anon_vma); return 0;
}
out: /* * A folio that has not been unmapped will be restored to * right list unless we want to retry.
*/ if (rc == -EAGAIN)
ret = NULL;
if (unlikely(page_has_movable_ops(&src->page))) {
rc = migrate_movable_ops_page(&dst->page, &src->page, mode); if (rc) goto out; goto out_unlock_both;
}
rc = move_to_new_folio(dst, src, mode); if (rc) goto out;
/* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest.
*/
folio_add_lru(dst); if (old_page_state & PAGE_WAS_MLOCKED)
lru_add_drain();
if (old_page_state & PAGE_WAS_MAPPED)
remove_migration_ptes(src, dst, 0);
out_unlock_both:
folio_unlock(dst);
folio_set_owner_migrate_reason(dst, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter.
*/
folio_put(dst);
/* * A folio that has been migrated has all references removed * and will be freed.
*/
list_del(&src->lru); /* Drop an anon_vma reference if we took one */ if (anon_vma)
put_anon_vma(anon_vma);
folio_unlock(src);
migrate_folio_done(src, reason);
return rc;
out: /* * A folio that has not been migrated will be restored to * right list unless we want to retry.
*/ if (rc == -EAGAIN) {
list_add(&dst->lru, prev);
__migrate_folio_record(dst, old_page_state, anon_vma); return rc;
}
/*
 * Counterpart of unmap_and_move_page() for hugepage migration.
 *
 * This function does not wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate a hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_migrate() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then the pte is replaced with a migration swap entry and direct I/O
 * code will wait in the page fault for migration to complete.
*/ staticint unmap_and_move_huge_page(new_folio_t get_new_folio,
free_folio_t put_new_folio, unsignedlongprivate, struct folio *src, int force, enum migrate_mode mode, int reason, struct list_head *ret)
{ struct folio *dst; int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL;
if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */
folio_putback_hugetlb(src); return 0;
}
dst = get_new_folio(src, private); if (!dst) return -ENOMEM;
if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { case MIGRATE_SYNC: break; default: goto out;
}
folio_lock(src);
}
/* * Check for pages which are in the process of being freed. Without * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools.
*/ if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
rc = -EBUSY; goto out_unlock;
}
if (folio_test_anon(src))
anon_vma = folio_get_anon_vma(src);
if (unlikely(!folio_trylock(dst))) goto put_anon;
if (folio_mapped(src)) { enum ttu_flags ttu = 0;
if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. Because of this, take * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock.
*/
mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon;
ttu = TTU_RMAP_LOCKED;
}
try_to_migrate(src, ttu);
page_was_mapped = 1;
if (ttu & TTU_RMAP_LOCKED)
i_mmap_unlock_write(mapping);
}
if (!folio_mapped(src))
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
remove_migration_ptes(src, !rc ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
put_anon: if (anon_vma)
put_anon_vma(anon_vma);
if (!rc) {
move_hugetlb_state(src, dst, reason);
put_new_folio = NULL;
}
/* * If migration was not successful and there's a freeing callback, * return the folio to that special allocator. Otherwise, simply drop * our additional reference.
*/ if (put_new_folio)
put_new_folio(dst, private); else
folio_put(dst);
/**
 * struct migrate_pages_stats - counters describing the outcome of a migration
 * @nr_succeeded:     Normal and large folios migrated successfully, counted
 *                    in units of base pages.
 * @nr_failed_pages:  Normal and large folios that failed to be migrated,
 *                    counted in units of base pages.  Folios that were never
 *                    tried are not included.
 * @nr_thp_succeeded: THPs migrated successfully.
 * @nr_thp_failed:    THPs that failed to be migrated.
 * @nr_thp_split:     THPs that were split before migrating.
 * @nr_split:         Large folios (THPs included) that were split before
 *                    migrating.
 */
struct migrate_pages_stats {
	int nr_succeeded;
	int nr_failed_pages;
	int nr_thp_succeeded;
	int nr_thp_failed;
	int nr_thp_split;
	int nr_split;
};
/*
 * migrate_hugetlbs - migrate the hugetlb folios found on @from.
 *
 * Folios on @from that are not hugetlb folios are skipped and left where they
 * are.  Every hugetlb folio is handed to unmap_and_move_huge_page(); folios
 * that come back with -EAGAIN stay on @from and are retried on the next pass,
 * for at most NR_MAX_MIGRATE_PAGES_RETRY passes.  From the fourth pass on
 * (pass > 2) the migration is requested with force set.
 *
 * Per-page accounting goes to @stats; folios that failed permanently are
 * moved to @ret_folios.
 *
 * Returns the number of hugetlb folios that were not migrated, or an error code
 * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
 * any more because the list has become empty or no retryable hugetlb folios
 * exist any more. It is caller's responsibility to call putback_movable_pages()
 * only if ret != 0.
 */
static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
			    free_folio_t put_new_folio, unsigned long private,
			    enum migrate_mode mode, int reason,
			    struct migrate_pages_stats *stats,
			    struct list_head *ret_folios)
{
	int retry = 1;
	int nr_failed = 0;
	int nr_retry_pages = 0;
	int pass = 0;
	struct folio *folio, *folio2;
	int rc, nr_pages;

	for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
		/* Only folios that ask for a retry in *this* pass count. */
		retry = 0;
		nr_retry_pages = 0;

		list_for_each_entry_safe(folio, folio2, from, lru) {
			if (!folio_test_hugetlb(folio))
				continue;

			nr_pages = folio_nr_pages(folio);

			cond_resched();

			/*
			 * Migratability of hugepages depends on architectures and
			 * their size.  This check is necessary because some callers
			 * of hugepage migration like soft offline and memory
			 * hotremove don't walk through page tables or check whether
			 * the hugepage is pmd-based or not before kicking migration.
			 */
			if (!hugepage_migration_supported(folio_hstate(folio))) {
				nr_failed++;
				stats->nr_failed_pages += nr_pages;
				list_move_tail(&folio->lru, ret_folios);
				continue;
			}

			rc = unmap_and_move_huge_page(get_new_folio,
						      put_new_folio, private,
						      folio, pass > 2, mode,
						      reason, ret_folios);
			/*
			 * The rules are:
			 *	0: hugetlb folio will be put back
			 *	-EAGAIN: stay on the from list
			 *	-ENOMEM: stay on the from list
			 *	Other errno: put on ret_folios list
			 */
			switch (rc) {
			case -ENOMEM:
				/*
				 * When memory is low, don't bother to try to migrate
				 * other folios, just exit.
				 */
				stats->nr_failed_pages += nr_pages + nr_retry_pages;
				return -ENOMEM;
			case -EAGAIN:
				retry++;
				nr_retry_pages += nr_pages;
				break;
			case 0:
				stats->nr_succeeded += nr_pages;
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, etc.):
				 * unlike -EAGAIN case, the failed folio is
				 * removed from migration folio list and not
				 * retried in the next outer loop.
				 */
				nr_failed++;
				stats->nr_failed_pages += nr_pages;
				break;
			}
		}
	}
	/*
	 * nr_failed is number of hugetlb folios failed to be migrated.  After
	 * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
	 * folios as failed.
	 */
	nr_failed += retry;
	stats->nr_failed_pages += nr_retry_pages;

	return nr_failed;
}
staticvoid migrate_folios_move(struct list_head *src_folios, struct list_head *dst_folios,
free_folio_t put_new_folio, unsignedlongprivate, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct migrate_pages_stats *stats, int *retry, int *thp_retry, int *nr_failed, int *nr_retry_pages)
{ struct folio *folio, *folio2, *dst, *dst2; bool is_thp; int nr_pages; int rc;
/*
 * migrate_pages_batch() first unmaps as many folios in the from list as
 * possible, then moves the unmapped folios.
 *
 * Migration is only batched if mode == MIGRATE_ASYNC, to avoid waiting on a
 * lock or bit while more than one folio is locked, which may cause a
 * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
 * length of the from list must be <= 1.
*/ staticint migrate_pages_batch(struct list_head *from,
new_folio_t get_new_folio, free_folio_t put_new_folio, unsignedlongprivate, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct list_head *split_folios, struct migrate_pages_stats *stats, int nr_pass)
{ int retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_retry_pages = 0; int pass = 0; bool is_thp = false; bool is_large = false; struct folio *folio, *folio2, *dst = NULL; int rc, rc_saved = 0, nr_pages;
LIST_HEAD(unmap_folios);
LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED);
/* * The rare folio on the deferred split list should * be split now. It should not count as a failure: * but increment nr_failed because, without doing so, * migrate_pages() may report success with (split but * unmigrated) pages still on its fromlist; whereas it * always reports success when its fromlist is empty. * stats->nr_thp_failed should be increased too, * otherwise stats inconsistency will happen when * migrate_pages_batch is called via migrate_pages() * with MIGRATE_SYNC and MIGRATE_ASYNC. * * Only check it without removing it from the list. * Since the folio can be on deferred_split_scan() * local list and removing it can cause the local list * corruption. Folio split process below can handle it * with the help of folio_ref_freeze(). * * nr_pages > 2 is needed to avoid checking order-1 * page cache folios. They exist, in contrast to * non-existent order-1 anonymous folios, and do not * use _deferred_list.
*/ if (nr_pages > 2 &&
!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio)) { if (!try_split_folio(folio, split_folios, mode)) {
nr_failed++;
stats->nr_thp_failed += is_thp;
stats->nr_thp_split += is_thp;
stats->nr_split++; continue;
}
}
/* * Large folio migration might be unsupported or * the allocation might be failed so we should retry * on the same folio with the large folio split * to normal folios. * * Split folios are put in split_folios, and * we will migrate them after the rest of the * list is processed.
*/ if (!thp_migration_supported() && is_thp) {
nr_failed++;
stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios, mode)) {
stats->nr_thp_split++;
stats->nr_split++; continue;
}
stats->nr_failed_pages += nr_pages;
list_move_tail(&folio->lru, ret_folios); continue;
}
/* * If we are holding the last folio reference, the folio * was freed from under us, so just drop our reference.
*/ if (likely(!page_has_movable_ops(&folio->page)) &&
folio_ref_count(folio) == 1) {
folio_clear_active(folio);
folio_clear_unevictable(folio);
list_del(&folio->lru);
migrate_folio_done(folio, reason);
stats->nr_succeeded += nr_pages;
stats->nr_thp_succeeded += is_thp; continue;
}
rc = migrate_folio_unmap(get_new_folio, put_new_folio, private, folio, &dst, mode, ret_folios); /* * The rules are: * 0: folio will be put on unmap_folios list, * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list
*/ switch(rc) { case -ENOMEM: /* * When memory is low, don't bother to try to migrate * other folios, move unmapped folios, then exit.
*/
nr_failed++;
stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (is_large && !nosplit) { int ret = try_split_folio(folio, split_folios, mode);
if (!ret) {
stats->nr_thp_split += is_thp;
stats->nr_split++; break;
} elseif (reason == MR_LONGTERM_PIN &&
ret == -EAGAIN) { /* * Try again to split large folio to * mitigate the failure of longterm pinning.
*/
retry++;
thp_retry += is_thp;
nr_retry_pages += nr_pages; /* Undo duplicated failure counting. */
nr_failed--;
stats->nr_thp_failed -= is_thp; break;
}
}
stats->nr_failed_pages += nr_pages + nr_retry_pages; /* nr_failed isn't updated for not used */
stats->nr_thp_failed += thp_retry;
rc_saved = rc; if (list_empty(&unmap_folios)) goto out; else goto move; case -EAGAIN:
retry++;
thp_retry += is_thp;
nr_retry_pages += nr_pages; break; case 0:
list_move_tail(&folio->lru, &unmap_folios);
list_add_tail(&dst->lru, &dst_folios); break; default: /* * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed folio is * removed from migration folio list and not * retried in the next outer loop.
*/
nr_failed++;
stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages; break;
}
}
}
nr_failed += retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
move: /* Flush TLBs for all unmapped folios */
try_to_unmap_flush();
memset(&astats, 0, sizeof(astats)); /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
reason, &folios, split_folios, &astats,
NR_MAX_MIGRATE_ASYNC_RETRY);
stats->nr_succeeded += astats.nr_succeeded;
stats->nr_thp_succeeded += astats.nr_thp_succeeded;
stats->nr_thp_split += astats.nr_thp_split;
stats->nr_split += astats.nr_split; if (rc < 0) {
stats->nr_failed_pages += astats.nr_failed_pages;
stats->nr_thp_failed += astats.nr_thp_failed;
list_splice_tail(&folios, ret_folios); return rc;
}
stats->nr_thp_failed += astats.nr_thp_split; /* * Do not count rc, as pages will be retried below. * Count nr_split only, since it includes nr_thp_split.
*/
nr_failed += astats.nr_split; /* * Fall back to migrate all failed folios one by one synchronously. All * failed folios except split THPs will be retried, so their failure * isn't counted
*/
list_splice_tail_init(&folios, from); while (!list_empty(from)) {
list_move(from->next, &folios);
rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, ret_folios,
split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
list_splice_tail_init(&folios, ret_folios); if (rc < 0) return rc;
nr_failed += rc;
}
return nr_failed;
}
/*
 * migrate_pages - migrate the folios specified in a list, to the free folios
 *		   supplied as the target for the page migration
 *
 * @from:		The list of folios to be migrated.
 * @get_new_folio:	The function used to allocate free folios to be used
 *			as the target of the folio migration.
 * @put_new_folio:	The function used to free target folios if migration
 *			fails, or NULL if no special handling is necessary.
 * @private:		Private data to be passed on to get_new_folio()
 * @mode:		The migration mode that specifies the constraints for
 *			folio migration, if any.
 * @reason:		The reason for folio migration.
 * @ret_succeeded:	Set to the number of folios migrated successfully if
 *			the caller passes a non-NULL pointer.
 *
 * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts, or earlier
 * if no folios are movable any more because the list has become empty or no
 * retryable folios exist any more. It is the caller's responsibility to call
 * putback_movable_pages() only if ret != 0.
 *
 * Returns the number of folios (normal, large or hugetlb) that were not
 * migrated, or an error code. Each large folio that was split is counted as
 * one non-migrated large folio, no matter how many of its split folios are
 * migrated successfully.
*/ int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
free_folio_t put_new_folio, unsignedlongprivate, enum migrate_mode mode, int reason, unsignedint *ret_succeeded)
{ int rc, rc_gather; int nr_pages; struct folio *folio, *folio2;
LIST_HEAD(folios);
LIST_HEAD(ret_folios);
LIST_HEAD(split_folios); struct migrate_pages_stats stats;
again:
nr_pages = 0;
list_for_each_entry_safe(folio, folio2, from, lru) { /* Retried hugetlb folios will be kept in list */ if (folio_test_hugetlb(folio)) {
list_move_tail(&folio->lru, &ret_folios); continue;
}
nr_pages += folio_nr_pages(folio); if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break;
} if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
list_cut_before(&folios, from, &folio2->lru); else
list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC)
rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios,
&split_folios, &stats,
NR_MAX_MIGRATE_PAGES_RETRY); else
rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios,
&split_folios, &stats);
list_splice_tail_init(&folios, &ret_folios); if (rc < 0) {
rc_gather = rc;
list_splice_tail(&split_folios, &ret_folios); goto out;
} if (!list_empty(&split_folios)) { /* * Failure isn't counted since all split folios of a large folio * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once.
*/
migrate_pages_batch(&split_folios, get_new_folio,
put_new_folio, private, MIGRATE_ASYNC, reason,
&ret_folios, NULL, &stats, 1);
list_splice_tail_init(&split_folios, &ret_folios);
}
rc_gather += rc; if (!list_empty(from)) goto again;
out: /* * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller.
*/
list_splice(&ret_folios, from);
/* * Return 0 in case all split folios of fail-to-migrate large folios * are migrated successfully.
*/ if (list_empty(from))
rc_gather = 0;
/*
 * Store @value into @nr consecutive entries of the user-space @status array,
 * starting at index @start.
 *
 * Returns 0 once all entries have been written, or -EFAULT as soon as a
 * put_user() call fails (entries written before the failure stay written).
 * A non-positive @nr writes nothing and returns 0.
 */
static int store_status(int __user *status, int start, int value, int nr)
{
	while (nr-- > 0) {
		if (put_user(value, status + start))
			return -EFAULT;
		start++;
	}

	return 0;
}
/* * Resolves the given address to a struct folio, isolates it from the LRU and * puts it to the given pagelist. * Returns: * errno - if the folio cannot be found/isolated * 0 - when it doesn't have to be migrated because it is already on the * target node * 1 - when it has been queued
*/ staticint add_folio_for_migration(struct mm_struct *mm, constvoid __user *p, int node, struct list_head *pagelist, bool migrate_all)
{ struct vm_area_struct *vma; struct folio_walk fw; struct folio *folio; unsignedlong addr; int err = -EFAULT;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.