// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel.
*/
/*
 * Per-invocation reclaim parameters and accumulated state, passed down
 * through the whole vmscan call chain.
 */
struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long	anon_cost;
	unsigned long	file_cost;

	/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
	int *proactive_swappiness;

	/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped folios be reclaimed? */
	unsigned int may_unmap:1;

	/* Can folios be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* Not allow cache_trim_mode to be turned on as part of reclaim? */
	unsigned int no_cache_trim_mode:1;

	/* Has cache_trim_mode failed at least once? */
	unsigned int cache_trim_mode_failed:1;

	/* Proactive reclaim invoked by userspace */
	unsigned int proactive:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the protected
	 * memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	/* Shared cgroup tree walk failed, rescan the whole tree */
	unsigned int memcg_full_walk:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file folios on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate folios for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};
/*
 * Prefetch (for write) the folio preceding @_folio on its LRU list,
 * unless @_folio is the first entry (i.e. its ->prev is the list head
 * @_base).  No-op on architectures without prefetchw support.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != _base) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&(_folio->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
/*
 * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
 */
int vm_swappiness = 60;
#ifdef CONFIG_MEMCG
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */ staticbool cgroup_reclaim(struct scan_control *sc)
{ return sc->target_mem_cgroup;
}
/* * Returns true for reclaim on the root cgroup. This is true for direct * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
*/ staticbool root_reclaim(struct scan_control *sc)
{ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
}
/** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational.
*/ staticbool writeback_throttling_sane(struct scan_control *sc)
{ if (!cgroup_reclaim(sc)) returntrue; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) returntrue; #endif returnfalse;
}
/*
 * for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
 * and including the specified highidx
 * @zone: The current zone in the iterator
 * @pgdat: The pgdat which node_zones are being iterated
 * @idx: The index variable
 * @highidx: The index of the highest zone to return
 *
 * This macro iterates through all managed zones up to and including the specified highidx.
 * The zone iterator enters an invalid state after macro call and must be reinitialized
 * before it can be used again.
 */
#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
	    (idx) <= (highidx);					\
	    (idx)++, (zone)++)					\
		if (!managed_zone(zone))			\
			continue;				\
		else
/*
 * Install (or clear, when @rs is NULL) the per-task reclaim_state used to
 * account pages freed outside the LRU scan (e.g. by shrinkers).
 */
static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}
/*
 * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
 * scan_control->nr_reclaimed.
 */
static void flush_reclaim_state(struct scan_control *sc)
{
	/*
	 * Currently, reclaim_state->reclaimed includes three types of pages
	 * freed outside of vmscan:
	 * (1) Slab pages.
	 * (2) Clean file pages from pruned inodes (on highmem systems).
	 * (3) XFS freed buffer pages.
	 *
	 * For all of these cases, we cannot universally link the pages to a
	 * single memcg. For example, a memcg-aware shrinker can free one object
	 * charged to the target memcg, causing an entire page to be freed.
	 * If we count the entire page as reclaimed from the memcg, we end up
	 * overestimating the reclaimed amount (potentially under-reclaiming).
	 *
	 * Only count such pages for global reclaim to prevent under-reclaiming
	 * from the target memcg; preventing unnecessary retries during memcg
	 * charging and false positives from proactive reclaim.
	 *
	 * For uncommon cases where the freed pages were actually mostly
	 * charged to the target memcg, we end up underestimating the reclaimed
	 * amount. This should be fine. The freed pages will be uncharged
	 * anyway, even if they are not counted here properly, and we will be
	 * able to make forward progress in charging (which is usually in a
	 * retry loop).
	 *
	 * We can go one step further, and report the uncharged objcg pages in
	 * memcg reclaim, to make reporting more accurate and reduce
	 * underestimation, but it's probably not worth the complexity for now.
	 */
	if (current->reclaim_state && root_reclaim(sc)) {
		sc->nr_reclaimed += current->reclaim_state->reclaimed;
		current->reclaim_state->reclaimed = 0;
	}
}
if (!numa_demotion_enabled) returnfalse; if (sc && sc->no_demotion) returnfalse;
demotion_nid = next_demotion_node(nid); if (demotion_nid == NUMA_NO_NODE) returnfalse;
/* If demotion node isn't in the cgroup's mems_allowed, fall back */ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
staticinlinebool can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, struct scan_control *sc)
{ if (memcg == NULL) { /* * For non-memcg reclaim, is there * space in any swap device?
*/ if (get_nr_swap_pages() > 0) returntrue;
} else { /* Is the memcg below its swap limit? */ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) returntrue;
}
/* * The page can not be swapped. * * Can it be reclaimed from this node via demotion?
*/ return can_demote(nid, sc, memcg);
}
/*
 * This misses isolated folios which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated folios will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	/* Anon pages only count when they can be swapped or demoted. */
	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
	/*
	 * If there are no reclaimable file-backed or anonymous pages,
	 * ensure zones with sufficient free pages are not skipped.
	 * This prevents zones like DMA32 from being ignored in reclaim
	 * scenarios where they can still help alleviate memory pressure.
	 */
	if (nr == 0)
		nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
	return nr;
}
/** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/ staticunsignedlong lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{ unsignedlong size = 0; int zid; struct zone *zone;
if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; if (sc->proactive) return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
static inline int is_page_cache_freeable(struct folio *folio)
{
	/*
	 * A freeable page cache folio is referenced only by the caller
	 * that isolated the folio, the page cache and optional filesystem
	 * private data at folio->private.
	 */
	return folio_ref_count(folio) - folio_test_private(folio) ==
		1 + folio_nr_pages(folio);
}
/*
 * We detected a synchronous write error writing a folio out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the folio and once
 * that folio is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping folio_lock() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct folio *folio, int error)
{
	folio_lock(folio);
	/* Re-check the mapping under the folio lock before touching it. */
	if (folio_mapping(folio) == mapping)
		mapping_set_error(mapping, error);
	folio_unlock(folio);
}
staticbool skip_throttle_noprogress(pg_data_t *pgdat)
{ int reclaimable = 0, write_pending = 0; int i; struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM.
*/ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) returntrue;
/* * If there are a lot of dirty/writeback folios then do not * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback.
*/
for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
} if (2 * write_pending <= reclaimable) returntrue;
/* * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads).
*/ if (!current_is_kswapd() &&
current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
cond_resched(); return;
}
/* * These figures are pulled out of thin air. * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many * parallel reclaimers which is a short-lived event so the timeout is * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU.
*/ switch(reason) { case VMSCAN_THROTTLE_WRITEBACK:
timeout = HZ/10;
if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
WRITE_ONCE(pgdat->nr_reclaim_start,
node_page_state(pgdat, NR_THROTTLED_WRITTEN));
}
break; case VMSCAN_THROTTLE_CONGESTED:
fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: if (skip_throttle_noprogress(pgdat)) {
cond_resched(); return;
}
/*
 * Account for folios written if tasks are throttled waiting on dirty
 * folios to clean. If enough folios have been cleaned since throttling
 * started then wakeup the throttled tasks.
 */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
							int nr_throttled)
{
	unsigned long nr_written;

	node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);

	/*
	 * This is an inaccurate read as the per-cpu deltas may not
	 * be synchronised. However, given that the system is
	 * writeback throttled, it is not worth taking the penalty
	 * of getting an accurate count. At worst, the throttle
	 * timeout guarantees forward progress.
	 */
	nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
		READ_ONCE(pgdat->nr_reclaim_start);

	if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
		wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}
/* possible outcome of pageout() */
typedef enum {
	/* failed to write folio out, folio is locked */
	PAGE_KEEP,
	/* move folio to the active list, folio is locked */
	PAGE_ACTIVATE,
	/* folio has been sent to the disk successfully, folio is unlocked */
	PAGE_SUCCESS,
	/* folio is clean and locked */
	PAGE_CLEAN,
} pageout_t;
/* * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled * or we failed to allocate contiguous swap entries, in which case * the split out folios get added back to folio_list.
*/ if (shmem_mapping(mapping))
res = shmem_writeout(folio, plug, folio_list); else
res = swap_writeout(folio, plug);
if (res < 0)
handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) {
folio_clear_reclaim(folio); return PAGE_ACTIVATE;
}
/* synchronous write? */ if (!folio_test_writeback(folio))
folio_clear_reclaim(folio);
/*
 * pageout is called by shrink_folio_list() for each dirty folio.
 */
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
			 struct swap_iocb **plug, struct list_head *folio_list)
{
	/*
	 * We no longer attempt to writeback filesystem folios here, other
	 * than tmpfs/shmem. That's taken care of in page-writeback.
	 * If we find a dirty filesystem folio at the end of the LRU list,
	 * typically that means the filesystem is saturating the storage
	 * with contiguous writes and telling it to write a folio here
	 * would only make the situation worse by injecting an element
	 * of random access.
	 *
	 * If the folio is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(folio))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned folios can have
		 * folio->mapping == NULL while being dirty with clean buffers.
		 */
		if (folio_test_private(folio)) {
			if (try_to_free_buffers(folio)) {
				folio_clear_dirty(folio);
				pr_info("%s: orphaned folio\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	/* Only shmem and anon folios are written back from reclaim context. */
	if (!shmem_mapping(mapping) && !folio_test_anon(folio))
		return PAGE_ACTIVATE;
	if (!folio_clear_dirty_for_io(folio))
		return PAGE_CLEAN;

	return writeout(folio, mapping, plug, folio_list);
}
/*
 * Same as remove_mapping, but if the folio is removed from the mapping, it
 * gets returned with a refcount of 0.
 *
 * NOTE(review): the mangled source lost the interior of the swapcache branch
 * and the "} else {" header declaring free_folio (it was used undeclared).
 * Both have been restored per the upstream function — confirm against the
 * tree this file came from.
 */
static int __remove_mapping(struct address_space *mapping, struct folio *folio,
			    bool reclaimed, struct mem_cgroup *target_memcg)
{
	int refcount;
	void *shadow = NULL;

	if (!folio_test_swapcache(folio))
		spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	/*
	 * The non racy check for a busy folio.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the folio, it may be possible that they dirty it then
	 * drop the reference. So if the dirty flag is tested before the
	 * refcount here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!folio_test_dirty(folio)    [good]
	 * folio_set_dirty(folio);
	 * folio_put(folio);
	 *				!refcount(folio)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
	 * load is not satisfied before that of folio->_refcount.
	 *
	 * Note that if the dirty flag is always set via folio_mark_dirty,
	 * and thus under the i_pages lock, then this ordering is not required.
	 */
	refcount = 1 + folio_nr_pages(folio);
	if (!folio_ref_freeze(folio, refcount))
		goto cannot_free;
	/* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
	if (unlikely(folio_test_dirty(folio))) {
		folio_ref_unfreeze(folio, refcount);
		goto cannot_free;
	}

	if (folio_test_swapcache(folio)) {
		swp_entry_t swap = folio->swap;

		if (reclaimed && !mapping_exiting(mapping))
			shadow = workingset_eviction(folio, target_memcg);
		__delete_from_swap_cache(folio, swap, shadow);
		xa_unlock_irq(&mapping->i_pages);
		put_swap_folio(folio, swap);
	} else {
		void (*free_folio)(struct folio *);

		free_folio = mapping->a_ops->free_folio;
		/*
		 * Remember a shadow entry for reclaimed file cache in
		 * order to detect refaults, thus thrashing, later on.
		 *
		 * But don't store shadows in an address space that is
		 * already exiting.  This is not just an optimization,
		 * inode reclaim needs to empty out the radix tree or
		 * the nodes are lost.  Don't plant shadows behind its
		 * back.
		 *
		 * We also don't store shadows for DAX mappings because the
		 * only page cache folios found in these are zero pages
		 * covering holes, and because we don't want to mix DAX
		 * exceptional entries and shadow exceptional entries in the
		 * same address_space.
		 */
		if (reclaimed && folio_is_file_lru(folio) &&
		    !mapping_exiting(mapping) && !dax_mapping(mapping))
			shadow = workingset_eviction(folio, target_memcg);
		__filemap_remove_folio(folio, shadow);
		xa_unlock_irq(&mapping->i_pages);
		if (mapping_shrinkable(mapping))
			inode_add_lru(mapping->host);
		spin_unlock(&mapping->host->i_lock);

		if (free_folio)
			free_folio(folio);
	}

	return 1;

cannot_free:
	xa_unlock_irq(&mapping->i_pages);
	if (!folio_test_swapcache(folio))
		spin_unlock(&mapping->host->i_lock);
	return 0;
}
/**
 * remove_mapping() - Attempt to remove a folio from its mapping.
 * @mapping: The address space.
 * @folio: The folio to remove.
 *
 * If the folio is dirty, under writeback or if someone else has a ref
 * on it, removal will fail.
 * Return: The number of pages removed from the mapping.  0 if the folio
 * could not be removed.
 * Context: The caller should have a single refcount on the folio and
 * hold its lock.
 */
long remove_mapping(struct address_space *mapping, struct folio *folio)
{
	if (__remove_mapping(mapping, folio, false, NULL)) {
		/*
		 * Unfreezing the refcount with 1 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		folio_ref_unfreeze(folio, 1);
		return folio_nr_pages(folio);
	}
	return 0;
}
/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Add previously isolated @folio to appropriate LRU list.
 * The folio may still be unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void folio_putback_lru(struct folio *folio)
{
	folio_add_lru(folio);
	folio_put(folio);	/* drop ref from isolate */
}
#ifdef CONFIG_LRU_GEN
/*
 * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
 * needs to be done by taking the folio off the LRU list and then adding it back
 * with PG_active set. In contrast, the aging (page table walk) path uses
 * folio_update_gen().
 *
 * NOTE(review): the true path and closing brace were lost in the mangled
 * source; restored per the upstream function — confirm against the tree
 * this file came from.
 */
static bool lru_gen_set_refs(struct folio *folio)
{
	/* see the comment on LRU_REFS_FLAGS */
	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
		set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
		return false;
	}

	set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
	return true;
}
/* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. * Let the folio, now marked Mlocked, be moved to the unevictable list.
*/ if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE;
/* * There are two cases to consider. * 1) Rmap lock contention: rotate. * 2) Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process.
*/ if (referenced_ptes == -1) return FOLIOREF_KEEP;
if (lru_gen_enabled()) { if (!referenced_ptes) return FOLIOREF_RECLAIM;
if (referenced_ptes) { /* * All mapped folios start out with page table * references from the instantiating fault, so we need * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * * Note: the mark is set for activated folios as well * so that recently deactivated but used folios are * quickly recovered.
*/
folio_set_referenced(folio);
if (referenced_folio || referenced_ptes > 1) return FOLIOREF_ACTIVATE;
/* * Activate file-backed executable folios after first usage.
*/ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return FOLIOREF_ACTIVATE;
return FOLIOREF_KEEP;
}
/* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) return FOLIOREF_RECLAIM_CLEAN;
return FOLIOREF_RECLAIM;
}
/* Check if a folio is dirty or under writeback */
static void folio_check_dirty_writeback(struct folio *folio,
				     bool *dirty, bool *writeback)
{
	struct address_space *mapping;

	/*
	 * Anonymous folios are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them.
	 * MADV_FREE anonymous folios are put into inactive file list too.
	 * They could be mistakenly treated as file lru. So further anon
	 * test is needed.
	 */
	if (!folio_is_file_lru(folio) ||
	    (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* By default assume that the folio flags are accurate */
	*dirty = folio_test_dirty(folio);
	*writeback = folio_test_writeback(folio);

	/* Verify dirty/writeback state if the filesystem supports it */
	if (!folio_test_private(folio))
		return;

	/*
	 * NOTE(review): the tail below was lost in the mangled source;
	 * restored per the upstream function — confirm against the tree
	 * this file came from.
	 */
	mapping = folio_mapping(folio);
	if (mapping && mapping->a_ops->is_dirty_writeback)
		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
allowed_mask = mtc->nmask; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are * low on free memory on target node. If we don't do this and if * we have free memory on the slower(lower) memtier, we would start * allocating pages from slower(lower) memory tiers without even forcing * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers.
*/
mtc->nmask = NULL;
mtc->gfp_mask |= __GFP_THISNODE;
dst = alloc_migration_target(src, (unsignedlong)mtc); if (dst) return dst;
/*
 * Take folios on @demote_folios and attempt to demote them to another node.
 * Folios which are not demoted are left on @demote_folios.
 */
static unsigned int demote_folio_list(struct list_head *demote_folios,
				     struct pglist_data *pgdat)
{
	int target_nid = next_demotion_node(pgdat->node_id);
	unsigned int nr_succeeded;
	nodemask_t allowed_mask;

	struct migration_target_control mtc = {
		/*
		 * Allocate from 'node', or fail quickly and quietly.
		 * When this happens, 'page' will likely just be discarded
		 * instead of migrated.
		 */
		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
			__GFP_NOMEMALLOC | GFP_NOWAIT,
		.nid = target_nid,
		.nmask = &allowed_mask,
		.reason = MR_DEMOTION,
	};

	if (list_empty(demote_folios))
		return 0;

	if (target_nid == NUMA_NO_NODE)
		return 0;

	node_get_allowed_targets(pgdat, &allowed_mask);

	/* Demotion ignores all cpuset and mempolicy settings */
	migrate_pages(demote_folios, alloc_demote_folio, NULL,
		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
		      &nr_succeeded);

	return nr_succeeded;
}
staticbool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
{ if (gfp_mask & __GFP_FS) returntrue; if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) returnfalse; /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe.
*/ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}
if (folio_contain_hwpoisoned_page(folio)) { /* * unmap_poisoned_folio() can't handle large * folio, just skip it. memory_failure() will * handle it if the UCE is triggered again.
*/ if (folio_test_large(folio)) goto keep_locked;
/* Account the number of base pages */
sc->nr_scanned += nr_pages;
if (unlikely(!folio_evictable(folio))) goto activate_locked;
if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked;
/* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing * folios if the tail of the LRU is all dirty unqueued folios.
*/
folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback)
stat->nr_dirty += nr_pages;
if (dirty && !writeback)
stat->nr_unqueued_dirty += nr_pages;
/* * Treat this folio as congested if folios are cycling * through the LRU so quickly that the folios marked * for immediate reclaim are making it to the end of * the LRU a second time.
*/ if (writeback && folio_test_reclaim(folio))
stat->nr_congested += nr_pages;
/* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number * of folios under writeback and this folio has both * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an * indefinite stall if it is impossible to writeback * the folio due to I/O error or disconnected storage * so instead note that the LRU is being scanned too * quickly and the caller can stall after the folio * list has been processed. * * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs), or the folio belongs to a mapping where * waiting on writeback during reclaim may lead to a deadlock. * In this case mark the folio for immediate reclaim and * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a folio that already has the * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the folios to get them out of * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. 
* Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk.
*/ if (folio_test_writeback(folio)) {
mapping = folio_mapping(folio);
/* Case 1 above */ if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate += nr_pages; goto activate_locked;
/* Case 2 above */
} elseif (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
!may_enter_fs(folio, sc->gfp_mask) ||
(mapping &&
mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have * just cleared the reclaim flag, then * setting the reclaim flag here ends up * interpreted as the readahead flag - but * that does not matter enough to care. * What we do want is for this folio to * have the reclaim flag set next time * memcg reclaim reaches the tests above, * so it will then wait for writeback to * avoid OOM; and it's also appropriate * in global reclaim.
*/
folio_set_reclaim(folio);
stat->nr_writeback += nr_pages; goto activate_locked;
/* Case 3 above */
} else {
folio_unlock(folio);
folio_wait_writeback(folio); /* then go back and try same folio again */
list_add_tail(&folio->lru, folio_list); continue;
}
}
if (!ignore_references)
references = folio_check_references(folio, sc);
switch (references) { case FOLIOREF_ACTIVATE: goto activate_locked; case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages; goto keep_locked; case FOLIOREF_RECLAIM: case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
}
/* * Before reclaiming the folio, try to relocate * its contents to another node.
*/ if (do_demote_pass &&
(thp_migration_supported() || !folio_test_large(folio))) {
list_add(&folio->lru, &demote_folios);
folio_unlock(folio); continue;
}
/* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree folio could be freed directly
*/ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ if (!can_split_folio(folio, 1, NULL)) goto activate_locked; /* * Split partially mapped folios right away. * We can free the unmapped pages without IO.
*/ if (data_race(!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio)) &&
split_folio_to_list(folio, folio_list)) goto activate_locked;
} if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages >= HPAGE_PMD_NR) {
count_memcg_folio_events(folio,
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
} #endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) goto activate_locked_split;
} /* * Normally the folio will be dirtied in unmap because its * pte should be dirty. A special case is MADV_FREE page. The * page's pte could have dirty bit cleared but the folio's * SwapBacked flag is still set because clearing the dirty bit * and SwapBacked flag has no lock protected. For such folio, * unmap will not set dirty bit for it, so folio reclaim will * not write the folio out. This can cause data corruption when * the folio is swapped in later. Always setting the dirty flag * for the folio solves the problem.
*/
folio_mark_dirty(folio);
}
}
/* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted * then.
*/ if ((nr_pages > 1) && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/* * The folio is mapped into the page tables of one or more * processes. Try to unmap it here.
*/ if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = folio_test_swapbacked(folio);
if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD; /* * Without TTU_SYNC, try_to_unmap will only begin to * hold PTL from the first present PTE within a large * folio. Some initial PTEs might be skipped due to * races with parallel PTE writes in which PTEs can be * cleared temporarily before being written new present * values. This will lead to a large folio is still * mapped while some subpages have been partially * unmapped after try_to_unmap; TTU_SYNC helps * try_to_unmap acquire PTL from the first PTE, * eliminating the influence of temporary PTE values.
*/ if (folio_test_large(folio))
flags |= TTU_SYNC;
try_to_unmap(folio, flags); if (folio_mapped(folio)) {
stat->nr_unmap_fail += nr_pages; if (!was_swapbacked &&
folio_test_swapbacked(folio))
stat->nr_lazyfree_fail += nr_pages; goto activate_locked;
}
}
/* * Folio is unmapped now so it cannot be newly pinned anymore. * No point in trying to reclaim folio if it is pinned. * Furthermore we don't want to reclaim underlying fs metadata * if the folio is pinned and thus potentially modified by the * pinning process as that may upset the filesystem.
*/ if (folio_maybe_dma_pinned(folio)) goto activate_locked;
mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only * write folios when we've encountered many * dirty folios, and when we've already scanned * the rest of the LRU for clean folios and see * the same dirty folios again (with the reclaim * flag set).
*/ if (folio_is_file_lru(folio) &&
(!current_is_kswapd() ||
!folio_test_reclaim(folio) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty
*/
node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
nr_pages);
folio_set_reclaim(folio);
goto activate_locked;
}
if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked;
/* * Folio is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after I/O * starts and then write it out here.
*/
try_to_unmap_flush_dirty(); switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: /* * If shmem folio is split when writeback to swap, * the tail pages will make their own pass through * this function and be accounted then.
*/ if (nr_pages > 1 && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
} goto activate_locked; case PAGE_SUCCESS: if (nr_pages > 1 && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
stat->nr_pageout += nr_pages;
if (folio_test_writeback(folio)) goto keep; if (folio_test_dirty(folio)) goto keep;
/* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the folio.
*/ if (!folio_trylock(folio)) goto keep; if (folio_test_dirty(folio) ||
folio_test_writeback(folio)) goto keep_locked;
mapping = folio_mapping(folio);
fallthrough; case PAGE_CLEAN:
; /* try to free the folio below */
}
}
/* * If the folio has buffers, try to free the buffer * mappings associated with this folio. If we succeed * we try to free the folio as well. * * We do this even if the folio is dirty. * filemap_release_folio() does not perform I/O, but it * is possible for a folio to have the dirty flag set, * but it is actually clean (all its buffers are clean). * This happens if the buffers were written out directly, * with submit_bh(). ext3 will do this, as well as * the blockdev mapping. filemap_release_folio() will * discover that cleanness and will drop the buffers * and mark the folio clean - it can be freed. * * Rarely, folios can have buffers and no ->mapping. * These are the folios which were not successfully * invalidated in truncate_cleanup_folio(). We try to * drop those buffers here and if that worked, and the * folio is no longer mapped into process address space * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable.
*/ if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) {
folio_unlock(folio); if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU).
*/
nr_reclaimed += nr_pages; continue;
}
}
}
if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* * The folio has only one reference left, which is * from the isolation. After the caller puts the * folio back on the lru and drops the reference, the * folio will be freed anyway. It doesn't matter * which lru it goes on. So we don't bother checking * the dirty flag here.
*/
count_vm_events(PGLAZYFREED, nr_pages);
count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
} elseif (!mapping || !__remove_mapping(mapping, folio, true,
sc->target_mem_cgroup)) goto keep_locked;
folio_unlock(folio);
free_it: /* * Folio may get swapped out as a whole, need to account * all pages in it.
*/
nr_reclaimed += nr_pages;
activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages.
*/ if (nr_pages > 1) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) &&
(mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
folio_free_swap(folio);
VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio);
folio_set_active(folio);
stat->nr_activate[type] += nr_pages;
count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
}
keep_locked:
folio_unlock(folio);
keep:
list_add(&folio->lru, &ret_folios);
VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
folio_test_unevictable(folio), folio);
} /* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
nr_demoted = demote_folio_list(&demote_folios, pgdat);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */
list_splice_init(&demote_folios, folio_list);
/* * goto retry to reclaim the undemoted folios in folio_list if * desired. * * Reclaiming directly from top tier nodes is not often desired * due to it breaking the LRU ordering: in general memory * should be reclaimed from lower tier nodes and demoted from * top tier nodes. * * However, disabling reclaim from top tier nodes entirely * would cause ooms in edge scenarios where lower tier memory * is unreclaimable for whatever reason, eg memory being * mlocked or too hot to reclaim. We can disable reclaim * from top tier nodes in proactive reclaim though as that is * not real memory pressure.
*/ if (!sc->proactive) {
do_demote_pass = false; goto retry;
}
}
list_for_each_entry_safe(folio, next, folio_list, lru) { /* TODO: these pages should not even appear in this list. */ if (page_has_movable_ops(&folio->page)) continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
!folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
folio_clear_active(folio);
list_move(&folio->lru, &clean_folios);
}
}
/* * We should be safe here since we are only dealing with file pages and * we are not kswapd and therefore cannot write dirty file pages. But * call memalloc_noreclaim_save() anyway, just in case these conditions * change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
&stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to * discard so isolated count will be mismatched. * Compensate the isolated count for both LRU lists.
*/
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
stat.nr_lazyfree_fail);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)stat.nr_lazyfree_fail); return nr_reclaimed;
}
/* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check.
*/ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsignedlong *nr_zone_taken)
{ int zid;
	/* Only zones we actually isolated pages from need their size adjusted. */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue;
	/*
	 * NOTE(review): this extract appears truncated — the per-zone
	 * update_lru_size() call that should follow inside this loop, and the
	 * function's closing braces, are not visible here.  The fused token
	 * 'unsignedlong' also looks like whitespace-mangled 'unsigned long'.
	 * Verify against the upstream source before relying on this text.
	 */
/* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst.
*/ staticunsignedlong isolate_lru_folios(unsignedlong nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsignedlong *nr_scanned, struct scan_control *sc, enum lru_list lru)
{ struct list_head *src = &lruvec->lists[lru]; unsignedlong nr_taken = 0; unsignedlong nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsignedlong nr_skipped[MAX_NR_ZONES] = { 0, }; unsignedlong skipped = 0, total_scan = 0, scan = 0; unsignedlong nr_pages; unsignedlong max_nr_skipped = 0;
	/* Folios from zones above sc->reclaim_idx are parked here, not taken. */
LIST_HEAD(folios_skipped);
	/*
	 * NOTE(review): this extract is truncated/mangled — the scan loop
	 * header that declares 'folio' and 'move_to' and bounds the walk by
	 * nr_to_scan is not visible, nor are the 'move:' label, the transfer
	 * to @dst, or the final accounting/return.  'staticunsignedlong' is
	 * presumably mangled 'static unsigned long'.  Verify upstream.
	 */
	/*
	 * Skip folios from zones that are ineligible for this reclaim pass,
	 * bounded by SWAP_CLUSTER_MAX_SKIPPED so an LRU dominated by
	 * ineligible folios cannot keep us spinning under lru_lock.
	 */
/* Using max_nr_skipped to prevent hard LOCKUP*/ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
(folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
max_nr_skipped++; goto move;
}
/* * Do not count skipped folios because that makes the function * return with no isolated folios if the LRU mostly contains * ineligible folios. This causes the VM to not reclaim any * folios, triggering a premature OOM. * Account all pages in a folio.
*/
scan += nr_pages;
	/* Not on an LRU, or mapped while may_unmap is clear: leave it alone. */
if (!folio_test_lru(folio)) goto move; if (!sc->may_unmap && folio_mapped(folio)) goto move;
/* * Be careful not to clear the lru flag until after we're * sure the folio is not being freed elsewhere -- the * folio release code relies on it.
	 * folio_try_get() fails only if the refcount already hit zero.
*/ if (unlikely(!folio_try_get(folio))) goto move;
if (!folio_test_clear_lru(folio)) { /* Another thread is already isolating this folio */
folio_put(folio); goto move;
}
/* * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles.
*/ if (!list_empty(&folios_skipped)) { int zid;
list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue;
/** * folio_isolate_lru() - Try to isolate a folio from its LRU list. * @folio: Folio to isolate from its LRU list. * * Isolate a @folio from an LRU list and adjust the vmstat statistic * corresponding to whatever LRU list the folio was on. * * The folio will have its LRU flag cleared. If it was found on the * active list, it will have the Active flag set. If it was found on the * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * * Return: true if the folio was removed from an LRU list. * false if the folio was not on an LRU list.
*/ bool folio_isolate_lru(struct folio *folio)
{ bool ret = false;
	/* Caller contract: must hold a reference (see kerneldoc above). */
VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
	/* Winning the lru-flag race makes us the sole isolator of this folio. */
if (folio_test_clear_lru(folio)) { struct lruvec *lruvec;
	/*
	 * NOTE(review): the remainder of this function (lruvec lock, the
	 * lruvec_del_folio() call, setting ret, and the return) is missing
	 * from this extract — verify against the upstream source.
	 */
/* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM.
*/ staticbool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc)
{ unsignedlong inactive, isolated; bool too_many;
	/*
	 * NOTE(review): this extract appears truncated — the reads that
	 * assign 'inactive' and 'isolated' (node page-state counters) and
	 * the current_is_kswapd() early-exit are not visible here, so the
	 * variables look uninitialized in this text.  'staticbool' is
	 * presumably mangled 'static bool'.  Verify upstream.
	 */
/* * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they * won't get blocked by normal direct-reclaimers, forming a circular * deadlock.
	 * Relax the limit 8x for callers that can do both I/O and FS work.
*/ if (gfp_has_io_fs(sc->gfp_mask))
inactive >>= 3;
too_many = isolated > inactive;
/* Wake up tasks throttled due to too_many_isolated. */ if (!too_many)
wake_throttle_isolated(pgdat);
return too_many;
}
/* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * * Returns the number of pages moved to the given lruvec.
*/ staticunsignedint move_folios_to_lru(struct lruvec *lruvec, struct list_head *list)
{ int nr_pages, nr_moved = 0; struct folio_batch free_folios;
	/* Drain @list one folio at a time; free_folios batches dead folios. */
folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list);
	/*
	 * NOTE(review): this extract appears truncated — the list_del() of
	 * the folio, the unevictable check, and the batch-add/flush of the
	 * freed folio between the __folio_clear_lru_flags() below and the
	 * VM_BUG_ON_FOLIO() are not visible here.  Verify upstream.
	 */
/* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock * folio_set_lru() * list_add(&folio->lru,) * list_add(&folio->lru,)
*/
folio_set_lru(folio);
	/* We held the last reference: the folio dies instead of going on an LRU. */
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
/* * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration).
*/
VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
	/* Active folios count toward the lruvec's non-resident working-set age. */
nr_moved += nr_pages; if (folio_test_active(folio))
workingset_age_nonresident(lruvec, nr_pages);
}
	/* Free any batched dead folios outside lru_lock, then retake it. */
if (free_folios.nr) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
spin_lock_irq(&lruvec->lru_lock);
}
return nr_moved;
}
/*
 * A task that writes to the page cache on behalf of a backing device
 * (e.g. nfsd servicing a loop-back mount) marks itself with
 * PF_LOCAL_THROTTLE; throttling such a task would stall the very device
 * reclaim is waiting on.  Any other task may safely be throttled.
 *
 * Returns non-zero when the current task may be throttled.
 */
static int current_may_throttle(void)
{
	return (current->flags & PF_LOCAL_THROTTLE) == 0;
}
/* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages
*/ staticunsignedlong shrink_inactive_list(unsignedlong nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(folio_list); unsignedlong nr_scanned; unsignedint nr_reclaimed = 0; unsignedlong nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false;
	/* Back off (at most once) while too many folios are already isolated. */
while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) return 0;
/* wait a bit for the reclaimer. */
stalled = true;
reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX;
}
	/*
	 * NOTE(review): this extract appears truncated — the isolation of
	 * folios from the LRU, the shrink_folio_list() call that fills
	 * 'stat'/'nr_taken', the move back to the LRU, and the function's
	 * final accounting and return are not visible here.  As shown,
	 * 'stat' and 'nr_taken' would be read uninitialized; verify against
	 * the upstream source.
	 */
/* * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep.
*/ if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN); /* * For cgroupv1 dirty throttling is achieved by waking up * the kernel flusher here and later waiting on folios * which are in writeback to finish (see shrink_folio_list()). * * Flusher may not be able to issue writeback quickly * enough for cgroupv1 writeback throttling to work * on a large system.
*/ if (!writeback_throttling_sane(sc))
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/* * shrink_active_list() moves folios from the active LRU to the inactive LRU. * * We move them the other way if the folio is referenced by one or more * processes. * * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the folios are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each folio. It's impossible to balance * this, so instead we remove the folios from the LRU while processing them. * It is safe to rely on the active flag against the non-LRU folios in here * because nobody will play with that bit on a non-LRU folio. * * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway.
*/ staticvoid shrink_active_list(unsignedlong nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{ unsignedlong nr_taken; unsignedlong nr_scanned;
vm_flags_t vm_flags;
LIST_HEAD(l_hold); /* The folios which were snipped off */
	/* Referenced/executable folios rotate back to l_active; rest demote. */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	/*
	 * NOTE(review): the body of this function is missing from this
	 * extract (it ends abruptly after the local declarations, followed
	 * by unrelated boilerplate text).  Verify against upstream.
	 */
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.