static DEFINE_SPINLOCK(swap_lock); staticunsignedint nr_swapfiles;
atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available.
*/
EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; staticint least_priority = -1; unsignedlong swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif/* CONFIG_MIGRATION */
/* * all active swap_info_structs * protected with swap_lock, and ordered by priority.
*/ static PLIST_HEAD(swap_active_head);
/* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock.
*/ staticstruct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock);
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0);
staticinlineunsignedchar swap_count(unsignedchar ent)
{ return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
/* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can * still be updated arithmetically while having special data embedded. * * inuse_pages counter is the only thing indicating if a device should * be on avail_lists or not (except swapon / swapoff). By embedding the * off-list bit in the atomic counter, updates no longer need any lock * to check the list status. * * This bit will be set if the device is not on the plist and not * usable, will be cleared if the device is on the plist.
*/ #define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) #define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) staticlong swap_usage_in_pages(struct swap_info_struct *si)
{ return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
}
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4
/* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no * folio was associated with the swap entry.
*/ staticint __try_to_reclaim_swap(struct swap_info_struct *si, unsignedlong offset, unsignedlong flags)
{
swp_entry_t entry = swp_entry(si->type, offset); struct address_space *address_space = swap_address_space(entry); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim;
again:
folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0;
nr_pages = folio_nr_pages(folio);
ret = -nr_pages;
/* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations.
*/ if (!folio_trylock(folio)) goto out;
/* * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked.
*/
entry = folio->swap; if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
folio_unlock(folio);
folio_put(folio); goto again;
}
offset = swp_offset(entry);
/* * It's safe to delete the folio from swap cache only if the folio's * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others.
*/
ci = lock_cluster(si, offset);
need_reclaim = swap_only_has_cache(si, offset, nr_pages);
unlock_cluster(ci); if (!need_reclaim) goto out_unlock;
/* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling.
*/ staticint discard_swap(struct swap_info_struct *si)
{ struct swap_extent *se;
sector_t start_block;
sector_t nr_blocks; int err = 0;
/* Do not discard the swap header page! */
se = first_se(si);
start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
nr_blocks, GFP_KERNEL); if (err) return err;
cond_resched();
}
/* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling.
*/ staticvoid discard_swap_cluster(struct swap_info_struct *si,
pgoff_t start_page, pgoff_t nr_pages)
{ struct swap_extent *se = offset_to_swap_extent(si, start_page);
/* * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP
*/ #define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256
/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
					  struct swap_cluster_info *ci)
{
	/* A free cluster must never be queued for discard. */
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
	schedule_work(&si->discard_work);
}
/* * Isolate and lock the first cluster that is not contented on a list, * clean its flag before taken off-list. Cluster flag must be in sync * with list status, so cluster updaters can always know the cluster * list status without touching si lock. * * Note it's possible that all clusters on a list are contented so * this returns NULL for an non-empty list.
*/ staticstruct swap_cluster_info *isolate_lock_cluster( struct swap_info_struct *si, struct list_head *list)
{ struct swap_cluster_info *ci, *ret = NULL;
spin_lock(&si->lock);
if (unlikely(!(si->flags & SWP_WRITEOK))) goto out;
list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue;
/* We may only isolate and clear flags of following lists */
VM_BUG_ON(!ci->flags);
VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
ci->flags != CLUSTER_FLAG_FULL);
/*
 * Doing discard actually. After a cluster discard is finished, the cluster
 * will be added to free cluster list. Discard cluster is a bit special as
 * they don't participate in allocation or reclaim, so clusters marked as
 * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
 *
 * Returns true if at least one cluster was discarded and freed.
 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *ci;
	bool ret = false;
	unsigned int idx;

	spin_lock(&si->lock);
	while (!list_empty(&si->discard_clusters)) {
		ci = list_first_entry(&si->discard_clusters,
				      struct swap_cluster_info, list);
		/*
		 * Delete the cluster from list to prepare for discard, but keep
		 * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
		 * pointing to it, or ran into by relocate_cluster.
		 */
		list_del(&ci->list);
		idx = cluster_index(si, ci);
		/* Drop si->lock across the (potentially slow) device discard. */
		spin_unlock(&si->lock);
		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				     SWAPFILE_CLUSTER);

		spin_lock(&ci->lock);
		/*
		 * Discard is done, clear its flags as it's off-list, then
		 * return the cluster to allocation list.
		 */
		ci->flags = CLUSTER_FLAG_NONE;
		__free_cluster(si, ci);
		spin_unlock(&ci->lock);
		ret = true;
		spin_lock(&si->lock);
	}
	spin_unlock(&si->lock);
	return ret;
}
si = container_of(ref, struct swap_info_struct, users);
complete(&si->comp);
}
/*
 * Must be called after freeing if ci->count == 0, moves the cluster to free
 * or discard list. Caller must hold ci->lock.
 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->count != 0);
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	lockdep_assert_held(&ci->lock);

	/*
	 * If the swap is discardable, prepare discard the cluster
	 * instead of free it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, ci);
		return;
	}

	__free_cluster(si, ci);
}
/*
 * Must be called after freeing if ci->count != 0, moves the cluster to
 * nonfull list. Caller must hold ci->lock.
 */
static void partial_free_cluster(struct swap_info_struct *si,
				 struct swap_cluster_info *ci)
{
	/* Cluster must be partially used: neither empty nor full. */
	VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
	lockdep_assert_held(&ci->lock);

	if (ci->flags != CLUSTER_FLAG_NONFULL)
		move_cluster(si, ci, &si->nonfull_clusters[ci->order],
			     CLUSTER_FLAG_NONFULL);
}
/* * Must be called after allocation, moves the cluster to full or frag list. * Note: allocation doesn't acquire si lock, and may drop the ci lock for * reclaim, so the cluster could be any where when called.
*/ staticvoid relocate_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
lockdep_assert_held(&ci->lock);
/* Discard cluster must remain off-list or on discard list */ if (cluster_is_discard(ci)) return;
if (!ci->count) { if (ci->flags != CLUSTER_FLAG_FREE)
free_cluster(si, ci);
} elseif (ci->count != SWAPFILE_CLUSTER) { if (ci->flags != CLUSTER_FLAG_FRAG)
move_cluster(si, ci, &si->frag_clusters[ci->order],
CLUSTER_FLAG_FRAG);
} else { if (ci->flags != CLUSTER_FLAG_FULL)
move_cluster(si, ci, &si->full_clusters,
CLUSTER_FLAG_FULL);
}
}
/* * The cluster corresponding to page_nr will be used. The cluster will not be * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization.
*/ staticvoid inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsignedlong page_nr)
{ unsignedlong idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci;
spin_unlock(&ci->lock); do { switch (READ_ONCE(map[offset])) { case 0:
offset++; break; case SWAP_HAS_CACHE:
nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); if (nr_reclaim > 0)
offset += nr_reclaim; else goto out; break; default: goto out;
}
} while (offset < end);
out:
spin_lock(&ci->lock); /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock.
*/ for (offset = start; offset < end; offset++) if (READ_ONCE(map[offset])) returnfalse;
/* Try use a new cluster for current CPU and allocate from it. */ staticunsignedint alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, unsignedlong offset, unsignedint order, unsignedchar usage)
{ unsignedint next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsignedlong start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsignedlong end = min(start + SWAPFILE_CLUSTER, si->max); unsignedint nr_pages = 1 << order; bool need_reclaim, ret;
for (end -= nr_pages; offset <= end; offset += nr_pages) {
need_reclaim = false; if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) {
ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); /* * Reclaim drops ci->lock and cluster could be used * by another order. Not checking flag as off-list * cluster has no flag set, and change of list * won't cause fragmentation.
*/ if (!cluster_is_usable(ci, order)) goto out; if (cluster_is_empty(ci))
offset = start; /* Reclaim failed but cluster is usable, try next */ if (!ret) continue;
} if (!cluster_alloc_range(si, ci, offset, usage, order)) break;
found = offset;
offset += nr_pages; if (ci->count < SWAPFILE_CLUSTER && offset <= end)
next = offset; break;
}
out:
relocate_cluster(si, ci);
unlock_cluster(ci); if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(percpu_swap_cluster.offset[order], next);
this_cpu_write(percpu_swap_cluster.si[order], si);
} else {
si->global_cluster->next[order] = next;
} return found;
}
si = container_of(work, struct swap_info_struct, reclaim_work);
swap_reclaim_full_clusters(si, true);
}
/* * Try to allocate swap entries with specified order and try set a new * cluster for current CPU too.
*/ staticunsignedlong cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsignedchar usage)
{ struct swap_cluster_info *ci; unsignedint offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
/* * Swapfile is not block device so unable * to allocate large entries.
*/ if (order && !(si->flags & SWP_BLKDEV)) return 0;
if (!(si->flags & SWP_SOLIDSTATE)) { /* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order]; if (offset == SWAP_ENTRY_INVALID) goto new_cluster;
ci = lock_cluster(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
found = alloc_swap_scan_cluster(si, ci, offset,
order, usage);
} else {
unlock_cluster(ci);
} if (found) goto done;
}
new_cluster:
ci = isolate_lock_cluster(si, &si->free_clusters); if (ci) {
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
order, usage); if (found) goto done;
}
/* Try reclaim from full clusters if free clusters list is drained */ if (vm_swap_full())
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) { unsignedint frags = 0, frags_existing;
while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
order, usage); if (found) goto done; /* Clusters failed to allocate are moved to frag_clusters */
frags++;
}
frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); while (frags < frags_existing &&
(ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
atomic_long_dec(&si->frag_cluster_nr[order]); /* * Rotate the frag list to iterate, they were all * failing high order allocation or moved here due to * per-CPU usage, but they could contain newly released * reclaimable (eg. lazy-freed swap cache) slots.
*/
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
order, usage); if (found) goto done;
frags++;
}
}
/* * We don't have free cluster but have some clusters in discarding, * do discard now and reclaim them.
*/ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster;
if (order) goto done;
/* Order 0 stealing from higher order */ for (int o = 1; o < SWAP_NR_ORDERS; o++) { /* * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user.
*/ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
atomic_long_dec(&si->frag_cluster_nr[o]);
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
0, usage); if (found) goto done;
}
while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
0, usage); if (found) goto done;
}
}
done: if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock); return found;
}
/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ staticvoid del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{ int nid; unsignedlong pages;
spin_lock(&swap_avail_lock);
if (swapoff) { /* * Forcefully remove it. Clear the SWP_WRITEOK flags for * swapoff here so it's synchronized by both si->lock and * swap_avail_lock, to ensure the result can be seen by * add_to_avail_list.
*/
lockdep_assert_held(&si->lock);
si->flags &= ~SWP_WRITEOK;
atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
} else { /* * If not called by swapoff, take it off-list only if it's * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly * si->inuse_pages == pages), any concurrent slot freeing, * or device already removed from plist by someone else * will make this return false.
*/
pages = si->pages; if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
pages | SWAP_USAGE_OFFLIST_BIT)) goto skip;
}
/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ staticvoid add_to_avail_list(struct swap_info_struct *si, bool swapon)
{ int nid; long val; unsignedlong pages;
spin_lock(&swap_avail_lock);
/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ if (swapon) {
lockdep_assert_held(&si->lock);
si->flags |= SWP_WRITEOK;
} else { if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) goto skip;
}
if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) goto skip;
val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
/* * When device is full and device is on the plist, only one updater will * see (inuse_pages == si->pages) and will call del_from_avail_list. If * that updater happen to be here, just skip adding.
*/
pages = si->pages; if (val == pages) { /* Just like the cmpxchg in del_from_avail_list */ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
pages | SWAP_USAGE_OFFLIST_BIT)) goto skip;
}
/* * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock * within each cluster, so the total contribution to the global counter should * always be positive and cannot exceed the total number of usable slots.
*/ staticbool swap_usage_add(struct swap_info_struct *si, unsignedint nr_entries)
{ long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
/* * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, * remove it from the plist.
*/ if (unlikely(val == si->pages)) {
del_from_avail_list(si, false); returntrue;
}
returnfalse;
}
staticvoid swap_usage_sub(struct swap_info_struct *si, unsignedint nr_entries)
{ long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
/* * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, * add it to the plist.
*/ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
add_to_avail_list(si, false);
}
staticvoid swap_range_alloc(struct swap_info_struct *si, unsignedint nr_entries)
{ if (swap_usage_add(si, nr_entries)) { if (vm_swap_full())
schedule_work(&si->reclaim_work);
}
atomic_long_sub(nr_entries, &nr_swap_pages);
}
/* * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
*/ for (i = 0; i < nr_entries; i++) {
clear_bit(offset + i, si->zeromap);
zswap_invalidate(swp_entry(si->type, offset + i));
}
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify; else
swap_slot_free_notify = NULL; while (offset <= end) {
arch_swap_invalidate_page(si->type, offset); if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
offset++;
}
clear_shadow_from_swap_cache(si->type, begin, end);
/* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 * only after the above cleanups are done.
*/
smp_wmb();
atomic_long_add(nr_entries, &nr_swap_pages);
swap_usage_sub(si, nr_entries);
}
staticbool get_swap_device_info(struct swap_info_struct *si)
{ if (!percpu_ref_tryget_live(&si->users)) returnfalse; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is * up to dated. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(), and smp_wmb() in swapoff.
*/
smp_rmb(); returntrue;
}
/* * Fast path try to get swap entries with specified order from current * CPU's swap entry pool (a cluster).
*/ staticbool swap_alloc_fast(swp_entry_t *entry, int order)
{ struct swap_cluster_info *ci; struct swap_info_struct *si; unsignedint offset, found = SWAP_ENTRY_INVALID;
/* * Once allocated, swap_info_struct will never be completely freed, * so checking it's liveness by get_swap_device_info is enough.
*/
si = this_cpu_read(percpu_swap_cluster.si[order]);
offset = this_cpu_read(percpu_swap_cluster.offset[order]); if (!si || !offset || !get_swap_device_info(si)) returnfalse;
ci = lock_cluster(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE); if (found)
*entry = swp_entry(si->type, found);
} else {
unlock_cluster(ci);
}
put_swap_device(si); return !!found;
}
/* Rotate the device and switch to a new cluster */ staticbool swap_alloc_slow(swp_entry_t *entry, int order)
{ int node; unsignedlong offset; struct swap_info_struct *si, *next;
node = numa_node_id();
spin_lock(&swap_avail_lock);
start_over:
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
put_swap_device(si); if (offset) {
*entry = swp_entry(si->type, offset); returntrue;
} if (order) returnfalse;
}
spin_lock(&swap_avail_lock); /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock and taking * si->lock. Since we dropped the swap_avail_lock, the * swap_avail_head list may have been modified; so if next is * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots.
*/ if (plist_node_empty(&next->avail_lists[node])) goto start_over;
}
spin_unlock(&swap_avail_lock); returnfalse;
}
/** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. * * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache.
*/ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
{ unsignedint order = folio_order(folio); unsignedint size = 1 << order;
swp_entry_t entry = {};
if (order) { /* * Reject large allocation when THP_SWAP is disabled, * the caller should split the folio and try again.
*/ if (!IS_ENABLED(CONFIG_THP_SWAP)) return -EAGAIN;
/* * Allocation size should never exceed cluster size * (HPAGE_PMD_SIZE).
*/ if (size > SWAPFILE_CLUSTER) {
VM_WARN_ON_ONCE(1); return -EINVAL;
}
}
local_lock(&percpu_swap_cluster.lock); if (!swap_alloc_fast(&entry, order))
swap_alloc_slow(&entry, order);
local_unlock(&percpu_swap_cluster.lock);
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ if (mem_cgroup_try_charge_swap(folio, entry)) goto out_free;
if (!entry.val) return -ENOMEM;
/* * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * * TODO: this could cause a theoretical memory reclaim * deadlock in the swap out path.
*/ if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) goto out_free;
/* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU * reader side is locked, etc., the swap entry may become invalid * because of swapoff. Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * __read_swap_cache_async() * swapcache_prepare() * __swap_duplicate() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon.
*/ struct swap_info_struct *get_swap_device(swp_entry_t entry)
{ struct swap_info_struct *si; unsignedlong offset;
if (!entry.val) goto out;
si = swp_swap_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) goto out;
offset = swp_offset(entry); if (offset >= si->max) goto put_out;
if (nr <= 1) goto fallback;
count = swap_count(data_race(si->swap_map[offset])); if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback;
ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback;
} if (!has_cache)
swap_entries_free(si, ci, entry, nr); else for (i = 0; i < nr; i++)
WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
unlock_cluster(ci);
return has_cache;
fallback:
ci = lock_cluster(si, offset);
locked_fallback: for (i = 0; i < nr; i++, entry.val++) {
count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE)
has_cache = true;
}
unlock_cluster(ci); return has_cache;
}
/* * Only functions with "_nr" suffix are able to free entries spanning * cross multi clusters, so ensure the range is within a single cluster * when freeing entries with functions without "_nr" suffix.
*/ staticbool swap_entries_put_map_nr(struct swap_info_struct *si,
swp_entry_t entry, int nr)
{ int cluster_nr, cluster_rest; unsignedlong offset = swp_offset(entry); bool has_cache = false;
/* * Check if it's the last ref of swap entry in the freeing path. * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
*/ staticinlinebool __maybe_unused swap_is_last_ref(unsignedchar count)
{ return (count == SWAP_HAS_CACHE) || (count == 1) ||
(count == SWAP_MAP_SHMEM);
}
/*
 * Drop the last ref of swap entries, caller have to ensure all entries
 * belong to the same cgroup and cluster. Caller must hold ci->lock.
 */
static void swap_entries_free(struct swap_info_struct *si,
			      struct swap_cluster_info *ci,
			      swp_entry_t entry, unsigned int nr_pages)
{
	unsigned long offset = swp_offset(entry);
	unsigned char *map = si->swap_map + offset;
	unsigned char *map_end = map + nr_pages;

	/* It should never free entries across different clusters */
	VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
	VM_BUG_ON(cluster_is_empty(ci));
	VM_BUG_ON(ci->count < nr_pages);

	ci->count -= nr_pages;
	do {
		VM_BUG_ON(!swap_is_last_ref(*map));
		*map = 0;
	} while (++map < map_end);

	if (!ci->count)
		free_cluster(si, ci);
	else
		partial_free_cluster(si, ci);
}
/* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled.
*/ void swap_free_nr(swp_entry_t entry, int nr_pages)
{ int nr; struct swap_info_struct *sis; unsignedlong offset = swp_offset(entry);
/* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that.
*/ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count;
/* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer.
*/ int swp_swapcount(swp_entry_t entry)
{ int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *page;
pgoff_t offset; unsignedchar *map;
si = _swap_info_get(entry); if (!si) return 0;
offset = swp_offset(entry);
ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out;
ci = lock_cluster(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset]))
ret = true; goto unlock_out;
} for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) {
ret = true; break;
}
}
unlock_out:
unlock_cluster(ci); return ret;
}
if (!folio_test_swapcache(folio)) returnfalse; if (folio_test_writeback(folio)) returnfalse;
/* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here.
*/ if (pm_suspended_storage()) returnfalse;
returntrue;
}
/** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space.
*/ bool folio_free_swap(struct folio *folio)
{ if (!folio_swapcache_freeable(folio)) returnfalse; if (folio_swapped(folio)) returnfalse;
/** * free_swap_and_cache_nr() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr).
*/ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{ constunsignedlong start_offset = swp_offset(entry); constunsignedlong end_offset = start_offset + nr; struct swap_info_struct *si; bool any_only_cache = false; unsignedlong offset;
si = get_swap_device(entry); if (!si) return;
if (WARN_ON(end_offset > si->max)) goto out;
/* * First free all entries in the range.
*/
any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/* * Short-circuit the below loop if none of the entries had their * reference drop to zero.
*/ if (!any_only_cache) goto out;
/* * Now go back over the range trying to reclaim the swap cache.
*/ for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1; if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { /* * Folios are always naturally aligned in swap so * advance forward to the next boundary. Zero means no * folio was found for the swap entry, so advance by 1 * in this case. Negative value means folio was found * but could not be reclaimed. Here we can still advance * to the next boundary.
*/
nr = __try_to_reclaim_swap(si, offset,
TTRS_UNMAPPED | TTRS_FULL); if (nr == 0)
nr = 1; elseif (nr < 0)
nr = -nr;
nr = ALIGN(offset + 1, nr) - offset;
}
}
/* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) {
offset = cluster_alloc_swap_entry(si, 0, 1); if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
}
}
put_swap_device(si);
}
fail: return entry;
}
/* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp).
*/ int swap_type_of(dev_t device, sector_t offset)
{ int type;
/* A zero dev_t cannot identify any backing device. */
if (!device) return -1;
/* Walk every registered swap device under swap_lock. */
spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type];
/* Skip devices that are not enabled for writing. */
if (!(sis->flags & SWP_WRITEOK)) continue;
if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis);
/*
 * NOTE(review): the function body is cut off here -- the code that
 * checks @offset against the first extent, drops swap_lock and returns
 * the type (or -ENODEV) is missing; spin_unlock() and the closing
 * brace never appear. Restore from the original source.
 */
/*
 * Map a (swap type, page offset) pair to the block number, on the
 * corresponding swap device, of the PAGE_SIZE block holding that offset.
 * Returns 0 when the type is unknown or the device is not writable.
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
	struct swap_extent *ext;
	struct swap_info_struct *si = swap_type_to_swap_info(type);

	if (!si)
		return 0;
	if (!(si->flags & SWP_WRITEOK))
		return 0;

	ext = offset_to_swap_extent(si, offset);
	return (offset - ext->start_page) + ext->start_block;
}
/* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend
*/ unsignedint count_swap_pages(int type, int free)
{ unsignedint n = 0;
/*
 * NOTE(review): truncated -- only the declaration of the accumulator
 * survives. The body that reads the device's page counts and returns
 * @n is missing, as is the closing brace. Restore from the original.
 */
/* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma.
*/ staticint unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsignedlong addr, swp_entry_t entry, struct folio *folio)
{ struct page *page; struct folio *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte, old_pte; bool hwpoisoned = false; int ret = 1;
/*
 * NOTE(review): `old_pte`, `page`, `swapcache` and `rmap_flags` are all
 * used below but never assigned or (for rmap_flags) even declared in
 * the visible text. The section that maps and locks the PTE, reads the
 * old PTE value and looks up the swap-cache page appears to be missing
 * between the declarations above and the code below.
 */
/* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free().
*/
arch_swap_restore(folio_swap(entry, folio), folio);
/* * See do_swap_page(): writeback would be problematic. * However, we do a folio_wait_writeback() just before this * call and have the folio locked.
*/
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte))
rmap_flags |= RMAP_EXCLUSIVE; /* * We currently only expect small !anon folios, which are either * fully exclusive or fully shared. If we ever get large folios * here, we have to be careful.
*/ if (!folio_test_anon(folio)) {
VM_WARN_ON_ONCE(folio_test_large(folio));
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
} else {
folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
}
/*
 * NOTE(review): the `} else {` below has no matching visible `if` --
 * the branch structure (likely an `if (folio == swapcache)` test) was
 * lost in the garbled text above.
 */
} else { /* ksm created a completely new copy */
folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
}
/* Build the new present PTE, carrying over soft-dirty/uffd-wp bits. */
new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte))
new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(old_pte))
new_pte = pte_mkuffd_wp(new_pte);
setpte:
set_pte_at(vma->vm_mm, addr, pte, new_pte);
/* Drop the swap reference now that the PTE points at the folio again. */
swap_free(entry);
out: if (pte)
pte_unmap_unlock(pte, ptl); if (folio != swapcache) {
folio_unlock(folio);
folio_put(folio);
} return ret;
}
/* * Scan swap_map from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map.
*/ staticunsignedint find_next_to_unuse(struct swap_info_struct *si, unsignedint prev)
{ unsignedint i; unsignedchar count;
/* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock).
*/ for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; if ((i % LATENCY_LIMIT) == 0)
/* Long scans over a big swap_map: yield the CPU periodically. */
cond_resched();
}
/*
 * NOTE(review): truncated -- the epilogue that maps "reached si->max"
 * to the documented 0 return, the `return i;` and the closing brace
 * are missing here. Restore from the original source.
 */
/*
 * NOTE(review): orphaned fragment. No function header is visible for the
 * code below; judging by the retry/success labels, the swap_usage_in_pages()
 * recheck and the smp_mb()/return 0 epilogue, this reads like the tail of
 * try_to_unuse(). `folio`, `si`, and the `retry:` label it jumps to are not
 * declared or defined anywhere in the visible text.
 */
/* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios.
*/
folio_lock(folio);
folio_wait_writeback(folio);
folio_free_swap(folio);
folio_unlock(folio);
folio_put(folio);
}
/* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writeout() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying.
*/ if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR;
}
success: /* * Make sure that further cleanups after try_to_unuse() returns happen * after swap_range_free() reduces si->inuse_pages to 0.
*/
smp_mb(); return 0;
}
/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	unsigned int type;
	struct list_head *pos, *tmp;

	/* If any device still has pages in use, the mmlist must stay. */
	for (type = 0; type < nr_swapfiles; type++) {
		if (swap_usage_in_pages(swap_info[type]))
			return;
	}

	spin_lock(&mmlist_lock);
	list_for_each_safe(pos, tmp, &init_mm.mmlist)
		list_del_init(pos);
	spin_unlock(&mmlist_lock);
}
/* * Free all of a swapdev's extent information
*/ staticvoid destroy_swap_extents(struct swap_info_struct *sis)
{ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
/*
 * NOTE(review): garbled between these lines -- the loop body that
 * unlinks `rb` from the tree and frees `se` is missing (as written the
 * while loop would never terminate), and `mapping`/`swap_file` below
 * are used without any visible declaration (presumably derived from
 * sis->swap_file under an SWP_ACTIVATED check). Restore from the
 * original source.
 */
sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate)
mapping->a_ops->swap_deactivate(swap_file);
}
}
/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * This function rather assumes that it is called in ascending page order:
 * the new range either merges into the current rightmost extent or becomes
 * the new rightmost node.
 *
 * Returns 0 if the range was merged into the last extent, 1 if a new
 * extent was inserted, or -ENOMEM if the extent could not be allocated.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
	struct swap_extent *se;
	struct swap_extent *new_se;

	/*
	 * place the new node at the right most since the
	 * function is called in ascending page order.
	 */
	while (*link) {
		parent = *link;
		link = &parent->rb_right;
	}

	if (parent) {
		se = rb_entry(parent, struct swap_extent, rb_node);
		/* Pages must be contiguous with the previous extent. */
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Blocks are contiguous too: merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/* No merge, insert a new extent. */
	new_se = kmalloc(sizeof(*new_se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	/* Link as the new rightmost node and rebalance the tree. */
	rb_link_node(&new_se->rb_node, parent, link);
	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
	return 1;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.