// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * zswap.c - zswap driver file
 *
 * zswap is a cache that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool. This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
 */
/*********************************
* statistics
**********************************/
/*
 * The number of compressed pages currently stored in zswap.
 * Incremented when an entry is stored and decremented in
 * zswap_entry_free().
 */
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons, so they may not be 100% accurate. However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Store failed due to compression algorithm failure */
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Load or writeback failed due to decompression failure */
static u64 zswap_decompress_fail;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Shrinker work queue */ staticstruct workqueue_struct *shrink_wq; /* Pool limit was hit, we need to calm down */ staticbool zswap_pool_reached_full;
/* The maximum percentage of memory that the compressed pool can occupy */ staticunsignedint zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
/* The threshold for accepting new pages after the max_pool_percent was hit */ staticunsignedint zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
/*
 * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
 * The only case where lru_lock is not acquired while holding tree.lock is
 * when a zswap_entry is taken off the lru for writeback, in that case it
 * needs to be verified that it's still valid in the tree.
 */

/*
 * struct zswap_pool - one (zpool type, compressor) backend for zswap entries
 *
 * zpool        - backing allocator created with zpool_create_pool(); entry
 *                handles are freed against it.
 * acomp_ctx    - per-CPU compression context, looked up with raw_cpu_ptr().
 * ref          - percpu reference count; initialised with
 *                __zswap_pool_empty as its release callback, killed when the
 *                pool stops being the current pool and resurrected if the
 *                pool is reused.
 * list         - links the pool into the zswap_pools list.
 * release_work - NOTE(review): usage is not visible in this section;
 *                presumably deferred teardown work - confirm.
 * node         - instance node registered with CPUHP_MM_ZSWP_POOL_PREPARE.
 * tfm_name     - compressor algorithm name; matched against the requested
 *                compressor and used when allocating the acomp transform.
 */
struct zswap_pool {
	struct zpool *zpool;
	struct crypto_acomp_ctx __percpu *acomp_ctx;
	struct percpu_ref ref;
	struct list_head list;
	struct work_struct release_work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};
/* Global LRU lists shared by all zswap pools. */ staticstruct list_lru zswap_list_lru;
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * swpentry - associated swap entry, the offset indexes into the xarray
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression.
 * referenced - true if the entry recently entered the zswap pool. Unset by the
 *              writeback logic. The entry is only reclaimed by the writeback
 *              logic if referenced is unset. See comments in the shrinker
 *              section for context.
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 * objcg - the obj_cgroup that the compressed memory is charged to
 * lru - list node linking the entry into the global zswap_list_lru, which is
 *       used to pick entries for writeback.
 */
struct zswap_entry {
	swp_entry_t swpentry;
	unsigned int length;
	bool referenced;
	struct zswap_pool *pool;
	unsigned long handle;
	struct obj_cgroup *objcg;
	struct list_head lru;
};
if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we * need both params to be set correctly before trying to * create a pool.
*/ if (!strcmp(type, ZSWAP_PARAM_UNSET)) return NULL; if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) return NULL;
}
pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL;
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
pool->zpool = zpool_create_pool(type, name, gfp); if (!pool->zpool) {
pr_err("%s zpool not available\n", type); goto error;
}
pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node); if (ret) goto error;
/* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool
*/
ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); if (ret) goto ref_fail;
INIT_LIST_HEAD(&pool->list);
zswap_pool_debug("created", pool);
return pool;
ref_fail:
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
error: if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx); if (pool->zpool)
zpool_destroy_pool(pool->zpool);
kfree(pool); return NULL;
}
pool = __zswap_pool_current(); if (!zswap_pool_tryget(pool))
pool = NULL;
rcu_read_unlock();
return pool;
}
/* type and compressor must be null-terminated */ staticstruct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{ struct zswap_pool *pool;
assert_spin_locked(&zswap_pools_lock);
list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; if (strcmp(zpool_get_type(pool->zpool), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_tryget(pool)) continue; return pool;
}
/* val must be a null-terminated string */ staticint __zswap_param_set(constchar *val, conststruct kernel_param *kp, char *type, char *compressor)
{ struct zswap_pool *pool, *put_pool = NULL; char *s = strstrip((char *)val); int ret = 0; bool new_pool = false;
mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init.
*/
ret = param_set_charp(s, kp); break; case ZSWAP_INIT_SUCCEED:
new_pool = zswap_pool_changed(s, kp); break; case ZSWAP_INIT_FAILED:
pr_err("can't set param, initialization failed\n");
ret = -ENODEV;
}
mutex_unlock(&zswap_init_lock);
/* no need to create a new pool, return directly */ if (!new_pool) return ret;
if (!type) { if (!zpool_has_pool(s)) {
pr_err("zpool %s not available\n", s); return -ENOENT;
}
type = s;
} elseif (!compressor) { if (!crypto_has_acomp(s, 0, 0)) {
pr_err("compressor %s not available\n", s); return -ENOENT;
}
compressor = s;
} else {
WARN_ON(1); return -EINVAL;
}
spin_lock_bh(&zswap_pools_lock);
pool = zswap_pool_find_get(type, compressor); if (pool) {
zswap_pool_debug("using existing", pool);
WARN_ON(pool == zswap_pool_current());
list_del_rcu(&pool->list);
}
spin_unlock_bh(&zswap_pools_lock);
if (!pool)
pool = zswap_pool_create(type, compressor); else { /* * Restore the initial ref dropped by percpu_ref_kill() * when the pool was decommissioned and switch it again * to percpu mode.
*/
percpu_ref_resurrect(&pool->ref);
/* Drop the ref from zswap_pool_find_get(). */
zswap_pool_put(pool);
}
if (pool)
ret = param_set_charp(s, kp); else
ret = -EINVAL;
spin_lock_bh(&zswap_pools_lock);
if (!ret) {
put_pool = zswap_pool_current();
list_add_rcu(&pool->list, &zswap_pools);
zswap_has_pool = true;
} elseif (pool) { /* add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and * destroyed by the put after we drop the lock
*/
list_add_tail_rcu(&pool->list, &zswap_pools);
put_pool = pool;
}
spin_unlock_bh(&zswap_pools_lock);
if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also * failed, maybe both compressor and zpool params were bad. * Allow changing this param, so pool creation will succeed * when the other param is changed. We already verified this * param is ok in the zpool_has_pool() or crypto_has_acomp() * checks above.
*/
ret = param_set_charp(s, kp);
}
/* drop the ref from either the old current pool, * or the new pool we failed to add
*/ if (put_pool)
percpu_ref_kill(&put_pool->ref);
staticint zswap_enabled_param_set(constchar *val, conststruct kernel_param *kp)
{ int ret = -ENODEV;
/* if this is load-time (pre-init) param setting, only set param. */ if (system_state != SYSTEM_RUNNING) return param_set_bool(val, kp);
mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: if (zswap_setup()) break;
fallthrough; case ZSWAP_INIT_SUCCEED: if (!zswap_has_pool)
pr_err("can't enable, no pool configured\n"); else
ret = param_set_bool(val, kp); break; case ZSWAP_INIT_FAILED:
pr_err("can't enable, initialization failed\n");
}
mutex_unlock(&zswap_init_lock);
/*
 * Put @entry on @list_lru.
 *
 * The target sub-list is selected by the entry's NUMA node
 * (entry_to_nid()) and its memcg (mem_cgroup_from_entry()).
 */
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	/*
	 * Note that it is safe to use rcu_read_lock() here, even in the face of
	 * concurrent memcg offlining:
	 *
	 * 1. list_lru_add() is called before list_lru_one is dead. The
	 *    new entry will be reparented to memcg's parent's list_lru.
	 * 2. list_lru_add() is called after list_lru_one is dead. The
	 *    new entry will be added directly to memcg's parent's list_lru.
	 *
	 * Similar reasoning holds for list_lru_del().
	 */
	rcu_read_lock();
	memcg = mem_cgroup_from_entry(entry);
	/* will always succeed */
	list_lru_add(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}
staticvoid zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
{ int nid = entry_to_nid(entry); struct mem_cgroup *memcg;
if (folio) {
lruvec = folio_lruvec(folio);
atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
}
}
/*
 * Hook to be invoked while a memcg is going offline.
 *
 * The global shrinker, shrink_worker(), keeps its iteration cursor in
 * zswap_next_shrink and that cursor may hold a reference on @memcg. If it
 * does, move the cursor forward here so the reference is released.
 *
 * shrink_worker() has to cope with this function dropping the reference
 * of the memcg it is currently shrinking.
 */
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
	/* keep the zswap shrinker from walking the memcg tree meanwhile */
	spin_lock(&zswap_shrink_lock);

	/* nothing to do unless the cursor sits on the dying memcg */
	if (zswap_next_shrink != memcg)
		goto unlock;

	/* step the cursor until it is NULL or lands on an online memcg */
	for (;;) {
		zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
		if (!zswap_next_shrink || mem_cgroup_online(zswap_next_shrink))
			break;
	}

unlock:
	spin_unlock(&zswap_shrink_lock);
}
/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 *
 * Also unlinks the entry from the global LRU, drops the entry's pool
 * reference and, when the entry is charged to an obj_cgroup, uncharges
 * entry->length bytes and drops the objcg reference.
 */
static void zswap_entry_free(struct zswap_entry *entry)
{
	zswap_lru_del(&zswap_list_lru, entry);
	zpool_free(entry->pool->zpool, entry->handle);
	zswap_pool_put(entry->pool);
	if (entry->objcg) {
		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
		obj_cgroup_put(entry->objcg);
	}
	zswap_entry_cache_free(entry);
	atomic_long_dec(&zswap_stored_pages);
}
buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!buffer) {
ret = -ENOMEM; goto fail;
}
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp));
ret = PTR_ERR(acomp); goto fail;
}
req = acomp_request_alloc(acomp); if (!req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
ret = -ENOMEM; goto fail;
}
/* * Only hold the mutex after completing allocations, otherwise we may * recurse into zswap through reclaim and attempt to hold the mutex * again resulting in a deadlock.
*/
mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
/* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback * won't be called, crypto_wait_req() will return without blocking.
*/
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
/* * Do the actual freeing after releasing the mutex to avoid subtle * locking dependencies causing deadlocks.
*/ if (!IS_ERR_OR_NULL(req))
acomp_request_free(req); if (!IS_ERR_OR_NULL(acomp))
crypto_free_acomp(acomp);
kfree(buffer);
for (;;) {
acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex); if (likely(acomp_ctx->req)) return acomp_ctx; /* * It is possible that we were migrated to a different CPU after * getting the per-CPU ctx but before the mutex was acquired. If * the old CPU got offlined, zswap_cpu_comp_dead() could have * already freed ctx->req (among other things) and set it to * NULL. Just try again on the new CPU that we ended up on.
*/
mutex_unlock(&acomp_ctx->mutex);
}
}
	/*
	 * We need PAGE_SIZE * 2 here since there may be an over-compression
	 * case, and hardware accelerators may not check the dst buffer size,
	 * so give the dst buffer enough length to avoid a buffer overflow.
	 */
sg_init_one(&output, dst, PAGE_SIZE * 2);
acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
	/*
	 * It may look a little silly that we send an asynchronous request and
	 * then wait for its completion synchronously; this makes the process
	 * effectively synchronous.
	 * Theoretically, acomp allows users to send multiple acomp requests to
	 * one acomp instance and have them completed simultaneously. But zswap
	 * stores and loads page by page, so there is no existing method to send
	 * the second page before the first page is done within one thread
	 * doing zswap.
	 * However, threads running on different CPUs have different acomp
	 * instances, so multiple threads can do (de)compression in parallel.
	 */
comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
dlen = acomp_ctx->req->dlen; if (comp_ret) goto unlock;
/* * zpool_obj_read_begin() might return a kmap address of highmem when * acomp_ctx->buffer is not used. However, sg_init_one() does not * handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/ if (virt_addr_valid(obj)) {
src = obj;
} else {
WARN_ON_ONCE(obj == acomp_ctx->buffer);
memcpy(acomp_ctx->buffer, obj, entry->length);
src = acomp_ctx->buffer;
}
/********************************* * writeback code
**********************************/ /* * Attempts to free an entry by adding a folio to the swap cache, * decompressing the entry data into the folio, and issuing a * bio write to write the folio back to the swap device. * * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed.
*/ staticint zswap_writeback_entry(struct zswap_entry *entry,
swp_entry_t swpentry)
{ struct xarray *tree;
pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; struct swap_info_struct *si; int ret = 0;
/* try to allocate swap cache folio */
si = get_swap_device(swpentry); if (!si) return -EEXIST;
/* * Found an existing folio, we raced with swapin or concurrent * shrinker. We generally writeback cold folios from zswap, and * swapin means the folio just became hot, so skip this folio. * For unlikely concurrent shrinker case, it will be unlinked * and freed when invalidated by the concurrent shrinker anyway.
*/ if (!folio_was_allocated) {
ret = -EEXIST; goto out;
}
/* * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot, and concurrent * swapoff so we can safely dereference the zswap tree here. * Verify that the swap entry hasn't been invalidated and recycled * behind our backs, to avoid overwriting a new swap folio with * old compressed data. Only when this is successful can the entry * be dereferenced.
*/
tree = swap_zswap_tree(swpentry); if (entry != xa_load(tree, offset)) {
ret = -ENOMEM; goto out;
}
if (!zswap_decompress(entry, folio)) {
ret = -EIO; goto out;
}
xa_erase(tree, offset);
count_vm_event(ZSWPWB); if (entry->objcg)
count_objcg_events(entry->objcg, ZSWPWB, 1);
zswap_entry_free(entry);
/* folio is up to date */
folio_mark_uptodate(folio);
/* move it to the tail of the inactive list after end_writeback */
folio_set_reclaim(folio);
out: if (ret && ret != -EEXIST) {
delete_from_swap_cache(folio);
folio_unlock(folio);
}
folio_put(folio); return ret;
}
/********************************* * shrinker functions
**********************************/ /* * The dynamic shrinker is modulated by the following factors: * * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving * the entry a second chance) before rotating it in the LRU list. If the * entry is considered again by the shrinker, with its referenced bit unset, * it is written back. The writeback rate as a result is dynamically * adjusted by the pool activities - if the pool is dominated by new entries * (i.e lots of recent zswapouts), these entries will be protected and * the writeback rate will slow down. On the other hand, if the pool has a * lot of stagnant entries, these entries will be reclaimed immediately, * effectively increasing the writeback rate. * * 2. Swapins counter: If we observe swapins, it is a sign that we are * overshrinking and should slow down. We maintain a swapins counter, which * is consumed and subtract from the number of eligible objects on the LRU * in zswap_shrinker_count(). * * 3. Compression ratio. The better the workload compresses, the less gains we * can expect from writeback. We scale down the number of objects available * for reclaim by this ratio.
*/ staticenum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, void *arg)
{ struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg;
swp_entry_t swpentry; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result;
/* * Second chance algorithm: if the entry has its referenced bit set, give it * a second chance. Only clear the referenced bit and rotate it in the * zswap's LRU list.
*/ if (entry->referenced) {
entry->referenced = false; return LRU_ROTATE;
}
	/*
	 * As soon as we drop the LRU lock, the entry can be freed by
	 * a concurrent invalidation. This means the following:
	 *
	 * 1. We extract the swp_entry_t to the stack, allowing
	 *    zswap_writeback_entry() to pin the swap entry and
	 *    then validate the zswap entry against that swap entry's
	 *    tree using pointer value comparison. Only when that
	 *    is successful can the entry be dereferenced.
	 *
	 * 2. Usually, objects are taken off the LRU for reclaim. In
	 *    this case this isn't possible, because if reclaim fails
	 *    for whatever reason, we have no means of knowing if the
	 *    entry is alive to put it back on the LRU.
	 *
	 *    So rotate it before dropping the lock. If the entry is
	 *    written back or invalidated, the free path will unlink
	 *    it. For failures, rotation is the right thing as well.
	 *
	 *    Temporary failures, where the same entry should be tried
	 *    again immediately, almost never happen for this shrinker.
	 *    We don't do any trylocking; -ENOMEM comes closest,
	 *    but that's extremely rare and doesn't happen spuriously
	 *    either. Don't bother distinguishing this case.
	 */
list_move_tail(item, &l->list);
/* * Once the lru lock is dropped, the entry might get freed. The * swpentry is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree.
*/
swpentry = entry->swpentry;
/* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP.
*/
spin_unlock(&l->lock);
if (writeback_result) {
zswap_reject_reclaim_fail++;
ret = LRU_RETRY;
/* * Encountering a page already in swap cache is a sign that we are shrinking * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context).
*/ if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
ret = LRU_STOP;
*encountered_page_in_swapcache = true;
}
} else {
zswap_written_back_pages++;
}
if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0;
/* * The shrinker resumes swap writeback, which will enter block * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS * rules (may_enter_fs()), which apply on a per-folio basis.
*/ if (!gfp_has_io_fs(sc->gfp_mask)) return 0;
/* * For memcg, use the cgroup-wide ZSWAP stats since we don't * have them per-node and thus per-lruvec. Careful if memcg is * runtime-disabled: we can get sc->memcg == NULL, which is ok * for the lruvec, but not for memcg_page_state(). * * Without memcg, use the zswap pool-wide metrics.
*/ if (!mem_cgroup_disabled()) {
mem_cgroup_flush_stats(memcg);
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else {
nr_backing = zswap_total_pages();
nr_stored = atomic_long_read(&zswap_stored_pages);
}
if (!nr_stored) return 0;
nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); if (!nr_freeable) return 0;
	/*
	 * Subtract from the lru size the number of pages that were recently
	 * swapped in from disk. The idea is that had we protected the zswap's
	 * LRU by this amount of pages, these disk swapins would not have happened.
	 */
nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); do { if (nr_freeable >= nr_disk_swapins_cur)
nr_remain = 0; else
nr_remain = nr_disk_swapins_cur - nr_freeable;
} while (!atomic_long_try_cmpxchg(
nr_disk_swapins, &nr_disk_swapins_cur, nr_remain));
nr_freeable -= nr_disk_swapins_cur - nr_remain; if (!nr_freeable) return 0;
/* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving).
*/ return mult_frac(nr_freeable, nr_backing, nr_stored);
}
if (!mem_cgroup_zswap_writeback_enabled(memcg)) return -ENOENT;
/* * Skip zombies because their LRUs are reparented and we would be * reclaiming from the parent instead of the dead memcg.
*/ if (memcg && !mem_cgroup_online(memcg)) return -ENOENT;
/* Reclaim down to the accept threshold */
thr = zswap_accept_thr_pages();
/* * Global reclaim will select cgroup in a round-robin fashion from all * online memcgs, but memcgs that have no pages in zswap and * writeback-disabled memcgs (memory.zswap.writeback=0) are not * candidates for shrinking. * * Shrinking will be aborted if we encounter the following * MAX_RECLAIM_RETRIES times: * - No writeback-candidate memcgs found in a memcg tree walk. * - Shrinking a writeback-candidate memcg failed. * * We save iteration cursor memcg into zswap_next_shrink, * which can be modified by the offline memcg cleaner * zswap_memcg_offline_cleanup(). * * Since the offline cleaner is called only once, we cannot leave an * offline memcg reference in zswap_next_shrink. * We can rely on the cleaner only if we get online memcg under lock. * * If we get an offline memcg, we cannot determine if the cleaner has * already been called or will be called later. We must put back the * reference before returning from this function. Otherwise, the * offline memcg left in zswap_next_shrink will hold the reference * until the next run of shrink_worker().
*/ do { /* * Start shrinking from the next memcg after zswap_next_shrink. * When the offline cleaner has already advanced the cursor, * advancing the cursor here overlooks one memcg, but this * should be negligibly rare. * * If we get an online memcg, keep the extra reference in case * the original one obtained by mem_cgroup_iter() is dropped by * zswap_memcg_offline_cleanup() while we are shrinking the * memcg.
*/
spin_lock(&zswap_shrink_lock); do {
memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
zswap_next_shrink = memcg;
} while (memcg && !mem_cgroup_tryget_online(memcg));
spin_unlock(&zswap_shrink_lock);
if (!memcg) { /* * Continue shrinking without incrementing failures if * we found candidate memcgs in the last tree walk.
*/ if (!attempts && ++failures == MAX_RECLAIM_RETRIES) break;
attempts = 0; goto resched;
}
ret = shrink_memcg(memcg); /* drop the extra reference */
mem_cgroup_put(memcg);
/* * There are no writeback-candidate pages in the memcg. * This is not an issue as long as we can find another memcg * with pages in zswap. Skip this without incrementing attempts * and failures.
*/ if (ret == -ENOENT) continue;
++attempts;
if (ret && ++failures == MAX_RECLAIM_RETRIES) break;
resched:
cond_resched();
} while (zswap_total_pages() > thr);
}
/********************************* * main API
**********************************/
/* * We may have had an existing entry that became stale when * the folio was redirtied and now the new version is being * swapped out. Get rid of the old.
*/ if (old)
zswap_entry_free(old);
/* * The entry is successfully compressed and stored in the tree, there is * no further possibility of failure. Grab refs to the pool and objcg, * charge zswap memory, and increment zswap_stored_pages. * The opposite actions will be performed by zswap_entry_free() * when the entry is removed from the tree.
*/
zswap_pool_get(pool); if (objcg) {
obj_cgroup_get(objcg);
obj_cgroup_charge_zswap(objcg, entry->length);
}
atomic_long_inc(&zswap_stored_pages);
/* * We finish initializing the entry while it's already in xarray. * This is safe because: * * 1. Concurrent stores and invalidations are excluded by folio lock. * * 2. Writeback is excluded by the entry not being on the LRU yet. * The publishing order matters to prevent writeback from seeing * an incoherent entry.
*/
entry->pool = pool;
entry->swpentry = page_swpentry;
entry->objcg = objcg;
entry->referenced = true; if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
zswap_lru_add(&zswap_list_lru, entry);
}
objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg); if (shrink_memcg(memcg)) {
mem_cgroup_put(memcg); goto put_objcg;
}
mem_cgroup_put(memcg);
}
if (zswap_check_limits()) goto put_objcg;
pool = zswap_pool_current_get(); if (!pool) goto put_objcg;
if (objcg) {
memcg = get_mem_cgroup_from_objcg(objcg); if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
mem_cgroup_put(memcg); goto put_pool;
}
mem_cgroup_put(memcg);
}
for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index);
if (!zswap_store_page(page, objcg, pool)) goto put_pool;
}
if (objcg)
count_objcg_events(objcg, ZSWPOUT, nr_pages);
count_vm_events(ZSWPOUT, nr_pages);
ret = true;
put_pool:
zswap_pool_put(pool);
put_objcg:
obj_cgroup_put(objcg); if (!ret && zswap_pool_reached_full)
queue_work(shrink_wq, &zswap_shrink_work);
check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate * the possibly stale entries which were previously stored at the * offsets corresponding to each page of the folio. Otherwise, * writeback could overwrite the new data in the swapfile.
*/ if (!ret) { unsigned type = swp_type(swp);
pgoff_t offset = swp_offset(swp); struct zswap_entry *entry; struct xarray *tree;
for (index = 0; index < nr_pages; ++index) {
tree = swap_zswap_tree(swp_entry(type, offset + index));
entry = xa_erase(tree, offset + index); if (entry)
zswap_entry_free(entry);
}
}
return ret;
}
/** * zswap_load() - load a folio from zswap * @folio: folio to load * * Return: 0 on success, with the folio unlocked and marked up-to-date, or one * of the following error codes: * * -EIO: if the swapped out content was in zswap, but could not be loaded * into the page due to a decompression failure. The folio is unlocked, but * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page() * will SIGBUS). * * -EINVAL: if the swapped out content was in zswap, but the page belongs * to a large folio, which is not supported by zswap. The folio is unlocked, * but NOT marked up-to-date, so that an IO error is emitted (e.g. * do_swap_page() will SIGBUS). * * -ENOENT: if the swapped out content was not in zswap. The folio remains * locked on return.
*/ int zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
pgoff_t offset = swp_offset(swp); bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
if (zswap_never_enabled()) return -ENOENT;
/* * Large folios should not be swapped in while zswap is being used, as * they are not properly handled. Zswap does not properly load large * folios, and a large folio may only be partially in zswap.
*/ if (WARN_ON_ONCE(folio_test_large(folio))) {
folio_unlock(folio); return -EINVAL;
}
entry = xa_load(tree, offset); if (!entry) return -ENOENT;
if (!zswap_decompress(entry, folio)) {
folio_unlock(folio); return -EIO;
}
folio_mark_uptodate(folio);
count_vm_event(ZSWPIN); if (entry->objcg)
count_objcg_events(entry->objcg, ZSWPIN, 1);
/* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and * its mappings, and the pressure that results from having two * in-memory copies outweighs any benefits of caching the * compression work. * * (Most swapins go through the swapcache. The notable * exception is the singleton fault on SWP_SYNCHRONOUS_IO * files, which reads into a private page and may free it if * the fault fails. We remain the primary owner of the entry.)
*/ if (swapcache) {
folio_mark_dirty(folio);
xa_erase(tree, offset);
zswap_entry_free(entry);
}
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type); return -ENOMEM;
}
ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare",
zswap_cpu_comp_prepare,
zswap_cpu_comp_dead); if (ret) goto hp_fail;
shrink_wq = alloc_workqueue("zswap-shrink",
WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!shrink_wq) goto shrink_wq_fail;
zswap_shrinker = zswap_alloc_shrinker(); if (!zswap_shrinker) goto shrinker_fail; if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) goto lru_fail;
shrinker_register(zswap_shrinker);
INIT_WORK(&zswap_shrink_work, shrink_worker);
pool = __zswap_pool_create_fallback(); if (pool) {
pr_info("loaded using pool %s/%s\n", pool->tfm_name,
zpool_get_type(pool->zpool));
list_add(&pool->list, &zswap_pools);
zswap_has_pool = true;
static_branch_enable(&zswap_ever_enabled);
} else {
pr_err("pool creation failed\n");
zswap_enabled = false;
}
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
zswap_init_state = ZSWAP_INIT_SUCCEED; return 0;
lru_fail:
shrinker_free(zswap_shrinker);
shrinker_fail:
destroy_workqueue(shrink_wq);
shrink_wq_fail:
cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE);
hp_fail:
kmem_cache_destroy(zswap_entry_cache);
cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */
zswap_init_state = ZSWAP_INIT_FAILED;
zswap_enabled = false; return -ENOMEM;
}
/*
 * Boot-time entry point.
 *
 * Does nothing and returns 0 when zswap is not enabled; otherwise runs
 * zswap_setup() and returns its result.
 */
static int __init zswap_init(void)
{
	if (!zswap_enabled)
		return 0;
	return zswap_setup();
}
/* must be late so crypto has time to come up */
late_initcall(zswap_init);
/* Module metadata: author and one-line description. */
MODULE_AUTHOR("Seth Jennings ");
MODULE_DESCRIPTION("Compressed cache for swap pages");
Messung V0.5
¤ Dauer der Verarbeitung: 0.19 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.