#ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem.
*/
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
/*
 * shmem_fallocate communicates with shmem_fault or shmem_writeout via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writeout refused to swap out */
};
/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
/* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/ staticinlineint shmem_acct_blocks(unsignedlong flags, long pages)
{ if (!(flags & VM_NORESERVE)) return 0;
if (shmem_acct_blocks(info->flags, pages)) return err;
might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks,
sbinfo->max_blocks, pages)) goto unacct;
/* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop.
*/ #define SHMEM_INO_BATCH 1024 staticint shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
ino_t ino;
if (!(sb->s_flags & SB_KERNMOUNT)) {
raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC;
}
sbinfo->free_ispace -= BOGO_INODE_SIZE;
} if (inop) {
ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino)))
ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums &&
ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility
*/ if (IS_ENABLED(CONFIG_64BIT))
pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
__func__, MINOR(sb->s_dev));
sbinfo->next_ino = 1;
ino = sbinfo->next_ino++;
}
*inop = ino;
}
raw_spin_unlock(&sbinfo->stat_lock);
} elseif (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility.
*/
ino_t *next_ino;
/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * Return: true if swapped was incremented from 0, for shmem_writeout().
 */
static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	bool first_swapped = false;
	long freed;

	spin_lock(&info->lock);
	info->alloced += alloced;
	info->swapped += swapped;
	freed = info->alloced - info->swapped -
		READ_ONCE(inode->i_mapping->nrpages);
	/*
	 * Special case: whereas normally shmem_recalc_inode() is called
	 * after i_mapping->nrpages has already been adjusted (up or down),
	 * shmem_writeout() has to raise swapped before nrpages is lowered -
	 * to stop a racing shmem_recalc_inode() from thinking that a page has
	 * been freed. Compensate here, to avoid the need for a followup call.
	 */
	if (swapped > 0) {
		if (info->swapped == swapped)
			first_swapped = true;
		freed += swapped;
	}
	if (freed > 0)
		info->alloced -= freed;
	spin_unlock(&info->lock);

	/* The quota case may block */
	if (freed > 0)
		shmem_inode_unacct_blocks(inode, freed);
	return first_swapped;
}
void shmem_uncharge(struct inode *inode, long pages)
{
	/* pages argument is currently unused: keep it to help debugging */
	/* nrpages adjustment done by __filemap_remove_folio() or caller */
	shmem_recalc_inode(inode, 0, 0);
}
/* * Replace item expected in xarray by a new item, while holding xa_lock.
*/ staticint shmem_replace_entry(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
XA_STATE(xas, &mapping->i_pages, index); void *item;
/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back or split by a racing thread.
 *
 * Checking folio is not enough: by the time a swapcache folio is locked, it
 * might be reused, and again be swapcache, using the same swap as before.
 * Returns the swap entry's order if it still presents, else returns -1.
 */
static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
			      swp_entry_t swap)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = -1;
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_load(&xas);
		if (entry == swp_to_radix_entry(swap))
			ret = xas_get_order(&xas);
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();
	return ret;
}
/* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise();
*/
/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */
/**
 * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
 * @mapping: Target address_space.
 * @index: The page index.
 * @write_end: end of a write, could extend inode size.
 *
 * This returns huge orders for folios (when supported) based on the file size
 * which the mapping currently allows at the given index. The index is relevant
 * due to alignment considerations the mapping might have. The returned order
 * may be less than the size passed.
 *
 * Return: The orders.
 */
static inline unsigned int
shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
{
	unsigned int order;
	size_t size;

	if (!mapping_large_folio_support(mapping) || !write_end)
		return 0;

	/* Calculate the write size based on the write_end */
	size = write_end - (index << PAGE_SHIFT);
	order = filemap_get_order(size);
	if (!order)
		return 0;

	/* If we're not aligned, allocate a smaller folio */
	if (index & ((1UL << order) - 1))
		order = __ffs(index);

	order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
	return order > 0 ? BIT(order + 1) - 1 : 0;
}
if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order;
/* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * * Otherwise, tmpfs will allow getting a highest order hint based on * the size of write and fallocate paths, then will try each allowable * huge orders.
*/ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order;
return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: if (vma)
within_size_orders = maybe_pmd_order; else
within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
index, write_end);
/* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) {
folio_put(folio); goto drop;
}
/* Check if there is anything to gain from splitting */
next = folio_next_index(folio);
end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) {
folio_put(folio); goto drop;
}
/* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path.
*/ if (!folio_trylock(folio)) {
folio_put(folio); goto move_back;
}
ret = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
/* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back;
freed += next - end;
split++;
drop:
list_del_init(&info->shrinklist); goto put;
move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted).
*/
spin_lock(&sbinfo->shrinklist_lock);
list_move(&info->shrinklist, &sbinfo->shrinklist);
sbinfo->shrinklist_len++;
spin_unlock(&sbinfo->shrinklist_lock);
put:
iput(inode);
}
do {
iter = swap;
xas_lock_irq(&xas);
xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes.
*/ if (!expected || entry != swp_to_radix_entry(iter)) {
xas_set_err(&xas, -EEXIST); goto unlock;
}
iter.val += 1 << xas_get_order(&xas);
} if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST); goto unlock;
}
xas_store(&xas, folio); if (xas_error(&xas)) goto unlock;
shmem_update_stats(folio, nr);
mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
/* * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
*/ staticvoid shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{ struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error;
/*
 * Remove swap entry from page cache, free the swap and its page cache. Returns
 * the number of pages being freed. 0 means entry not found in XArray (0 pages
 * being freed).
 */
static long shmem_free_swap(struct address_space *mapping,
			    pgoff_t index, void *radswap)
{
	int order = xa_get_order(&mapping->i_pages, index);
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return 0;
	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);

	return 1 << order;
}
/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;
	unsigned long max = end - 1;

	rcu_read_lock();
	xas_for_each(&xas, page, max) {
		if (xas_retry(&xas, page))
			continue;
		/* Only value (non-folio) entries in i_pages represent swap */
		if (xa_is_value(page))
			swapped += 1 << xas_get_order(&xas);
		if (xas.xa_index == max)
			break;
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}
/* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem.
*/ unsignedlong shmem_swap_usage(struct vm_area_struct *vma)
{ struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsignedlong swapped;
/* Be careful as we don't hold info->lock */
swapped = READ_ONCE(info->swapped);
/* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track.
*/ if (!swapped) return 0;
/* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
vma->vm_pgoff + vma_pages(vma));
}
/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;

	folio_batch_init(&fbatch);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping) &&
	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
		check_move_unevictable_folios(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}
/* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes.
*/
folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) {
folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */
folio_unlock(folio);
folio_put(folio);
} /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time).
*/
folio = NULL;
shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio;
}
/* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/ staticvoid shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, bool unfalloc)
{ struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0;
pgoff_t index; int i;
if (lend == -1)
end = -1; /* unsigned, so actually very big */
if (info->fallocend > start && info->fallocend <= end && !unfalloc)
info->fallocend = start;
folio_batch_init(&fbatch);
index = start; while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
if (xa_is_value(folio)) { if (unfalloc) continue;
nr_swaps_freed += shmem_free_swap(mapping,
indices[i], folio); continue;
}
/* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio.
*/ if (unfalloc) goto whole_folios;
if (!same_folio)
folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) {
folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend))
end = folio->index;
folio_unlock(folio);
folio_put(folio);
}
whole_folios:
index = start; while (index < end) {
cond_resched();
if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */
index = start; continue;
} for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
if (xa_is_value(folio)) { long swaps_freed;
if (unfalloc) continue;
swaps_freed = shmem_free_swap(mapping, indices[i], folio); if (!swaps_freed) { /* Swap was replaced by page: retry */
index = indices[i]; break;
}
nr_swaps_freed += swaps_freed; continue;
}
folio_lock(folio);
if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */
folio_unlock(folio);
index = indices[i]; break;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
if (!folio_test_large(folio)) {
truncate_inode_folio(mapping, folio);
} elseif (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is.
*/ if (!folio_test_large(folio)) {
folio_unlock(folio);
index = start; break;
}
}
}
folio_unlock(folio);
}
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
}
if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
mapping_set_exiting(inode->i_mapping);
shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) {
spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) {
list_del_init(&info->shrinklist);
sbinfo->shrinklist_len--;
}
spin_unlock(&sbinfo->shrinklist_lock);
} while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */
wait_var_event(&info->stop_eviction,
!atomic_read(&info->stop_eviction));
spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction))
list_del_init(&info->swaplist);
spin_unlock(&shmem_swaplist_lock);
}
}
rcu_read_lock();
xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue;
if (!xa_is_value(folio)) continue;
entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do.
*/ if (swp_type(entry) != type) continue;
indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
}
rcu_read_unlock();
return folio_batch_count(fbatch);
}
/* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure.
*/ staticint shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices)
{ int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping;
for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i];
/* * If swap found in inode, free it and move page from swapcache to filecache.
*/ staticint shmem_unuse_inode(struct inode *inode, unsignedint type)
{ struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0; struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE]; int ret = 0;
do {
folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch,
indices, type)) {
ret = 0; break;
}
ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break;
start = indices[folio_batch_count(&fbatch) - 1];
} while (true);
return ret;
}
/* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused.
*/ int shmem_unuse(unsignedint type)
{ struct shmem_inode_info *info, *next; int error = 0;
if (list_empty(&shmem_swaplist)) return 0;
spin_lock(&shmem_swaplist_lock);
start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) {
list_del_init(&info->swaplist); continue;
} /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount).
*/
atomic_inc(&info->stop_eviction);
spin_unlock(&shmem_swaplist_lock);
spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over;
next = list_next_entry(info, swaplist); if (!info->swapped)
list_del_init(&info->swaplist);
}
spin_unlock(&shmem_swaplist_lock);
return error;
}
/** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache.
*/ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list)
{ struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
pgoff_t index; int nr_pages; bool split = false;
if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty;
if (!total_swap_pages) goto redirty;
/* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF).
*/ if (folio_test_large(folio)) {
index = shmem_fallocend(inode,
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) ||
!IS_ENABLED(CONFIG_THP_SWAP))
split = true;
}
if (split) {
try_split: /* Ensure the subpages are still dirty */
folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty;
folio_clear_dirty(folio);
}
index = folio->index;
nr_pages = folio_nr_pages(folio);
/* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many.
*/ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc;
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private; if (shmem_falloc &&
!shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped += nr_pages; else
shmem_falloc = NULL;
spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty;
}
folio_zero_range(folio, 0, folio_size(folio));
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error;
/* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist.
*/ if (first_swapped) {
spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
spin_unlock(&shmem_swaplist_lock);
}
BUG_ON(folio_mapped(folio));
error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error;
}
/* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added.
*/
error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(folio->swap),
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) {
shmem_recalc_inode(inode, 0, -nr_pages);
swap_free_nr(folio->swap, nr_pages);
}
/* * The delete_from_swap_cache() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer.
*/
delete_from_swap_cache(folio); goto redirty;
} if (nr_pages > 1) goto try_split;
redirty:
folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
EXPORT_SYMBOL_GPL(shmem_writeout);
/*
 * Make sure huge_gfp is always more limited than limit_gfp.
 * Some of the flags set permissions, while others set limitations.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

	/* Allow allocations only from the originally specified zones. */
	result |= zoneflags;

	/*
	 * Minimize the result gfp by taking the union with the deny flags,
	 * and the intersection of the allow flags.
	 */
	result |= (limit_gfp & denyflags);
	result |= (huge_gfp & limit_gfp) & allowflags;

	return result;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void)
{ if (shmem_huge == SHMEM_HUGE_DENY) returnfalse; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) returntrue; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) returntrue; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) returntrue; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
shmem_huge != SHMEM_HUGE_NEVER) returntrue;
/* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts.
*/ if (shmem_huge == SHMEM_HUGE_DENY) return 0;
/* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now.
*/ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit);
/* Allow mTHP that will be fully within i_size. */
mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
if (global_orders > 0)
mask |= READ_ONCE(huge_shmem_orders_inherit);
if (vma) {
orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0;
}
/* Find the highest order that can add into the page cache */
order = highest_order(orders); while (orders) {
pages = 1UL << order;
aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here.
*/ if (!xa_find(&mapping->i_pages, &aligned_index,
aligned_index + pages - 1, XA_PRESENT)) break;
order = next_order(&orders, order);
}
error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem.
*/
shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced.
*/
spin_lock(&info->lock);
freed = pages + info->alloced - info->swapped -
READ_ONCE(mapping->nrpages); if (freed > 0)
info->alloced -= freed;
spin_unlock(&info->lock); if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
error = shmem_inode_acct_blocks(inode, pages); if (error) {
filemap_remove_folio(folio); goto unlock;
}
}
/* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL);
} elseif (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin.
*/ if ((vma && unlikely(userfaultfd_armed(vma))) ||
!zswap_never_enabled() ||
non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback;
alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
}
retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback;
}
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
alloc_gfp, entry)) {
folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback;
}
/* * Prevent parallel swapin from proceeding with the swap cache flag. * * Of course there is another possible concurrent scenario as well, * that is to say, the swap cache flag of a large folio has already * been set by swapcache_prepare(), while another thread may have * already split the large swap entry stored in the shmem mapping. * In this case, shmem_add_to_page_cache() will help identify the * concurrent swapin and return -EEXIST.
*/ if (swapcache_prepare(entry, nr_pages)) {
folio_put(new); new = ERR_PTR(-EEXIST); /* Try smaller folio to avoid cache conflict */ goto fallback;
}
memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry); if (shadow)
workingset_refault(new, shadow);
folio_add_lru(new);
swap_read_folio(new, NULL); returnnew;
fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) returnnew;
entry.val += index - round_down(index, nr_pages);
alloc_gfp = gfp;
nr_pages = 1;
order = 0; goto retry;
}
/* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone.
*/ staticbool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{ return folio_zonenum(folio) > gfp_zone(gfp);
}
/* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages > 1) {
gfp_t huge_gfp = vma_thp_gfp_mask(vma);
gfp = limit_gfp_mask(huge_gfp, gfp);
} #endif
new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM;
/* Swap cache still stores N entries instead of a high-order entry */
xa_lock_irq(&swap_mapping->i_pages); for (i = 0; i < nr_pages; i++) { void *item = xas_load(&xas);
if (unlikely(error)) { /* * Is this possible? I think not, now that our callers * check both the swapcache flag and folio->private * after getting the folio lock; but be defensive. * Reverse old to newpage for clear and free.
*/
old = new;
} else {
folio_add_lru(new);
*foliop = new;
}
folio_clear_swapcache(old);
old->private = NULL;
folio_unlock(old); /* * The old folio are removed from swap cache, drop the 'nr_pages' * reference, as well as one temporary reference getting from swap * cache.
*/
folio_put_refs(old, nr_pages + 1); return error;
}
swapin_error = make_poisoned_swp_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return;
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio); if (!skip_swapcache)
delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode().
*/
shmem_recalc_inode(inode, -nr_pages, -nr_pages);
swap_free_nr(swap, nr_pages);
}
xas_set_order(&xas, index, split_order);
xas_try_split(&xas, old, cur_order); if (xas_error(&xas)) goto unlock;
/* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous.
*/ for (i = 0; i < 1 << cur_order;
i += (1 << split_order)) {
swp_entry_t tmp;
if (is_poisoned_swp_entry(index_entry)) return -EIO;
si = get_swap_device(index_entry);
order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; else return -EINVAL;
} if (unlikely(order < 0)) {
put_swap_device(si); return -EEXIST;
}
/* index may point to the middle of a large entry, get the sub entry */ if (order) {
offset = index - round_down(index, 1 << order);
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
}
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */
folio = shmem_swap_alloc_folio(inode, vma, index,
index_entry, order, gfp); if (IS_ERR(folio)) {
error = PTR_ERR(folio);
folio = NULL; goto failed;
}
skip_swapcache = true;
} else { /* Cached swapin only supports order 0 folio */
folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) {
error = -ENOMEM; goto failed;
}
} if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.12 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.