#ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem.
*/
/* Every directory entry is pretended to add this many bytes to the directory's i_size. */
#define BOGO_DIRENT_SIZE	20

/* One inode plus its dentry is pretended to occupy this much memory. */
#define BOGO_INODE_SIZE		1024

/* A symlink of up to this size is kmalloc'ed instead of using a swappable page. */
#define SHORT_SYMLINK_LEN	128
/*
 * shmem_fallocate communicates with shmem_fault or shmem_writeout via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 *
 * shmem_writeout() inspects this structure under inode->i_lock: when @waitq
 * is NULL and the folio's index lies in [@start, @next), it adds the folio's
 * page count to @nr_unswapped and redirties the folio instead of swapping
 * it out.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writeout refused to swap out */
};
/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 *
 * Returns 0 without charging anything when VM_NORESERVE is set in @flags;
 * otherwise returns whatever security_vm_enough_memory_mm() reports for
 * VM_ACCT(@size) against the current task's mm.
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	/* No up-front reservation was requested for this object. */
	if (flags & VM_NORESERVE)
		return 0;

	return security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
/* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/ staticinlineint shmem_acct_blocks(unsignedlong flags, long pages)
{ if (!(flags & VM_NORESERVE)) return 0;
if (shmem_acct_blocks(info->flags, pages)) return err;
might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks,
sbinfo->max_blocks, pages)) goto unacct;
/* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop.
*/ #define SHMEM_INO_BATCH 1024 staticint shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
ino_t ino;
if (!(sb->s_flags & SB_KERNMOUNT)) {
raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC;
}
sbinfo->free_ispace -= BOGO_INODE_SIZE;
} if (inop) {
ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino)))
ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums &&
ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility
*/ if (IS_ENABLED(CONFIG_64BIT))
pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
__func__, MINOR(sb->s_dev));
sbinfo->next_ino = 1;
ino = sbinfo->next_ino++;
}
*inop = ino;
}
raw_spin_unlock(&sbinfo->stat_lock);
} elseif (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility.
*/
ino_t *next_ino;
/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * The mm can drop undirtied hole pages behind our back, so the number of
 * freed blocks has to be recomputed here.
 *
 * Normally info->alloced == inode->i_mapping->nrpages + info->swapped, so
 * whatever the mm freed is
 * info->alloced - (inode->i_mapping->nrpages + info->swapped).
 *
 * Return: true if swapped was incremented from 0, for shmem_writeout().
 */
static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	bool became_swapped;
	long mm_freed;

	spin_lock(&info->lock);

	info->alloced += alloced;
	info->swapped += swapped;
	mm_freed = info->alloced - info->swapped -
		   READ_ONCE(inode->i_mapping->nrpages);

	/*
	 * Special case: whereas normally shmem_recalc_inode() is called
	 * after i_mapping->nrpages has already been adjusted (up or down),
	 * shmem_writeout() has to raise swapped before nrpages is lowered -
	 * to stop a racing shmem_recalc_inode() from thinking that a page has
	 * been freed.  Compensate here, to avoid the need for a followup call.
	 */
	became_swapped = swapped > 0 && info->swapped == swapped;
	if (swapped > 0)
		mm_freed += swapped;

	if (mm_freed > 0)
		info->alloced -= mm_freed;

	spin_unlock(&info->lock);

	/* Unaccount outside the spinlock: the quota case may block */
	if (mm_freed > 0)
		shmem_inode_unacct_blocks(inode, mm_freed);

	return became_swapped;
}
/*
 * shmem_uncharge - resynchronise the inode's block accounting.
 * @inode: shmem inode whose accounting is recalculated
 * @pages: currently unused; kept to help debugging
 *
 * The nrpages adjustment is done by __filemap_remove_folio() or by the
 * caller, so this only asks shmem_recalc_inode() to recompute with no
 * explicit deltas.  Its "first swapped" result is irrelevant here.
 */
void shmem_uncharge(struct inode *inode, long pages)
{
	(void)shmem_recalc_inode(inode, 0, 0);
}
/* * Replace item expected in xarray by a new item, while holding xa_lock.
*/ staticint shmem_replace_entry(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
XA_STATE(xas, &mapping->i_pages, index); void *item;
/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back or split by a racing thread.
 *
 * Checking folio is not enough: by the time a swapcache folio is locked, it
 * might be reused, and again be swapcache, using the same swap as before.
 *
 * Returns the order of the swap entry if it is still present at @index,
 * otherwise returns -1.
 */
static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
			      swp_entry_t swap)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *wanted = swp_to_radix_entry(swap);
	void *found;
	int order = -1;

	rcu_read_lock();
	for (;;) {
		found = xas_load(&xas);
		if (found == wanted)
			order = xas_get_order(&xas);
		/* Reload only when the lookup hit a retry entry */
		if (!xas_retry(&xas, found))
			break;
	}
	rcu_read_unlock();

	return order;
}
/* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise();
*/
/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 * Each directive must sit on its own line: a second "#define" on the same
 * line would be swallowed into the first macro's replacement text.
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */
/**
 * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
 * @mapping: Target address_space.
 * @index: The page index.
 * @write_end: end of a write, could extend inode size.
 *
 * This returns huge orders for folios (when supported) based on the file size
 * which the mapping currently allows at the given index.  The index is relevant
 * due to alignment considerations the mapping might have.  The returned order
 * may be less than the size passed.
 *
 * Return: a bitmask with bits 0..order set for the highest usable order, or 0
 * when the mapping lacks large folio support, @write_end is 0, or only
 * order 0 fits.
 */
static inline unsigned int
shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
{
	unsigned int order;
	size_t write_len;

	if (!(mapping_large_folio_support(mapping) && write_end))
		return 0;

	/* The write spans from the start of page @index up to @write_end */
	write_len = write_end - (index << PAGE_SHIFT);
	order = filemap_get_order(write_len);
	if (order == 0)
		return 0;

	/* An index not aligned to that order limits us to its lowest set bit */
	if ((index & ((1UL << order) - 1)) != 0)
		order = __ffs(index);

	order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
	if (order == 0)
		return 0;

	return BIT(order + 1) - 1;
}
if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order;
/* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * * Otherwise, tmpfs will allow getting a highest order hint based on * the size of write and fallocate paths, then will try each allowable * huge orders.
*/ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order;
return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: if (vma)
within_size_orders = maybe_pmd_order; else
within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
index, write_end);
/* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) {
folio_put(folio); goto drop;
}
/* Check if there is anything to gain from splitting */
next = folio_next_index(folio);
end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) {
folio_put(folio); goto drop;
}
/* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path.
*/ if (!folio_trylock(folio)) {
folio_put(folio); goto move_back;
}
ret = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
/* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back;
freed += next - end;
split++;
drop:
list_del_init(&info->shrinklist); goto put;
move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted).
*/
spin_lock(&sbinfo->shrinklist_lock);
list_move(&info->shrinklist, &sbinfo->shrinklist);
sbinfo->shrinklist_len++;
spin_unlock(&sbinfo->shrinklist_lock);
put:
iput(inode);
}
do {
iter = swap;
xas_lock_irq(&xas);
xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes.
*/ if (!expected || entry != swp_to_radix_entry(iter)) {
xas_set_err(&xas, -EEXIST); goto unlock;
}
iter.val += 1 << xas_get_order(&xas);
} if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST); goto unlock;
}
xas_store(&xas, folio); if (xas_error(&xas)) goto unlock;
shmem_update_stats(folio, nr);
mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
/* * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
*/ staticvoid shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{ struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error;
/*
 * Remove swap entry from page cache, free the swap and its page cache.
 *
 * The entry at @index is cleared only if it still equals @radswap.  Returns
 * the number of pages being freed (1 << order of the entry), or 0 when the
 * expected entry was not found in the XArray (0 pages being freed).
 */
static long shmem_free_swap(struct address_space *mapping,
			    pgoff_t index, void *radswap)
{
	/* Sample the order before the entry is erased */
	int order = xa_get_order(&mapping->i_pages, index);
	int nr_entries;

	if (xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0) != radswap)
		return 0;

	nr_entries = 1 << order;
	free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_entries);

	return nr_entries;
}
/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * @start is the first page offset examined and @end is exclusive: the walk
 * stops at offset @end - 1.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned long last = end - 1;
	unsigned long nr_swapped = 0;
	struct page *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, last) {
		if (xas_retry(&xas, entry))
			continue;

		/* Count value entries, weighted by the order they span */
		if (xa_is_value(entry))
			nr_swapped += 1 << xas_get_order(&xas);

		if (xas.xa_index == last)
			break;

		if (!need_resched())
			continue;

		/* Pause the walk so that it resumes safely after rescheduling */
		xas_pause(&xas);
		cond_resched_rcu();
	}
	rcu_read_unlock();

	return nr_swapped << PAGE_SHIFT;
}
/* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem.
*/ unsignedlong shmem_swap_usage(struct vm_area_struct *vma)
{ struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsignedlong swapped;
/* Be careful as we don't hold info->lock */
swapped = READ_ONCE(info->swapped);
/* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track.
*/ if (!swapped) return 0;
/* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
vma->vm_pgoff + vma_pages(vma));
}
/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 *
 * Walks every folio of @mapping in batches and hands each batch to
 * check_move_unevictable_folios().
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;

	folio_batch_init(&fbatch);

	for (;;) {
		/*
		 * Minor point, but we might as well stop if someone else
		 * SHM_LOCKs it.
		 */
		if (mapping_unevictable(mapping))
			break;

		/* An empty batch means the whole mapping has been walked */
		if (!filemap_get_folios(mapping, &index, ~0UL, &fbatch))
			break;

		check_move_unevictable_folios(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
}
/* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes.
*/
folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) {
folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */
folio_unlock(folio);
folio_put(folio);
} /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time).
*/
folio = NULL;
shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio;
}
/* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/ staticvoid shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, bool unfalloc)
{ struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0;
pgoff_t index; int i;
if (lend == -1)
end = -1; /* unsigned, so actually very big */
if (info->fallocend > start && info->fallocend <= end && !unfalloc)
info->fallocend = start;
folio_batch_init(&fbatch);
index = start; while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
if (xa_is_value(folio)) { if (unfalloc) continue;
nr_swaps_freed += shmem_free_swap(mapping,
indices[i], folio); continue;
}
/* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio.
*/ if (unfalloc) goto whole_folios;
if (!same_folio)
folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) {
folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend))
end = folio->index;
folio_unlock(folio);
folio_put(folio);
}
whole_folios:
index = start; while (index < end) {
cond_resched();
if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */
index = start; continue;
} for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
if (xa_is_value(folio)) { long swaps_freed;
if (unfalloc) continue;
swaps_freed = shmem_free_swap(mapping, indices[i], folio); if (!swaps_freed) { /* Swap was replaced by page: retry */
index = indices[i]; break;
}
nr_swaps_freed += swaps_freed; continue;
}
folio_lock(folio);
if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */
folio_unlock(folio);
index = indices[i]; break;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
if (!folio_test_large(folio)) {
truncate_inode_folio(mapping, folio);
} elseif (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is.
*/ if (!folio_test_large(folio)) {
folio_unlock(folio);
index = start; break;
}
}
}
folio_unlock(folio);
}
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
}
if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
mapping_set_exiting(inode->i_mapping);
shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) {
spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) {
list_del_init(&info->shrinklist);
sbinfo->shrinklist_len--;
}
spin_unlock(&sbinfo->shrinklist_lock);
} while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */
wait_var_event(&info->stop_eviction,
!atomic_read(&info->stop_eviction));
spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction))
list_del_init(&info->swaplist);
spin_unlock(&shmem_swaplist_lock);
}
}
rcu_read_lock();
xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue;
if (!xa_is_value(folio)) continue;
entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do.
*/ if (swp_type(entry) != type) continue;
indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
}
rcu_read_unlock();
return folio_batch_count(fbatch);
}
/* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure.
*/ staticint shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices)
{ int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping;
for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i];
/* * If swap found in inode, free it and move page from swapcache to filecache.
*/ staticint shmem_unuse_inode(struct inode *inode, unsignedint type)
{ struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0; struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE]; int ret = 0;
do {
folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch,
indices, type)) {
ret = 0; break;
}
ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break;
start = indices[folio_batch_count(&fbatch) - 1];
} while (true);
return ret;
}
/* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused.
*/ int shmem_unuse(unsignedint type)
{ struct shmem_inode_info *info, *next; int error = 0;
if (list_empty(&shmem_swaplist)) return 0;
spin_lock(&shmem_swaplist_lock);
start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) {
list_del_init(&info->swaplist); continue;
} /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount).
*/
atomic_inc(&info->stop_eviction);
spin_unlock(&shmem_swaplist_lock);
spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over;
next = list_next_entry(info, swaplist); if (!info->swapped)
list_del_init(&info->swaplist);
}
spin_unlock(&shmem_swaplist_lock);
return error;
}
/** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache.
*/ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list)
{ struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
pgoff_t index; int nr_pages; bool split = false;
if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty;
if (!total_swap_pages) goto redirty;
/* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF).
*/ if (folio_test_large(folio)) {
index = shmem_fallocend(inode,
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) ||
!IS_ENABLED(CONFIG_THP_SWAP))
split = true;
}
if (split) {
try_split: /* Ensure the subpages are still dirty */
folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty;
folio_clear_dirty(folio);
}
index = folio->index;
nr_pages = folio_nr_pages(folio);
/* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many.
*/ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc;
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private; if (shmem_falloc &&
!shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped += nr_pages; else
shmem_falloc = NULL;
spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty;
}
folio_zero_range(folio, 0, folio_size(folio));
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error;
/* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist.
*/ if (first_swapped) {
spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
spin_unlock(&shmem_swaplist_lock);
}
BUG_ON(folio_mapped(folio));
error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error;
}
/* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added.
*/
error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(folio->swap),
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) {
shmem_recalc_inode(inode, 0, -nr_pages);
swap_free_nr(folio->swap, nr_pages);
}
/* * The delete_from_swap_cache() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer.
*/
delete_from_swap_cache(folio); goto redirty;
} if (nr_pages > 1) goto try_split;
redirty:
folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
EXPORT_SYMBOL_GPL(shmem_writeout);
/*
 * Make sure huge_gfp is always more limited than limit_gfp.
 * Some of the flags set permissions, while others set limitations.
 *
 * The result takes its zone bits from @limit_gfp only, carries a "deny"
 * flag (__GFP_NOWARN, __GFP_NORETRY) when @limit_gfp sets it, an "allow"
 * flag (__GFP_IO, __GFP_FS, __GFP_RECLAIM) only when both masks set it, and
 * every remaining bit of @huge_gfp unchanged.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	const gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	const gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t result;

	/* Start from huge_gfp with the allow flags and zone bits stripped */
	result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

	/* Allow allocations only from the originally specified zones. */
	result |= limit_gfp & GFP_ZONEMASK;

	/*
	 * Minimize the result gfp by taking the union with the deny flags,
	 * and the intersection of the allow flags.
	 */
	result |= limit_gfp & denyflags;
	result |= huge_gfp & limit_gfp & allowflags;

	return result;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void)
{ if (shmem_huge == SHMEM_HUGE_DENY) returnfalse; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) returntrue; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) returntrue; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) returntrue; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
shmem_huge != SHMEM_HUGE_NEVER) returntrue;
/* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts.
*/ if (shmem_huge == SHMEM_HUGE_DENY) return 0;
/* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now.
*/ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit);
/* Allow mTHP that will be fully within i_size. */
mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
if (global_orders > 0)
mask |= READ_ONCE(huge_shmem_orders_inherit);
if (vma) {
orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0;
}
/* Find the highest order that can add into the page cache */
order = highest_order(orders); while (orders) {
pages = 1UL << order;
aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here.
*/ if (!xa_find(&mapping->i_pages, &aligned_index,
aligned_index + pages - 1, XA_PRESENT)) break;
order = next_order(&orders, order);
}
error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem.
*/
shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced.
*/
spin_lock(&info->lock);
freed = pages + info->alloced - info->swapped -
READ_ONCE(mapping->nrpages); if (freed > 0)
info->alloced -= freed;
spin_unlock(&info->lock); if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
error = shmem_inode_acct_blocks(inode, pages); if (error) {
filemap_remove_folio(folio); goto unlock;
}
}
/* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL);
} elseif (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin.
*/ if ((vma && unlikely(userfaultfd_armed(vma))) ||
!zswap_never_enabled() ||
non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback;
alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
}
retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback;
}
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
alloc_gfp, entry)) {
folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback;
}
/* * Prevent parallel swapin from proceeding with the swap cache flag. * * Of course there is another possible concurrent scenario as well, * that is to say, the swap cache flag of a large folio has already * been set by swapcache_prepare(), while another thread may have * already split the large swap entry stored in the shmem mapping. * In this case, shmem_add_to_page_cache() will help identify the * concurrent swapin and return -EEXIST.
*/ if (swapcache_prepare(entry, nr_pages)) {
folio_put(new); new = ERR_PTR(-EEXIST); /* Try smaller folio to avoid cache conflict */ goto fallback;
}
memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry); if (shadow)
workingset_refault(new, shadow);
folio_add_lru(new);
swap_read_folio(new, NULL); returnnew;
fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) returnnew;
entry.val += index - round_down(index, nr_pages);
alloc_gfp = gfp;
nr_pages = 1;
order = 0; goto retry;
}
/* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone.
*/ staticbool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{ return folio_zonenum(folio) > gfp_zone(gfp);
}
/* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages > 1) {
gfp_t huge_gfp = vma_thp_gfp_mask(vma);
gfp = limit_gfp_mask(huge_gfp, gfp);
} #endif
new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM;
/* Swap cache still stores N entries instead of a high-order entry */
xa_lock_irq(&swap_mapping->i_pages); for (i = 0; i < nr_pages; i++) { void *item = xas_load(&xas);
if (unlikely(error)) { /* * Is this possible? I think not, now that our callers * check both the swapcache flag and folio->private * after getting the folio lock; but be defensive. * Reverse old to newpage for clear and free.
*/
old = new;
} else {
folio_add_lru(new);
*foliop = new;
}
folio_clear_swapcache(old);
old->private = NULL;
folio_unlock(old); /* * The old folio are removed from swap cache, drop the 'nr_pages' * reference, as well as one temporary reference getting from swap * cache.
*/
folio_put_refs(old, nr_pages + 1); return error;
}
swapin_error = make_poisoned_swp_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return;
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio); if (!skip_swapcache)
delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode().
*/
shmem_recalc_inode(inode, -nr_pages, -nr_pages);
swap_free_nr(swap, nr_pages);
}
xas_set_order(&xas, index, split_order);
xas_try_split(&xas, old, cur_order); if (xas_error(&xas)) goto unlock;
/* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous.
*/ for (i = 0; i < 1 << cur_order;
i += (1 << split_order)) {
swp_entry_t tmp;
if (is_poisoned_swp_entry(index_entry)) return -EIO;
si = get_swap_device(index_entry);
order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; else return -EINVAL;
} if (unlikely(order < 0)) {
put_swap_device(si); return -EEXIST;
}
/* index may point to the middle of a large entry, get the sub entry */ if (order) {
offset = index - round_down(index, 1 << order);
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
}
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */
folio = shmem_swap_alloc_folio(inode, vma, index,
index_entry, order, gfp); if (IS_ERR(folio)) {
error = PTR_ERR(folio);
folio = NULL; goto failed;
}
skip_swapcache = true;
} else { /* Cached swapin only supports order 0 folio */
folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) {
error = -ENOMEM; goto failed;
}
} if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
}
if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: * It may fallback to order 0 due to memory pressure or race, * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption.
*/
error = shmem_split_large_entry(inode, index, index_entry, gfp); if (error) goto failed_nolock;
}
/* * If the folio is large, round down swap and index by folio size. * No matter what race occurs, the swap layer ensures we either get * a valid folio that has its swap entry aligned by size, or a * temporarily invalid one which we'll abort very soon and retry. * * shmem_add_to_page_cache ensures the whole range contains expected * entries and prevents any corruption, so any race split is fine * too, it will succeed as long as the entries are still there.
*/
nr_pages = folio_nr_pages(folio); if (nr_pages > 1) {
swap.val = round_down(swap.val, nr_pages);
index = round_down(index, nr_pages);
}
/* * We have to do this with the folio locked to prevent races. * The shmem_confirm_swap below only checks if the first swap * entry matches the folio, that's enough to ensure the folio * is not used outside of shmem, as shmem swap entries * and swap cache folios are never partially freed.
*/
folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
shmem_confirm_swap(mapping, index, swap) < 0 ||
folio->swap.val != swap.val) {
error = -EEXIST; goto unlock;
} if (!folio_test_uptodate(folio)) {
error = -EIO; goto failed;
}
folio_wait_writeback(folio);
nr_pages = folio_nr_pages(folio);
/* * Some architectures may have to restore extra metadata to the * folio after reading from swap.
*/
arch_swap_restore(folio_swap(swap, folio), folio);
if (shmem_should_replace_folio(folio, gfp)) {
error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed;
}
*foliop = folio; return 0;
failed: if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST; if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap,
skip_swapcache);
unlock: if (folio)
folio_unlock(folio);
failed_nolock: if (skip_swapcache)
swapcache_clear(si, folio->swap, folio_nr_pages(folio)); if (folio)
folio_put(folio);
put_swap_device(si);
return error;
}
/* * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
*/ staticint shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
loff_t write_end, struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
{ struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; unsignedlong orders = 0;
if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL;
alloced:
alloced = true; if (folio_test_large(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to * ->shrink_list in shmem_unused_huge_shrink()
*/ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
}
spin_unlock(&sbinfo->shrinklist_lock);
}
if (sgp == SGP_WRITE)
folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/ if (sgp == SGP_FALLOC)
sgp = SGP_WRITE;
clear: /* * Let SGP_WRITE caller clear ends if write does not fill folio; * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee.
*/ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { long i, n = folio_nr_pages(folio);
for (i = 0; i < n; i++)
clear_highpage(folio_page(folio, i));
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
/* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL; goto unlock;
}
out:
*foliop = folio; return 0;
/**
 * shmem_get_folio - find, and lock a shmem folio.
 * @inode:	inode to search
 * @index:	the page index.
 * @write_end:	end of a write, could extend inode size
 * @foliop:	pointer to the folio if found
 * @sgp:	SGP_* flags to control behavior
 *
 * Looks up the page cache entry at @inode & @index, using the gfp mask of
 * the inode's mapping.  If a folio is present, it is returned locked with
 * an increased refcount.
 *
 * A caller that modifies data in the folio must call folio_mark_dirty()
 * before unlocking it, so that the folio is not reclaimed.  No space needs
 * to be reserved before calling folio_mark_dirty().
 *
 * When no folio is found, the behavior depends on @sgp:
 *  - for SGP_READ, *@foliop is %NULL and 0 is returned
 *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
 *  - for all other flags a new folio is allocated, inserted into the
 *    page cache and returned locked in @foliop.
 *
 * Context: May sleep.
 * Return: 0 if successful, else a negative error code.
 */
int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
		    struct folio **foliop, enum sgp_type sgp)
{
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);

	/* Not a fault path: no vm_fault and no fault_type to report into. */
	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, gfp,
				   NULL, NULL);
}
EXPORT_SYMBOL_GPL(shmem_get_folio);
/*
 * Wake callback resembling autoremove_wake_function, except that the wait
 * queue entry is taken off the list unconditionally - even when something
 * else had already woken the target.
 *
 * Returns whatever default_wake_function() returned.
 */
static int synchronous_wake_function(wait_queue_entry_t *wait,
				     unsigned int mode, int sync, void *key)
{
	int woken;

	woken = default_wake_function(wait, mode, sync, key);

	/* Dequeue regardless of the wake result. */
	list_del_init(&wait->entry);

	return woken;
}
/* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, * and the i_mmap tree grows ever slower to scan if new vmas are added. * * It does not matter if we sometimes reach this check just before the * hole-punch begins, so that one fault then races with the punch: * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad.
*/ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
{ struct shmem_falloc *shmem_falloc; struct file *fpin = NULL;
vm_fault_t ret = 0;
/* * shmem_falloc_waitq points into the shmem_fallocate() * stack of the hole-punching task: shmem_falloc_waitq * is usually invalid by the time we reach here, but * finish_wait() does not dereference it in that case; * though i_lock needed lest racing with wake_up_all().
*/
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
}
spin_unlock(&inode->i_lock); if (fpin) {
fput(fpin);
ret = VM_FAULT_RETRY;
} return ret;
}
/* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: noted in i_private.
*/ if (unlikely(inode->i_private)) {
ret = shmem_falloc_wait(vmf, inode); if (ret) return ret;
}
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; if (addr & ~PAGE_MASK) return addr; if (addr > TASK_SIZE - len) return addr;
if (shmem_huge == SHMEM_HUGE_DENY) return addr; if (flags & MAP_FIXED) return addr; /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. * But if caller specified an address hint and we allocated area there * successfully, respect that as before.
*/ if (uaddr == addr) return addr;
hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; unsignedlong __maybe_unused hpage_orders; int order = 0;
if (file) {
VM_BUG_ON(file->f_op != &shmem_file_operations);
sb = file_inode(file)->i_sb;
} else { /* * Called directly from mm/mmap.c, or drivers/char/mem.c * for "/dev/zero", to create a shared anonymous object.
*/ if (IS_ERR(shm_mnt)) return addr;
sb = shm_mnt->mnt_sb;
/* * Find the highest mTHP order used for anonymous shmem to * provide a suitable alignment address.
*/ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
hpage_orders = READ_ONCE(huge_shmem_orders_always);
hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
if (hpage_orders > 0) {
order = highest_order(hpage_orders);
hpage_size = PAGE_SIZE << order;
} #endif
} if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr;
}
/* * Bias interleave by inode number to distribute better across nodes; * but this interface is independent of which page order is used, so * supplies only that bias, letting caller apply the offset (adjusted * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
*/
*ilx = inode->i_ino;
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{ struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM;
/* * What serializes the accesses to info->flags? * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy().
*/ if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
} if (!lock && (info->flags & VM_LOCKED) && ucounts) {
user_shm_unlock(inode->i_size, ucounts);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
retval = 0;
file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink)
vma->vm_ops = &shmem_vm_ops; else
vma->vm_ops = &shmem_anon_vm_ops; return 0;
}
#if IS_ENABLED(CONFIG_UNICODE) /* * shmem_inode_casefold_flags - Deal with casefold file attribute flag * * The casefold file attribute needs some special checks. It can only be added to * an empty dir, and can't be removed from a non-empty dir.
*/ staticint shmem_inode_casefold_flags(struct inode *inode, unsignedint fsflags, struct dentry *dentry, unsignedint *i_flags)
{ unsignedint old = inode->i_flags; struct super_block *sb = inode->i_sb;
if (fsflags & FS_CASEFOLD_FL) { if (!(old & S_CASEFOLD)) { if (!sb->s_encoding) return -EOPNOTSUPP;
if (!S_ISDIR(inode->i_mode)) return -ENOTDIR;
if (dentry && !simple_empty(dentry)) return -ENOTEMPTY;
}
/* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option.
*/ staticint shmem_set_inode_flags(struct inode *inode, unsignedint fsflags, struct dentry *dentry)
{ unsignedint i_flags = 0; int ret;
ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); if (ret) return ret;
if (fsflags & FS_NOATIME_FL)
i_flags |= S_NOATIME; if (fsflags & FS_APPEND_FL)
i_flags |= S_APPEND; if (fsflags & FS_IMMUTABLE_FL)
i_flags |= S_IMMUTABLE; /* * But FS_NODUMP_FL does not require any action in i_flags.
*/
inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to * avoid a BUG_ON in our caller.
*/ if (unlikely(*foliop)) {
folio_put(*foliop);
*foliop = NULL;
} return -ENOMEM;
}
if (!*foliop) {
ret = -ENOMEM;
folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
page_kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the * mmap_lock being read recursive a deadlock is still * possible if a writer has taken a lock. For example: * * process A thread 1 takes read lock on own mmap_lock * process A thread 2 calls mmap, blocks taking write lock * process B thread 1 takes page fault, read lock on own mmap lock * process B thread 2 calls mmap, blocks taking write lock * process A thread 1 blocks taking read lock on process B * process B thread 1 blocks taking read lock on process A * * Disable page faults to prevent potential deadlock * and retry the copy outside the mmap_lock.
*/
pagefault_disable();
ret = copy_from_user(page_kaddr,
(constvoid __user *)src_addr,
PAGE_SIZE);
pagefault_enable();
kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) {
*foliop = folio;
ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks;
}
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release;
ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); if (ret) goto out_release;
ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release;
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
&folio->page, true, flags); if (ret) goto out_delete_from_cache;
if (folio_test_large(folio) &&
folio_test_has_hwpoisoned(folio))
fallback_page_copy = true;
}
/* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate
*/
i_size = i_size_read(inode); if (unlikely(iocb->ki_pos >= i_size)) { if (folio)
folio_put(folio); break;
}
end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy))
fsize = folio_size(folio); else
fsize = PAGE_SIZE;
offset = iocb->ki_pos & (fsize - 1);
nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);
if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side.
*/ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_copy))
flush_dcache_folio(folio); else
flush_dcache_page(page);
}
/* * Mark the folio accessed if we read the beginning.
*/ if (!offset)
folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space...
*/ if (likely(!fallback_page_copy))
ret = copy_folio_to_iter(folio, offset, nr, to); else
ret = copy_page_to_iter(page, offset, nr, to);
folio_put(folio);
} elseif (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but * clear_user() not so much, that it is noticeably * faster to copy the zero page instead of clearing.
*/
ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
} else { /* * But submitting the same page twice in a row to * splice() - or others? - can result in confusion: * so don't attempt that optimization on pipes etc.
*/
ret = iov_iter_zero(nr, to);
}
retval += ret;
iocb->ki_pos += ret;
if (!iov_iter_count(to)) break; if (ret < nr) {
error = -EFAULT; break;
}
cond_resched();
}
inode_lock(inode);
ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock;
ret = file_remove_privs(file); if (ret) goto unlock;
ret = file_update_time(file); if (ret) goto unlock;
ret = generic_perform_write(iocb, from);
unlock:
inode_unlock(inode); return ret;
}
/* Work out how much data we can actually add into the pipe */
used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
if (folio_test_large(folio) &&
folio_test_has_hwpoisoned(folio))
fallback_page_splice = true;
}
/* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; /* * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned * pages.
*/
size = len; if (unlikely(fallback_page_splice)) {
size_t offset = *ppos & ~PAGE_MASK;
if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side.
*/ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_splice))
flush_dcache_folio(folio); else
flush_dcache_page(page);
}
folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can * now splice it into the pipe.
*/
n = splice_folio_into_pipe(pipe, folio, *ppos, part);
folio_put(folio);
folio = NULL;
} else {
n = splice_zeropage_into_pipe(pipe, *ppos, part);
}
if (!n) break;
len -= n;
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) break;
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
error = inode_newsize_ok(inode, offset + len); if (error) goto out;
if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
error = -EPERM; goto out;
}
start = offset >> PAGE_SHIFT;
end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
error = -ENOSPC; goto out;
}
/* * info->fallocend is only relevant when huge pages might be * involved: to prevent split_huge_page() freeing fallocated * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
*/
undo_fallocend = info->fallocend; if (info->fallocend < end)
info->fallocend = end;
for (index = start; index < end; ) { struct folio *folio;
/* * Check for fatal signal so that we abort early in OOM * situations. We don't want to abort in case of non-fatal * signals as large fallocate can take noticeable time and * e.g. periodic timers may result in fallocate constantly * restarting.
*/ if (fatal_signal_pending(current))
error = -EINTR; elseif (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM; else
error = shmem_get_folio(inode, index, offset + len,
&folio, SGP_FALLOC); if (error) {
info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ if (index > start) {
shmem_undo_range(inode,
(loff_t)start << PAGE_SHIFT,
((loff_t)index << PAGE_SHIFT) - 1, true);
} goto undone;
}
/* * Here is a more important optimization than it appears: * a second SGP_FALLOC on the same large folio will clear it, * making it uptodate and un-undoable if we fail later.
*/
index = folio_next_index(folio); /* Beware 32-bit wraparound */ if (!index)
index--;
/* * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock.
*/ if (!folio_test_uptodate(folio))
shmem_falloc.nr_falloced += index - shmem_falloc.next;
shmem_falloc.next = index;
/* * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. * But mark it dirty so that memory pressure will swap rather * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too).
*/
folio_mark_dirty(folio);
folio_unlock(folio);
folio_put(folio);
cond_resched();
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
undone:
spin_lock(&inode->i_lock);
inode->i_private = NULL;
spin_unlock(&inode->i_lock);
out: if (!error)
file_modified(file);
inode_unlock(inode); return error;
}
/*
 * Create a new hard link @dentry in directory @dir to the inode behind
 * @old_dentry.
 *
 * Returns 0 on success, or the error from shmem_reserve_inode() or
 * simple_offset_add().
 */
static int shmem_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct inode *inode = d_inode(old_dentry);
	int err;

	/*
	 * Ordinary (disk based) filesystems do not count links as inodes;
	 * but here each new link needs a new dentry, pinning lowmem, and
	 * tmpfs dentries cannot be pruned until they are unlinked.
	 * An O_TMPFILE file being linked into the tmpfs has i_nlink == 0:
	 * its first link must skip the reservation, to get the accounting
	 * right.
	 */
	if (inode->i_nlink) {
		err = shmem_reserve_inode(inode->i_sb, NULL);
		if (err)
			return err;
	}

	err = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
	if (err) {
		/* Give back the reservation made above, if there was one. */
		if (inode->i_nlink)
			shmem_free_inode(inode->i_sb, 0);
		return err;
	}

	/* Directory bookkeeping: pretend size, timestamps, version. */
	dir->i_size += BOGO_DIRENT_SIZE;
	inode_set_mtime_to_ts(dir,
		inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	inode_inc_iversion(dir);

	inc_nlink(inode);
	ihold(inode);	/* New dentry reference */
	dget(dentry);	/* Extra pinning count for the created dentry */

	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
		d_add(dentry, inode);
	else
		d_instantiate(dentry, inode);

	return 0;
}
dir->i_size -= BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - does all the work */
/* * For now, VFS can't deal with case-insensitive negative dentries, so * we invalidate them
*/ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
/* * Cheat and hash the whiteout while the old dentry is still in * place, instead of playing games with FS_RENAME_DOES_D_MOVE. * * d_lookup() will consistently find one of them at this point, * not sure which one, but that isn't even important.
*/
d_rehash(whiteout); return 0;
}
/* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly frees it when it * gets overwritten.
*/ staticint shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsignedint flags)
{ struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); int error;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL;
if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry,
new_dir, new_dentry);
if (!simple_empty(new_dentry)) return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error;
}
error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error;
if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) {
drop_nlink(d_inode(new_dentry));
drop_nlink(old_dir);
}
} elseif (they_are_dirs) {
drop_nlink(old_dir);
inc_nlink(new_dir);
}
/* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though.
*/
/* Find any alias of inode, but prefer a hashed alias */ staticstruct dentry *shmem_find_alias(struct inode *inode)
{ struct dentry *alias = d_find_alias(inode);
if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once
*/ static DEFINE_SPINLOCK(lock);
spin_lock(&lock); if (inode_unhashed(inode))
__insert_inode_hash(inode,
inode->i_ino + inode->i_generation);
spin_unlock(&lock);
}
if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>");
version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
}
/* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas.
*/ for (;;) {
p = strchr(*s, ','); if (p == NULL) break;
*s = p + 1; if (!isdigit(*(p+1))) {
*p = '\0'; return sbegin;
}
}
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) {
err = "Cannot retroactively limit size"; goto out;
} if (percpu_counter_compare(&sbinfo->used_blocks,
ctx->blocks) > 0) {
err = "Too small a size for current use"; goto out;
}
} if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) {
err = "Cannot retroactively limit inodes"; goto out;
} if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
err = "Too few inodes for current use"; goto out;
}
}
if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
sbinfo->next_ino > UINT_MAX) {
err = "Current inum too high to switch to 32-bit inums"; goto out;
} if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
err = "Cannot disable swap on remount"; goto out;
} if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
err = "Cannot enable swap on remount if it was disabled on first mount"; goto out;
}
if (ctx->seen & SHMEM_SEEN_QUOTA &&
!sb_any_quota_loaded(fc->root->d_sb)) {
err = "Cannot enable quota on remount"; goto out;
}
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX))
seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
seq_printf(seq, ",uid=%u",
from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
/* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ *
*/ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif
mpol = shmem_get_sbmpol(sbinfo);
shmem_show_mpol(seq, mpol);
mpol_put(mpol); if (sbinfo->noswap)
seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA))
seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA))
seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit)
seq_printf(seq, ",usrquota_block_hardlimit=%lld",
sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit)
seq_printf(seq, ",grpquota_block_hardlimit=%lld",
sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit)
seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit)
seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
sbinfo->qlimits.grpquota_ihardlimit); #endif return 0;
}
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error;
sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited.
*/ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS))
ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
sbinfo->noswap = ctx->noswap;
} else {
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
sb->s_flags |= SB_NOSEC | SB_I_VERSION;
#if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) {
pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
error = -EINVAL; goto failed;
}
if (ctx->encoding) {
sb->s_encoding = ctx->encoding;
set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding)
sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
} #endif
error = register_filesystem(&shmem_fs_type); if (error) {
pr_err("Could not register tmpfs\n"); goto out2;
}
shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
pr_err("Could not kern_mount tmpfs\n"); goto out1;
}
#ifdefined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
error = tmpfs_sysfs_init(); if (error) {
pr_err("Could not init tmpfs sysfs\n"); goto out1;
} #endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else
shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
/* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs.
*/ if (!shmem_orders_configured)
huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return;
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
*/
if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL);
if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM);
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) {
shmem_unacct_size(flags, size); return ERR_CAST(inode);
}
inode->i_flags |= i_flags;
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (!IS_ERR(res))
res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
&shmem_file_operations); if (IS_ERR(res))
iput(inode); return res;
}
/**
 * shmem_kernel_file_setup - get an unlinked, kernel-internal file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount, and S_PRIVATE is passed
 * as the final argument to __shmem_file_setup().  There will be NO LSM
 * permission checks against the underlying inode, so users of this interface
 * must do LSM checks at a higher layer.  The users are the big_key and shm
 * implementations; LSM checks are provided at the key or shm level rather
 * than the inode.
 *
 * Return: the result of __shmem_file_setup(); callers in this file test it
 * with IS_ERR() before use.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size,
				     unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount; 0 is passed as the
 * final argument to __shmem_file_setup(), i.e. no extra flag such as
 * S_PRIVATE is requested.
 *
 * Return: the result of __shmem_file_setup() (presumably an ERR_PTR-encoded
 * error on failure -- confirm against that helper).
 */
struct file *shmem_file_setup(const char *name, loff_t size,
			      unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Same as shmem_file_setup(), except that the caller supplies the mount
 * instead of the internal shm_mnt being used.
 *
 * Return: the result of __shmem_file_setup() (presumably an ERR_PTR-encoded
 * error on failure -- confirm against that helper).
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
/** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap
*/ int shmem_zero_setup(struct vm_area_struct *vma)
{ struct file *file;
loff_t size = vma->vm_end - vma->vm_start;
/* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup().
*/
file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file);
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_anon_vm_ops;
return 0;
}
/** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
*/ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp)
{ #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error;
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.106Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.