/* Source: filemap.c, language: C */

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/filemap.c
*
* Copyright (C) 1994-1999  Linus Torvalds
*/

/*
* This file handles the generic file mmap semantics used by
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
*/
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
*
* Shared mappings now work. 15.8.1995  Bruno.
*
* finished 'unifying' the page and buffer cache and SMP-threaded the
* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
*
* SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
*/

/*
* Lock ordering:
*
*  ->i_mmap_rwsem (truncate_pagecache)
*    ->private_lock (__free_pte->block_dirty_folio)
*      ->swap_lock (exclusive_swap_page, others)
*        ->i_pages lock
*
*  ->i_rwsem
*    ->invalidate_lock (acquired by fs in truncate path)
*      ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
*  ->mmap_lock
*    ->i_mmap_rwsem
*      ->page_table_lock or pte_lock (various, mainly in memory.c)
*        ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
*  ->mmap_lock
*    ->invalidate_lock (filemap_fault)
*      ->lock_page (filemap_fault, access_process_vm)
*
*  ->i_rwsem (generic_perform_write)
*    ->mmap_lock (fault_in_readable->do_page_fault)
*
*  bdi->wb.list_lock
*    sb_lock (fs/fs-writeback.c)
*    ->i_pages lock (__sync_single_inode)
*
*  ->i_mmap_rwsem
*    ->anon_vma.lock (vma_merge)
*
*  ->anon_vma.lock
*    ->page_table_lock or pte_lock (anon_vma_prepare and various)
*
*  ->page_table_lock or pte_lock
*    ->swap_lock (try_to_unmap_one)
*    ->private_lock (try_to_unmap_one)
*    ->i_pages lock (try_to_unmap_one)
*    ->lruvec->lru_lock (follow_page_mask->mark_page_accessed)
*    ->lruvec->lru_lock (check_pte_range->folio_isolate_lru)
*    ->private_lock (folio_remove_rmap_pte->set_page_dirty)
*    ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
*    bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
*    ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
*    bdi.wb->list_lock (zap_pte_range->set_page_dirty)
*    ->inode->i_lock (zap_pte_range->set_page_dirty)
*    ->private_lock (zap_pte_range->block_dirty_folio)
*/

/*
 * Store @shadow over @folio's entry in @mapping->i_pages, clear the marks
 * on that entry and detach the folio from the mapping.  The folio must be
 * locked (asserted below).
 */
static void page_cache_delete(struct address_space *mapping,
				   struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr_pages;

	mapping_set_update(&xas, mapping);

	/* Address the whole folio (all of its indices) with a single store. */
	xas_set_order(&xas, folio->index, folio_order(folio));
	nr_pages = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	/* folio->index is left set: truncation lookup relies upon it. */
	folio->mapping = NULL;
	mapping->nrpages -= nr_pages;
}

static void filemap_unaccount_folio(struct address_space *mapping,
  struct folio *folio)
{
long nr;

VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
  pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
    current->comm, folio_pfn(folio));
  dump_page(&folio->page, "still mapped when deleted");
  dump_stack();
  add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

  if (mapping_exiting(mapping) && !folio_test_large(folio)) {
   int mapcount = folio_mapcount(folio);

   if (folio_ref_count(folio) >= mapcount + 2) {
    /*
* All vmas have already been torn down, so it's
* a good bet that actually the page is unmapped
* and we'd rather not leak it: if we're wrong,
* another bad page check should catch it later.
*/
    atomic_set(&folio->_mapcount, -1);
    folio_ref_sub(folio, mapcount);
   }
  }
}

/* hugetlb folios do not participate in page cache accounting. */
if (folio_test_hugetlb(folio))
  return;

nr = folio_nr_pages(folio);

__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
if (folio_test_swapbacked(folio)) {
  __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
  if (folio_test_pmd_mappable(folio))
   __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
} else if (folio_test_pmd_mappable(folio)) {
  __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
  filemap_nr_thps_dec(mapping);
}

/*
* At this point folio must be either written or cleaned by
* truncate.  Dirty folio here signals a bug and loss of
* unwritten data - on ordinary filesystems.
*
* But it's harmless on in-memory filesystems like tmpfs; and can
* occur when a driver which did get_user_pages() sets page dirty
* before putting it, while the inode is being finally evicted.
*
* Below fixes dirty accounting after removing the folio entirely
* but leaves the dirty flag set: it has no effect for truncated
* folio and anyway will be cleared before returning folio to
* buddy allocator.
*/
if (WARN_ON_ONCE(folio_test_dirty(folio) &&
    mapping_can_writeback(mapping)))
  folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

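/*
* Remove a folio from the page cache: unaccount it and replace its
* i_pages entry with @shadow.  The folio itself is not freed here; see
* filemap_free_folio().  Caller has to make sure the folio is locked and
* that nobody else uses it - or that usage is safe.  The caller must hold
* the i_pages lock.
*/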
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	/*
	 * page_cache_delete() clears folio->mapping, so the owning mapping
	 * is captured once up front and handed to both helpers.
	 */
	struct address_space *const owner = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);

	filemap_unaccount_folio(owner, folio);
	page_cache_delete(owner, folio, shadow);
}

/*
 * Run the mapping's optional ->free_folio() callback on @folio, then drop
 * folio_nr_pages(@folio) references on it (the same count that
 * __filemap_add_folio() takes with folio_ref_add()).
 */
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*release)(struct folio *) = mapping->a_ops->free_folio;

	if (release)
		release(folio);

	folio_put_refs(folio, folio_nr_pages(folio));
}

/**
* filemap_remove_folio - Remove folio from page cache.
* @folio: The folio.
*
* This must be called only on folios that are locked and have been
* verified to be in the page cache.  It will never put the folio into
* the free list because the caller has a reference on the page.
*/
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping;

	BUG_ON(!folio_test_locked(folio));
	mapping = folio->mapping;

	/* The inode's i_lock is taken outside the i_pages lock. */
	spin_lock(&mapping->host->i_lock);

	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);

	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);

	spin_unlock(&mapping->host->i_lock);

	/* Callback and reference drop happen with both locks released. */
	filemap_free_folio(mapping, folio);
}

/*
* page_cache_delete_batch - delete several folios from page cache
* @mapping: the mapping to which folios belong
* @fbatch: batch of folios to delete
*
* The function walks over mapping->i_pages and removes folios passed in
* @fbatch from the mapping. The function expects @fbatch to be sorted
* by page index and is optimised for it to be dense.
* It tolerates holes in @fbatch (mapping entries at those indices are not
* modified).
*
* The function expects the i_pages lock to be held.
*/
static void page_cache_delete_batch(struct address_space *mapping,
        struct folio_batch *fbatch)
{
/* Start the walk at the index of the first folio in the batch. */
XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
/* Sum of folio_nr_pages() over every folio actually removed. */
long total_pages = 0;
/* Position in @fbatch of the next folio we expect to meet in the walk. */
int i = 0;
struct folio *folio;

mapping_set_update(&xas, mapping);
xas_for_each(&xas, folio, ULONG_MAX) {
  /* Every batch entry has been handled: stop walking the mapping. */
  if (i >= folio_batch_count(fbatch))
   break;

  /* A swap/dax/shadow entry got inserted? Skip it. */
  if (xa_is_value(folio))
   continue;
  /*
* A page got inserted in our range? Skip it. We have our
* pages locked so they are protected from being removed.
* If we see a page whose index is higher than ours, it
* means our page has been removed, which shouldn't be
* possible because we're holding the PageLock.
*/
  if (folio != fbatch->folios[i]) {
   VM_BUG_ON_FOLIO(folio->index >
     fbatch->folios[i]->index, folio);
   continue;
  }

  WARN_ON_ONCE(!folio_test_locked(folio));

  folio->mapping = NULL;
  /* Leave folio->index set: truncation lookup relies on it */

  /* Advance to the next batch entry, then clear this entry in i_pages. */
  i++;
  xas_store(&xas, NULL);
  total_pages += folio_nr_pages(folio);
}
/* nrpages is adjusted once, after the walk, by the accumulated total. */
mapping->nrpages -= total_pages;
}

/*
 * Remove every folio in @fbatch from @mapping's page cache.
 *
 * Under the inode's i_lock and the i_pages lock each folio is traced and
 * unaccounted, then all of them are deleted from i_pages in one pass by
 * page_cache_delete_batch().  After the locks are dropped each folio is
 * passed to filemap_free_folio().  An empty batch is a no-op.
 */
void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	unsigned int nr = folio_batch_count(fbatch);
	unsigned int idx;

	if (nr == 0)
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);

	for (idx = 0; idx < nr; idx++) {
		trace_mm_filemap_delete_from_page_cache(fbatch->folios[idx]);
		filemap_unaccount_folio(mapping, fbatch->folios[idx]);
	}
	page_cache_delete_batch(mapping, fbatch);

	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (idx = 0; idx < nr; idx++)
		filemap_free_folio(mapping, fbatch->folios[idx]);
}

/*
 * Report and clear the outstanding write-error flags of @mapping.
 *
 * Both AS_ENOSPC and AS_EIO are cleared if set.  Returns -EIO if AS_EIO
 * was set, otherwise -ENOSPC if AS_ENOSPC was set, otherwise 0.
 */
int filemap_check_errors(struct address_space *mapping)
{
	int err = 0;

	/*
	 * Each flag is tested with test_bit() before test_and_clear_bit()
	 * is attempted - presumably to avoid the read-modify-write when no
	 * error is pending.
	 */
	if (test_bit(AS_ENOSPC, &mapping->flags)) {
		if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
			err = -ENOSPC;
	}

	/* Checked last so that -EIO overrides -ENOSPC. */
	if (test_bit(AS_EIO, &mapping->flags)) {
		if (test_and_clear_bit(AS_EIO, &mapping->flags))
			err = -EIO;
	}

	return err;
}
EXPORT_SYMBOL(filemap_check_errors);

/*
 * Report the outstanding write-error flags of @mapping without clearing
 * them.  Returns -EIO if AS_EIO is set, else -ENOSPC if AS_ENOSPC is set,
 * else 0.
 */
static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	int err = 0;

	if (test_bit(AS_EIO, &mapping->flags))
		err = -EIO;
	else if (test_bit(AS_ENOSPC, &mapping->flags))
		err = -ENOSPC;

	return err;
}

/**
* filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @wbc: the writeback_control controlling the writeout
*
* Call writepages on the mapping using the provided wbc to control the
* writeout.
*
* Return: %0 on success, negative error code otherwise.
*/
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int err;

	/* Nothing to do for mappings that cannot be written back ... */
	if (!mapping_can_writeback(mapping))
		return 0;
	/* ... or that have nothing tagged dirty. */
	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	/* The wbc is attached to the inode only for the do_writepages() call. */
	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	err = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);

	return err;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
* @sync_mode: enable synchronous operation
*
* Start writeback against all of a mapping's dirty pages that lie
* within the byte offsets <start, end> inclusive.
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
* opposed to a regular memory cleansing writeback.  The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*
* Return: %0 on success, negative error code otherwise.
*/
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	/* Byte range <start, end> with nr_to_write set to LONG_MAX. */
	struct writeback_control ctl = {
		.range_start = start,
		.range_end = end,
		.nr_to_write = LONG_MAX,
		.sync_mode = sync_mode,
	};

	return filemap_fdatawrite_wbc(mapping, &ctl);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
int sync_mode)
{
return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

/*
 * Start WB_SYNC_ALL writeback for the whole of @mapping.
 * Returns whatever __filemap_fdatawrite() returns.
 */
int filemap_fdatawrite(struct address_space *mapping)
{
	const int sync_mode = WB_SYNC_ALL;

	return __filemap_fdatawrite(mapping, sync_mode);
}
EXPORT_SYMBOL(filemap_fdatawrite);

/*
 * Start WB_SYNC_ALL writeback for the byte range <start, end> (inclusive)
 * of @mapping.  Returns whatever __filemap_fdatawrite_range() returns.
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	const int sync_mode = WB_SYNC_ALL;

	return __filemap_fdatawrite_range(mapping, start, end, sync_mode);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
* filemap_fdatawrite_range_kick - start writeback on a range
* @mapping: target address_space
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
*
* This is a non-integrity writeback helper, to start writing back folios
* for the indicated range.
*
* Return: %0 on success, negative error code otherwise.
*/
int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
				  loff_t end)
{
	/* WB_SYNC_NONE: start the writeback, not a data-integrity write. */
	const int sync_mode = WB_SYNC_NONE;

	return __filemap_fdatawrite_range(mapping, start, end, sync_mode);
}
EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);

/**
* filemap_flush - mostly a non-blocking flush
* @mapping: target address_space
*
* This is a mostly non-blocking flush.  Not suitable for data-integrity
* purposes - I/O may not be started against all dirty pages.
*
* Return: %0 on success, negative error code otherwise.
*/
int filemap_flush(struct address_space *mapping)
{
	/* Whole mapping, WB_SYNC_NONE. */
	const int sync_mode = WB_SYNC_NONE;

	return __filemap_fdatawrite(mapping, sync_mode);
}
EXPORT_SYMBOL(filemap_flush);

/**
* filemap_range_has_page - check if a page exists in range.
* @mapping:           address space within which to check
* @start_byte:        offset in bytes where the range starts
* @end_byte:          offset in bytes where the range ends (inclusive)
*
* Find at least one page in the range supplied, usually used to check if
* direct writing in this range will trigger a writeback.
*
* Return: %true if at least one page exists in the specified range,
* %false otherwise.
*/
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t last = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	/* An inverted range contains nothing. */
	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	/*
	 * Keep searching while the lookup has to be retried or the entry
	 * found is a value entry (shadow entries don't count).  The loop
	 * ends on the first real folio or on NULL when the range is
	 * exhausted.
	 *
	 * There is no need to pin the folio: the RCU lock is about to be
	 * released anyway, and it is enough to know that there was a page
	 * here recently.
	 */
	do {
		folio = xas_find(&xas, last);
	} while (xas_retry(&xas, folio) || xa_is_value(folio));
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

/*
 * Wait for writeback to finish on every folio of @mapping that is tagged
 * PAGECACHE_TAG_WRITEBACK within the byte range <start_byte, end_byte>
 * (inclusive).  Folios are fetched a batch at a time; no error status is
 * checked or returned here.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	struct folio_batch fbatch;
	pgoff_t next = start_byte >> PAGE_SHIFT;
	pgoff_t last = end_byte >> PAGE_SHIFT;

	folio_batch_init(&fbatch);

	while (next <= last) {
		unsigned int found;
		unsigned int idx;

		/* The lookup advances @next as it fills the batch. */
		found = filemap_get_folios_tag(mapping, &next, last,
				PAGECACHE_TAG_WRITEBACK, &fbatch);
		if (found == 0)
			break;

		for (idx = 0; idx < found; idx++)
			folio_wait_writeback(fbatch.folios[idx]);

		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
* filemap_fdatawait_range - wait for writeback to complete
* @mapping: address space structure to wait for
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Walk the list of under-writeback pages of the given address space
* in the given range and wait for all of them.  Check error status of
* the address space and return it.
*
* Since the error status of the address space is cleared by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
*
* Return: error status of the address space.
*/
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	int err;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);

	/* Reports AS_EIO / AS_ENOSPC and clears them on the mapping. */
	err = filemap_check_errors(mapping);
	return err;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
* filemap_fdatawait_range_keep_errors - wait for writeback to complete
* @mapping: address space structure to wait for
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Walk the list of under-writeback pages of the given address space in the
* given range and wait for all of them.  Unlike filemap_fdatawait_range(),
* this function does not clear error status of the address space.
*
* Use this function if callers don't handle errors themselves.  Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
*/
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	int err;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);

	/* Reports AS_EIO / AS_ENOSPC but leaves the flags set. */
	err = filemap_check_and_keep_errors(mapping);
	return err;
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
* file_fdatawait_range - wait for writeback to complete
* @file: file pointing to address space structure to wait for
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Walk the list of under-writeback pages of the address space that file
* refers to, in the given range and wait for all of them.  Check error
* status of the address space vs. the file->f_wb_err cursor and return it.
*
* Since the error status of the file is advanced by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
*
* Return: error status of the address space vs. the file->f_wb_err cursor.
*/
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	/* Wait on the address space the file refers to ... */
	__filemap_fdatawait_range(file->f_mapping, start_byte, end_byte);

	/* ... then report against, and advance, this file's f_wb_err. */
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
* @mapping: address space structure to wait for
*
* Walk the list of under-writeback pages of the given address space
* and wait for all of them.  Unlike filemap_fdatawait(), this function
* does not clear error status of the address space.
*
* Use this function if callers don't handle errors themselves.  Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
*
* Return: error status of the address space.
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/*
 * Returns true if writeback might be needed or already in progress.
 * The test is simply whether the mapping holds any pages at all.
 */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages != 0;
}

/*
 * Return true if at least one folio of @mapping within the byte range
 * <start_byte, end_byte> (inclusive) is dirty, locked or under writeback.
 * Value entries are ignored.  An inverted range yields false.  The walk is
 * done under the RCU read lock without pinning any folio.
 */
bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t last = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, last) {
		if (xas_retry(&xas, folio) || xa_is_value(folio))
			continue;

		if (folio_test_dirty(folio))
			break;
		if (folio_test_locked(folio))
			break;
		if (folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();

	/* Non-NULL only when the walk stopped early on a matching folio. */
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
* @lstart: offset in bytes where the range starts
* @lend: offset in bytes where the range ends (inclusive)
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
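* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file
* (lend = LLONG_MAX).  A negative @lend such as -1 does not mean
* end-of-file: the lend < lstart check makes the call a no-op whenever
* lstart >= 0.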
*
* Return: error status of the address space.
*/
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int write_err = 0;
	int flag_err;

	/* Empty (inverted) range: nothing to write, nothing to report. */
	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
						       WB_SYNC_ALL);
		/*
		 * A failed write may still have written some pages (e.g.
		 * -ENOSPC), so wait for those.  -EIO is the exception: it
		 * may indicate the worst thing (e.g. a bug) happened, so
		 * do not wait in that case.
		 */
		if (write_err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}

	/* Always consume the mapping's error flags; a write error wins. */
	flag_err = filemap_check_errors(mapping);
	return write_err ? write_err : flag_err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

/*
 * Record writeback error @err in @mapping->wb_err via errseq_set() and
 * emit the filemap_set_wb_err tracepoint with the value it returned.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t recorded;

	recorded = errseq_set(&mapping->wb_err, err);
	trace_filemap_set_wb_err(mapping, recorded);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
* file_check_and_advance_wb_err - report a not-yet-reported wb error (if any)
*    and advance the file's f_wb_err cursor to the current one
* @file: struct file on which the error is being reported
*
* When userland calls fsync (or something like nfsd does the equivalent), we
* want to report any writeback errors that occurred since the last fsync (or
* since the file was opened if there haven't been any).
*
* Grab the wb_err from the mapping. If it matches what we have in the file,
* then just quickly return 0. The file is all caught up.
*
* If it doesn't match, then take the mapping value, set the "seen" flag in
* it and try to swap it into place. If it works, or another task beat us
* to it with the new value, then update the f_wb_err and return the error
* portion. The error at this point must be reported via proper channels
* (a'la fsync, or NFS COMMIT operation, etc.).
*
* While we handle mapping->wb_err with atomic operations, the f_wb_err
* value is protected by the f_lock since we must ensure that it reflects
* the latest value swapped in for this file descriptor.
*
* Return: %0 on success, negative error code otherwise.
*/
int file_check_and_advance_wb_err(struct file *file)
{
	struct address_space *mapping = file->f_mapping;
	errseq_t seen = READ_ONCE(file->f_wb_err);
	int ret = 0;

	/* Lockless fast path: nothing changed since this file last looked. */
	if (errseq_check(&mapping->wb_err, seen)) {
		/*
		 * Slow path: f_wb_err is re-read and advanced under f_lock.
		 * The pre-advance value is what gets traced.
		 */
		spin_lock(&file->f_lock);
		seen = file->f_wb_err;
		ret = errseq_check_and_advance(&mapping->wb_err,
					       &file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, seen);
		spin_unlock(&file->f_lock);
	}

	/*
	 * This function mostly serves as a drop-in replacement for
	 * filemap_check_errors(), so clear AS_EIO/AS_ENOSPC to emulate the
	 * effect the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);

	return ret;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
* file_write_and_wait_range - write out & wait on a file range
* @file: file pointing to address_space with pages
* @lstart: offset in bytes where the range starts
* @lend: offset in bytes where the range ends (inclusive)
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
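* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file
* (lend = LLONG_MAX).  A negative @lend such as -1 does not mean
* end-of-file: the lend < lstart check makes the call a no-op whenever
* lstart >= 0.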
*
* After writing out and waiting on the data, we check and advance the
* f_wb_err cursor to the latest value, and return any errors detected there.
*
* Return: %0 on success, negative error code otherwise.
*/
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = file->f_mapping;
	int write_err = 0;
	int wb_err;

	/* Empty (inverted) range: nothing to write, nothing to report. */
	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
						       WB_SYNC_ALL);
		/*
		 * Same policy as filemap_write_and_wait_range(): wait even
		 * after a failed write, except when it failed with -EIO.
		 */
		if (write_err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}

	/* Always advance the file's f_wb_err cursor; a write error wins. */
	wb_err = file_check_and_advance_wb_err(file);
	return write_err ? write_err : wb_err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

/**
* replace_page_cache_folio - replace a pagecache folio with a new one
* @old: folio to be replaced
* @new: folio to replace with
*
* This function replaces a folio in the pagecache with a new one.  On
* success it acquires the pagecache reference for the new folio and
* drops it for the old folio.  Both the old and new folios must be
* locked.  This function does not add the new folio to the LRU, the
* caller must do that.
*
* The remove + add is atomic.  This function cannot fail.
*/
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
/* Everything needed from @old is captured before old->mapping is cleared. */
struct address_space *mapping = old->mapping;
void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);

/* Both folios must be locked and @new must not belong to a mapping yet. */
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
VM_BUG_ON_FOLIO(new->mapping, new);

/* Take the page cache's reference on @new and point it at the mapping. */
folio_get(new);
new->mapping = mapping;
new->index = offset;

mem_cgroup_replace_folio(old, new);

/* The entry swap and the statistics updates happen under the i_pages lock. */
xas_lock_irq(&xas);
xas_store(&xas, new);

old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
if (!folio_test_hugetlb(old))
  __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
if (!folio_test_hugetlb(new))
  __lruvec_stat_add_folio(new, NR_FILE_PAGES);
/* NR_SHMEM moves from @old to @new according to each folio's swapbacked flag. */
if (folio_test_swapbacked(old))
  __lruvec_stat_sub_folio(old, NR_SHMEM);
if (folio_test_swapbacked(new))
  __lruvec_stat_add_folio(new, NR_SHMEM);
xas_unlock_irq(&xas);
/* After unlocking: optional ->free_folio() callback, then drop the reference on @old. */
if (free_folio)
  free_folio(old);
folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

/*
 * __filemap_add_folio - insert a locked folio into a mapping's page cache
 * @mapping: address space to insert into
 * @folio: folio to insert; must be locked, not swapbacked, and of at least
 *         the mapping's minimum folio order (all asserted below)
 * @index: page index to insert at; must be aligned to the folio's size
 * @gfp: allocation flags, restricted to GFP_RECLAIM_MASK before use
 * @shadowp: if non-NULL, receives the value entry that was found at the
 *           target location and replaced (left untouched if there was none)
 *
 * Takes folio_nr_pages() references on @folio and sets folio->mapping and
 * folio->index before attempting the store.  A conflicting entry that is
 * not a value entry fails the insertion with -EEXIST.  A conflicting value
 * entry of larger order than @folio is split down to the folio's order
 * first.  The attempt is repeated for as long as xas_nomem() asks for it.
 *
 * Return: 0 on success, or the xarray error code on failure, in which case
 * folio->mapping is reset to NULL and the references taken are dropped.
 */
noinline int __filemap_add_folio(struct address_space *mapping,
  struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
bool huge;
long nr;
unsigned int forder = folio_order(folio);

VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
   folio);
mapping_set_update(&xas, mapping);

/* @index must be a multiple of the number of pages in the folio. */
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
huge = folio_test_hugetlb(folio);
nr = folio_nr_pages(folio);

gfp &= GFP_RECLAIM_MASK;
/* References and mapping/index are set up front and undone at "error". */
folio_ref_add(folio, nr);
folio->mapping = mapping;
folio->index = xas.xa_index;

for (;;) {
  /* order == -1 means "not sampled yet" for this attempt. */
  int order = -1;
  void *entry, *old = NULL;

  xas_lock_irq(&xas);
  xas_for_each_conflict(&xas, entry) {
   old = entry;
   /* Anything that is not a value entry makes the insertion fail. */
   if (!xa_is_value(entry)) {
    xas_set_err(&xas, -EEXIST);
    goto unlock;
   }
   /*
* If a larger entry exists,
* it will be the first and only entry iterated.
*/
   if (order == -1)
    order = xas_get_order(&xas);
  }

  if (old) {
   /* The existing value entry is larger than the folio: split it down. */
   if (order > 0 && order > forder) {
    unsigned int split_order = max(forder,
      xas_try_split_min_order(order));

    /* How to handle large swap entries? */
    BUG_ON(shmem_mapping(mapping));

    /* Split step by step until the entry's order equals forder. */
    while (order > forder) {
     xas_set_order(&xas, index, split_order);
     xas_try_split(&xas, old, order);
     if (xas_error(&xas))
      goto unlock;
     order = split_order;
     split_order =
      max(xas_try_split_min_order(
           split_order),
          forder);
    }
    xas_reset(&xas);
   }
   /* Hand the replaced value entry back to the caller if it asked for it. */
   if (shadowp)
    *shadowp = old;
  }

  xas_store(&xas, folio);
  if (xas_error(&xas))
   goto unlock;

  mapping->nrpages += nr;

  /* hugetlb pages do not participate in page cache accounting */
  if (!huge) {
   __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
   if (folio_test_pmd_mappable(folio))
    __lruvec_stat_mod_folio(folio,
      NR_FILE_THPS, nr);
  }

unlock:
  xas_unlock_irq(&xas);

  /* Loop again only if xas_nomem() reports that a retry is worthwhile. */
  if (!xas_nomem(&xas, gfp))
   break;
}

if (xas_error(&xas))
  goto error;

trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
folio->mapping = NULL;
/* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

/*
 * Charge @folio to the memory cgroup, lock it, insert it into @mapping at
 * @index and, on success, add it to the LRU.
 *
 * Returns 0 on success with the folio left locked.  On failure returns the
 * error from mem_cgroup_charge() or __filemap_add_folio(); in the latter
 * case the charge is undone and the lock bit is cleared again.
 */
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int err;

	err = mem_cgroup_charge(folio, NULL, gfp);
	if (err)
		return err;

	__folio_set_locked(folio);
	err = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(err)) {
		mem_cgroup_uncharge(folio);
		__folio_clear_locked(folio);
		return err;
	}

	/*
	 * The folio might have been evicted from cache only recently, in
	 * which case it should be activated like any other repeatedly
	 * accessed folio.
	 *
	 * Folios that are being rewritten are the exception: evicting other
	 * data from the working set, only to cache data that will get
	 * overwritten with something else, is a waste of memory.
	 */
	WARN_ON_ONCE(folio_test_active(folio));
	if (shadow && !(gfp & __GFP_WRITE))
		workingset_refault(folio, shadow);
	folio_add_lru(folio);

	return 0;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
/*
 * Allocate a folio of the given @order for the page cache (NUMA builds).
 *
 * Without cpuset memory spreading this is plain folio_alloc_noprof().
 * With spreading, the folio is allocated on the node picked by
 * cpuset_mem_spread_node().  A failed allocation is retried for as long
 * as read_mems_allowed_retry() says so.
 *
 * Returns the folio, or NULL if the allocation failed.
 */
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
	struct folio *folio;
	unsigned int cookie;
	int nid;

	if (!cpuset_do_page_mem_spread())
		return folio_alloc_noprof(gfp, order);

	do {
		cookie = read_mems_allowed_begin();
		nid = cpuset_mem_spread_node();
		folio = __folio_alloc_node_noprof(gfp, order, nid);
	} while (!folio && read_mems_allowed_retry(cookie));

	return folio;
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

/*
* filemap_invalidate_lock_two - lock invalidate_lock for two mappings
*
* Lock exclusively invalidate_lock of any passed mapping that is not NULL.
*
* @mapping1: the first mapping to lock
* @mapping2: the second mapping to lock
*/
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	struct address_space *first = mapping1;
	struct address_space *second = mapping2;

	/*
	 * Always lock the lower-addressed mapping first - presumably so two
	 * callers passing the same pair in opposite order cannot deadlock.
	 * A NULL mapping sorts first and is simply skipped.
	 */
	if (first > second) {
		first = mapping2;
		second = mapping1;
	}

	if (first)
		down_write(&first->invalidate_lock);

	/* The same mapping passed twice is locked only once. */
	if (second && second != first)
		down_write_nested(&second->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
* filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
*
* Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
*
* @mapping1: the first mapping to unlock
* @mapping2: the second mapping to unlock
*/
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);

	/* Nothing more to do if there is no second, distinct mapping. */
	if (!mapping2 || mapping2 == mapping1)
		return;

	up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
* waitqueues where the bucket discipline is to maintain all
* waiters on the same queue and wake all when any of the pages
* become available, and for the woken contexts to check to be
* sure the appropriate page became available, this saves space
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
/* Number of hash bits used to pick a wait queue: 2^8 = 256 buckets. */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
/* The shared wait queue heads; each entry is initialised in pagecache_init(). */
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/*
 * Return the wait queue head used for @folio: the folio pointer is hashed
 * down to PAGE_WAIT_TABLE_BITS bits to select a folio_wait_table bucket.
 */
static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	unsigned int bucket = hash_ptr(folio, PAGE_WAIT_TABLE_BITS);

	return &folio_wait_table[bucket];
}

/* How many times do we accept lock stealing from under a waiter? */
static int sysctl_page_lock_unfairness = 5;
/*
 * Sysctl table registered under "vm" by pagecache_init().  It exposes
 * sysctl_page_lock_unfairness as the integer "page_lock_unfairness"
 * (mode 0644).  proc_dointvec_minmax is given a lower bound of zero via
 * extra1; no upper bound (extra2) is set.
 */
static const struct ctl_table filemap_sysctl_table[] = {
{
  .procname = "page_lock_unfairness",
  .data  = &sysctl_page_lock_unfairness,
  .maxlen  = sizeof(sysctl_page_lock_unfairness),
  .mode  = 0644,
  .proc_handler = proc_dointvec_minmax,
  .extra1  = SYSCTL_ZERO,
}
};

/*
 * Boot-time (__init) page cache setup: initialise every wait queue head in
 * folio_wait_table, call page_writeback_init() and register
 * filemap_sysctl_table under "vm".
 */
void __init pagecache_init(void)
{
	unsigned int i;

	for (i = 0; i != PAGE_WAIT_TABLE_SIZE; ++i)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
	register_sysctl_init("vm", filemap_sysctl_table);
}

/*
* The page wait code treats the "wait->flags" somewhat unusually, because
* we have multiple different kinds of waits, not just the usual "exclusive"
* one.
*
* We have:
*
*  (a) no special bits set:
*
* We're just waiting for the bit to be released, and when a waker
* calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
* and remove it from the wait queue.
*
* Simple and straightforward.
*
*  (b) WQ_FLAG_EXCLUSIVE:
*
* The waiter is waiting to get the lock, and only one waiter should
* be woken up to avoid any thundering herd behavior. We'll set the
* WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
*
* This is the traditional exclusive wait.
*
*  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
*
* The waiter is waiting to get the bit, and additionally wants the
* lock to be transferred to it for fair lock behavior. If the lock
* cannot be taken, we stop walking the wait queue without waking
* the waiter.
*
* This is the "fair lock handoff" case, and in addition to setting
* WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
* that it now has the lock.
*/
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
/*
* Wake callback installed as wait->func on page waiters.
*
* wait: queue entry being considered; embedded in a wait_page_queue.
* mode: task state mask handed on to wake_up_state().
* sync: not used in this function.
* arg:  the struct wait_page_key describing what is being woken.
*
* Returns 0 if the entry does not match the key or a non-exclusive
* waiter was woken, -1 if an exclusive waiter could not be served
* because the bit is (again) set, and 1 if an exclusive waiter was
* woken.
*/
unsigned int flags;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
  = container_of(wait, struct wait_page_queue, wait);

if (!wake_page_match(wait_page, key))
  return 0;

/*
* If it's a lock handoff wait, we get the bit for it, and
* stop walking (and do not wake it up) if we can't.
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
  /* Somebody already holds the bit: leave this waiter queued. */
  if (test_bit(key->bit_nr, &key->folio->flags))
   return -1;
  if (flags & WQ_FLAG_CUSTOM) {
   /* Handoff: take the bit on the waiter's behalf. */
   if (test_and_set_bit(key->bit_nr, &key->folio->flags))
    return -1;
   flags |= WQ_FLAG_DONE;
  }
}

/*
* We are holding the wait-queue lock, but the waiter that
* is waiting for this will be checking the flags without
* any locking.
*
* So update the flags atomically, and wake up the waiter
* afterwards to avoid any races. This store-release pairs
* with the load-acquire in folio_wait_bit_common().
*/
smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);

/*
* Ok, we have successfully done what we're waiting for,
* and we can unconditionally remove the wait entry.
*
* Note that this pairs with the "finish_wait()" in the
* waiter, and has to be the absolute last thing we do.
* After this list_del_init(&wait->entry) the wait entry
* might be de-allocated and the process might even have
* exited.
*/
list_del_init_careful(&wait->entry);
/* Uses the local copy of the flags; *wait must not be touched any more. */
return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;

key.folio = folio;
key.bit_nr = bit_nr;
key.page_match = 0;

spin_lock_irqsave(&q->lock, flags);
__wake_up_locked_key(q, TASK_NORMAL, &key);

/*
* It's possible to miss clearing waiters here, when we woke our page
* waiters, but the hashed waitqueue has waiters for other pages on it.
* That's okay, it's a rare case. The next waker will clear it.
*
* Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
* other), the flag may be cleared in the course of freeing the page;
* but that is not required for correctness.
*/
if (!waitqueue_active(q) || !key.page_match)
  folio_clear_waiters(folio);

spin_unlock_irqrestore(&q->lock, flags);
}

/*
* A choice of three behaviors for folio_wait_bit_common():
*/
enum behavior {
/*
* Hold a reference to the page and take the bit when woken, like
* __folio_lock() waiting on and then setting PG_locked.
*/
EXCLUSIVE,
/*
* Hold a reference to the page and check the bit when woken, like
* folio_wait_writeback() waiting on PG_writeback.
*/
SHARED,
/*
* Drop the reference to the page before waiting and do not check the
* bit when woken, like folio_put_wait_locked() on PG_locked.
*/
DROP,
};

/*
* Attempt to check (or get) the folio flag, and mark us done
* if successful.
*/
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	bool busy;

	/*
	 * An exclusive waiter wants to own the bit, so it tries to set it;
	 * any other waiter only needs to observe that the bit is clear.
	 */
	if (wait->flags & WQ_FLAG_EXCLUSIVE)
		busy = test_and_set_bit(bit_nr, &folio->flags);
	else
		busy = test_bit(bit_nr, &folio->flags);

	if (busy)
		return false;

	/* Nothing left to wait for: record that in the wait entry. */
	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/*
* folio_wait_bit_common - common slow path for waiting on a folio flag bit.
* @folio: folio whose flag is waited on.
* @bit_nr: number of the flag bit.
* @state: task state to sleep in (callers in this file pass
*         TASK_UNINTERRUPTIBLE, TASK_KILLABLE or a caller-supplied state).
* @behavior: EXCLUSIVE, SHARED or DROP; see enum behavior.
*
* Returns 0 on success and -EINTR if the wait ended without the required
* flag being set on the wait entry: WQ_FLAG_DONE for an EXCLUSIVE wait,
* WQ_FLAG_WOKEN for the other behaviors.
*/
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
  int state, enum behavior behavior)
{
wait_queue_head_t *q = folio_waitqueue(folio);
int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
unsigned long pflags;
bool in_thrashing;

/*
* Waiting for the lock of a workingset folio that is not uptodate is
* accounted as thrashing (delayacct) and as a memory stall (psi).
* pflags and in_thrashing are only written and read on this path.
*/
if (bit_nr == PG_locked &&
     !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
  delayacct_thrashing_start(&in_thrashing);
  psi_memstall_enter(&pflags);
  thrashing = true;
}

init_wait(wait);
wait->func = wake_page_function;
wait_page.folio = folio;
wait_page.bit_nr = bit_nr;

repeat:
wait->flags = 0;
if (behavior == EXCLUSIVE) {
  wait->flags = WQ_FLAG_EXCLUSIVE;
  /* Unfairness budget used up: ask the waker for a lock handoff. */
  if (--unfairness < 0)
   wait->flags |= WQ_FLAG_CUSTOM;
}

/*
* Do one last check whether we can get the
* page bit synchronously.
*
* Do the folio_set_waiters() marking before that
* to let any waker we _just_ missed know they
* need to wake us up (otherwise they'll never
* even go to the slow case that looks at the
* page queue), and add ourselves to the wait
* queue if we need to sleep.
*
* This part needs to be done under the queue
* lock to avoid races.
*/
spin_lock_irq(&q->lock);
folio_set_waiters(folio);
if (!folio_trylock_flag(folio, bit_nr, wait))
  __add_wait_queue_entry_tail(q, wait);
spin_unlock_irq(&q->lock);

/*
* From now on, all the logic will be based on
* the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
* see whether the page bit testing has already
* been done by the wake function.
*
* We can drop our reference to the folio.
*/
if (behavior == DROP)
  folio_put(folio);

/*
* Note that until the "finish_wait()", or until
* we see the WQ_FLAG_WOKEN flag, we need to
* be very careful with the 'wait->flags', because
* we may race with a waker that sets them.
*/
for (;;) {
  unsigned int flags;

  set_current_state(state);

  /* Loop until we've been woken or interrupted */
  flags = smp_load_acquire(&wait->flags);
  if (!(flags & WQ_FLAG_WOKEN)) {
   if (signal_pending_state(state, current))
    break;

   io_schedule();
   continue;
  }

  /* If we were non-exclusive, we're done */
  if (behavior != EXCLUSIVE)
   break;

  /* If the waker got the lock for us, we're done */
  if (flags & WQ_FLAG_DONE)
   break;

  /*
* Otherwise, if we're getting the lock, we need to
* try to get it ourselves.
*
* And if that fails, we'll have to retry this all.
*/
  if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
   goto repeat;

  wait->flags |= WQ_FLAG_DONE;
  break;
}

/*
* If a signal happened, this 'finish_wait()' may remove the last
* waiter from the wait-queues, but the folio waiters bit will remain
* set. That's ok. The next wakeup will take care of it, and trying
* to do it here would be difficult and prone to races.
*/
finish_wait(q, wait);

if (thrashing) {
  delayacct_thrashing_end(&in_thrashing);
  psi_memstall_leave(&pflags);
}

/*
* NOTE! The wait->flags weren't stable until we've done the
* 'finish_wait()', and we could have exited the loop above due
* to a signal, and had a wakeup event happen after the signal
* test but before the 'finish_wait()'.
*
* So only after the finish_wait() can we reliably determine
* if we got woken up or not, so we can now figure out the final
* return value based on that state without races.
*
* Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
* waiter, but an exclusive one requires WQ_FLAG_DONE.
*/
if (behavior == EXCLUSIVE)
  return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
* migration_entry_wait_on_locked - Wait for a migration entry to be removed
* @entry: migration swap entry.
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
* equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
* should be called while holding the ptl for the migration entry referencing
* the page.
*
* Returns after unlocking the ptl.
*
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
__releases(ptl)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
/* No reference is taken on the folio; see the ptl comment below. */
struct folio *folio = pfn_swap_entry_folio(entry);

q = folio_waitqueue(folio);
/* Same thrashing/memory-stall accounting as folio_wait_bit_common(). */
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
  delayacct_thrashing_start(&in_thrashing);
  psi_memstall_enter(&pflags);
  thrashing = true;
}

init_wait(wait);
wait->func = wake_page_function;
wait_page.folio = folio;
wait_page.bit_nr = PG_locked;
/* No WQ_FLAG_EXCLUSIVE: we only wait for PG_locked, we never take it. */
wait->flags = 0;

/* Mark waiters and queue ourselves under the queue lock, unless already unlocked. */
spin_lock_irq(&q->lock);
folio_set_waiters(folio);
if (!folio_trylock_flag(folio, PG_locked, wait))
  __add_wait_queue_entry_tail(q, wait);
spin_unlock_irq(&q->lock);

/*
* If a migration entry exists for the page the migration path must hold
* a valid reference to the page, and it must take the ptl to remove the
* migration entry. So the page is valid until the ptl is dropped.
*/
spin_unlock(ptl);

for (;;) {
  unsigned int flags;

  set_current_state(TASK_UNINTERRUPTIBLE);

  /* Loop until we've been woken or interrupted */
  flags = smp_load_acquire(&wait->flags);
  if (!(flags & WQ_FLAG_WOKEN)) {
   /* NOTE(review): presumably never true for TASK_UNINTERRUPTIBLE - confirm */
   if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
    break;

   io_schedule();
   continue;
  }
  break;
}

/* Only the wait queue is touched from here on, not the folio. */
finish_wait(q, wait);

if (thrashing) {
  delayacct_thrashing_end(&in_thrashing);
  psi_memstall_leave(&pflags);
}
}
#endif

/*
* folio_wait_bit - Wait on a folio flag bit in TASK_UNINTERRUPTIBLE state.
* @folio: The folio to wait on.
* @bit_nr: The flag bit to wait for.
*
* Calls folio_wait_bit_common() with the SHARED behavior and discards its
* return value.
*/
void folio_wait_bit(struct folio *folio, int bit_nr)
{
folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

/*
* folio_wait_bit_killable - Wait on a folio flag bit in TASK_KILLABLE state.
* @folio: The folio to wait on.
* @bit_nr: The flag bit to wait for.
*
* Calls folio_wait_bit_common() with the SHARED behavior.
*
* Return: the result of folio_wait_bit_common(): 0, or -EINTR if the wait
* ended without a wakeup.
*/
int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
* folio_put_wait_locked - Drop a reference and wait for it to be unlocked
* @folio: The folio to wait for.
* @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
*
* The caller should hold a reference on @folio.  They expect the page to
* become unlocked relatively soon, but do not wish to hold up migration
* (for example) by holding the reference while waiting for the folio to
* come unlocked.  After this function returns, the caller should not
* dereference @folio.
*
* Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
*/
static int folio_put_wait_locked(struct folio *folio, int state)
{
/* DROP: folio_wait_bit_common() puts the caller's reference before sleeping. */
return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
* folio_unlock - Unlock a locked folio.
* @folio: The folio.
*
* Unlocks the folio and wakes up any thread sleeping on the page lock.
*
* Context: May be called from interrupt or process context.  May not be
* called from NMI context.
*/
void folio_unlock(struct folio *folio)
{
	/* PG_waiters in bit 7 lets x86 test it as the sign bit of the byte. */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);

	/* Unlocking a folio that is not locked is a caller bug. */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	/* Flip PG_locked off; nothing more to do when nobody is waiting. */
	if (!folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		return;

	folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

/**
* folio_end_read - End read on a folio.
* @folio: The folio.
* @success: True if all reads completed successfully.
*
* When all reads against a folio have completed, filesystems should
* call this function to let the pagecache know that no more reads
* are outstanding.  This will unlock the folio and wake up any thread
* sleeping on the lock.  The folio will also be marked uptodate if all
* reads succeeded.
*
* Context: May be called from interrupt or process context.  May not be
* called from NMI context.
*/
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long toggle;

	/* PG_uptodate has to live in the bottom byte for x86 to work. */
	BUILD_BUG_ON(PG_uptodate > 7);

	/* The folio must be locked, and must not already be uptodate on success. */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);

	/* Always flip PG_locked; flip PG_uptodate too when the read succeeded. */
	toggle = 1 << PG_locked;
	if (likely(success))
		toggle |= 1 << PG_uptodate;

	if (!folio_xor_flags_has_waiters(folio, toggle))
		return;

	folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
* folio_end_private_2 - Clear PG_private_2 and wake any waiters.
* @folio: The folio.
*
* Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
* it.  The folio reference held for PG_private_2 being set is released.
*
* This is, for example, used when a netfs folio is being written to a local
* disk cache, thereby allowing writes to the cache for the same folio to be
* serialised.
*/
void folio_end_private_2(struct folio *folio)
{
/* Calling this on a folio without PG_private_2 set is a caller bug. */
VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
/* Wake sleepers first; the reference held for PG_private_2 is dropped last. */
folio_wake_bit(folio, PG_private_2);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
* folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
* @folio: The folio to wait on.
*
* Wait for PG_private_2 to be cleared on a folio.
*/
void folio_wait_private_2(struct folio *folio)
{
	/* Re-test after every wakeup; stop once the bit is seen clear. */
	for (;;) {
		if (!folio_test_private_2(folio))
			break;
		folio_wait_bit(folio, PG_private_2);
	}
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
* folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
* @folio: The folio to wait on.
*
* Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
* received by the calling task.
*
* Return:
* - 0 if successful.
* - -EINTR if a fatal signal was encountered.
*/
int folio_wait_private_2_killable(struct folio *folio)
{
	int err = 0;

	/*
	 * Keep waiting while the bit is set; give up as soon as a wait
	 * reports an error and hand that error back to the caller.
	 */
	for (;;) {
		if (!folio_test_private_2(folio))
			break;

		err = folio_wait_bit_killable(folio, PG_private_2);
		if (err < 0)
			break;
	}

	return err;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

static void filemap_end_dropbehind(struct folio *folio)
{
	/* The mapping is sampled up front, before any flag is tested. */
	struct address_space *owner = folio->mapping;

	/* Callers must hold the folio lock. */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	/* Folios still under writeback or dirty are left alone. */
	if (folio_test_writeback(folio))
		return;
	if (folio_test_dirty(folio))
		return;

	/*
	 * Only a folio that actually carried the dropbehind flag (now
	 * cleared) and still has a mapping gets unmapped and invalidated.
	 */
	if (folio_test_clear_dropbehind(folio) && owner)
		folio_unmap_invalidate(owner, folio, 0);
}

/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and later completions should invalidate.
*/
static void filemap_end_dropbehind_write(struct folio *folio)
{
	/* Nothing to do for folios that are not marked dropbehind. */
	if (!folio_test_dropbehind(folio))
		return;

	/*
	 * RWF_DONTCACHE writeback itself should not end up here outside of
	 * task context, but ordinary writeback may stumble over dirty folios
	 * left behind by an uncached write and complete them from a context
	 * that needs no non-IRQ handling.  In that situation the
	 * invalidation is simply skipped.
	 */
	if (!in_task())
		return;

	/* Best effort only: skip the folio if its lock is contended. */
	if (!folio_trylock(folio))
		return;

	filemap_end_dropbehind(folio);
	folio_unlock(folio);
}

/**
* folio_end_writeback - End writeback against a folio.
* @folio: The folio.
*
* The folio must actually be under writeback.
*
* Context: May be called from process or interrupt context.
*/
void folio_end_writeback(struct folio *folio)
{
/* Ending writeback on a folio that is not under writeback is a caller bug. */
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

/*
* folio_test_clear_reclaim() could be used here but it is an
* atomic operation and overkill in this particular case. Failing
* to shuffle a folio marked for immediate reclaim is too mild
* a gain to justify taking an atomic operation penalty at the
* end of every folio writeback.
*/
if (folio_test_reclaim(folio)) {
  folio_clear_reclaim(folio);
  folio_rotate_reclaimable(folio);
}

/*
* Writeback does not hold a folio reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
* But here we must make sure that the folio is not freed and
* reused before the folio_wake_bit().
*/
folio_get(folio);
/* A true return from __folio_end_writeback() triggers the wakeup of PG_writeback waiters. */
if (__folio_end_writeback(folio))
  folio_wake_bit(folio, PG_writeback);

/* Still covered by the temporary reference taken above. */
filemap_end_dropbehind_write(folio);
acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
* __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
* @folio: The folio to lock
*/
void __folio_lock(struct folio *folio)
{
/*
* EXCLUSIVE wait on PG_locked: on success the bit has been set for us.
* The return value is ignored.
* NOTE(review): presumably this cannot fail in TASK_UNINTERRUPTIBLE - confirm.
*/
folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
    EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

/*
* __folio_lock_killable - Get a lock on the folio, sleeping in TASK_KILLABLE.
* @folio: The folio to lock
*
* Return: the result of the EXCLUSIVE folio_wait_bit_common() wait: 0 if
* PG_locked was taken, -EINTR otherwise.
*/
int __folio_lock_killable(struct folio *folio)
{
return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
     EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

/*
* Try to lock @folio without sleeping, leaving the caller-provided @wait
* entry queued on the folio's wait queue if the lock could not be taken.
*
* Returns 0 if the folio was locked (the entry is removed again), or
* -EIOCBQUEUED if the lock is held elsewhere and @wait stays queued.
* The caller is expected to have set up wait->wait; only folio and bit_nr
* are filled in here.
*/
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
struct wait_queue_head *q = folio_waitqueue(folio);
int ret;

wait->folio = folio;
wait->bit_nr = PG_locked;

spin_lock_irq(&q->lock);
/* Queue first, then try the lock, all under the queue lock. */
__add_wait_queue_entry_tail(q, &wait->wait);
folio_set_waiters(folio);
ret = !folio_trylock(folio);
/*
* If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
* safe to remove and return success, we know the callback
* isn't going to trigger.
*/
if (!ret)
  __remove_wait_queue(q, &wait->wait);
else
  ret = -EIOCBQUEUED;
spin_unlock_irq(&q->lock);
return ret;
}

/*
* Return values:
* 0 - folio is locked.
* non-zero - folio is not locked.
*     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
*     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
*     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
*
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
* with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
*/
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	/* Snapshot the fault flags once, before any lock is released. */
	const unsigned int fault_flags = vmf->flags;
	const bool killable = fault_flags & FAULT_FLAG_KILLABLE;

	if (!fault_flag_allow_retry_first(fault_flags)) {
		/* No retry permitted: we have to end up with the folio locked. */
		if (!killable) {
			__folio_lock(folio);
			return 0;
		}

		/* A failed killable lock releases the fault lock and asks for a retry. */
		if (__folio_lock_killable(folio)) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
		return 0;
	}

	/*
	 * CAUTION! With FAULT_FLAG_RETRY_NOWAIT the mmap_lock/per-VMA lock
	 * is NOT released although VM_FAULT_RETRY is returned.
	 */
	if (fault_flags & FAULT_FLAG_RETRY_NOWAIT)
		return VM_FAULT_RETRY;

	/* Drop the fault lock, wait for the folio to be unlocked, then retry. */
	release_fault_lock(vmf);
	if (killable)
		folio_wait_locked_killable(folio);
	else
		folio_wait_locked(folio);

	return VM_FAULT_RETRY;
}

/**
* page_cache_next_miss() - Find the next gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
*
* Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
* gap with the lowest index.
*
* This function may be called under the rcu_read_lock.  However, this will
* not atomically search a snapshot of the cache at a single point in time.
* For example, if a gap is created at index 5, then subsequently a gap is
* created at index 10, page_cache_next_miss covering both indices may
* return 10 if called under the rcu_read_lock.
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'return - index >= max_scan' will be true).
* In the rare case of index wrap-around, 0 will be returned.
*/
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);
	unsigned long remaining;

	for (remaining = max_scan; remaining; remaining--) {
		void *entry = xas_next(&xas);

		/* An empty slot or a value entry counts as a gap. */
		if (!entry || xa_is_value(entry))
			return xas.xa_index;

		/* The index wrapped around to zero. */
		if (!xas.xa_index)
			return 0;
	}

	/* No gap within max_scan entries: report an index past the range. */
	return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
* page_cache_prev_miss() - Find the previous gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
*
* Search the range [max(index - max_scan + 1, 0), index] for the
* gap with the highest index.
*
* This function may be called under the rcu_read_lock.  However, this will
* not atomically search a snapshot of the cache at a single point in time.
* For example, if a gap is created at index 10, then subsequently a gap is
* created at index 5, page_cache_prev_miss() covering both indices may
* return 5 if called under the rcu_read_lock.
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'index - return >= max_scan' will be true).
* In the rare case of wrap-around, ULONG_MAX will be returned.
*/
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);
	unsigned long remaining;

	for (remaining = max_scan; remaining; remaining--) {
		void *entry = xas_prev(&xas);

		/* An empty slot or a value entry counts as a gap. */
		if (!entry || xa_is_value(entry))
			return xas.xa_index;

		/* The index wrapped around below zero. */
		if (xas.xa_index == ULONG_MAX)
			return xas.xa_index;
	}

	/* Scan budget exhausted: report wherever the cursor ended up. */
	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
* Lockless page cache protocol:
* On the lookup side:
* 1. Load the folio from i_pages
* 2. Increment the refcount if it's not zero
* 3. If the folio is not found by xas_reload(), put the refcount and retry
*
* On the removal side:
* A. Freeze the page (by zeroing the refcount if nobody else has a reference)
* B. Remove the page from i_pages
* C. Return the page to the page allocator
*
* This means that any page may have its reference count temporarily
* increased by a speculative page cache (or GUP-fast) lookup as it can
* be allocated by another user before the RCU grace period expires.
* Because the refcount temporarily acquired here may end up being the
* last refcount on the page, any page allocation must be freeable by
* folio_put().
*/

/*
* filemap_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
* Looks up the page cache entry at @mapping & @index.  If it is a folio,
* it is returned with an increased refcount.  If it is a shadow entry
* of a previously evicted folio, or a swap entry from shmem/tmpfs,
* it is returned without further action.
*
* Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
	for (;;) {
		/* Every attempt starts from a freshly reset cursor. */
		xas_reset(&xas);
		folio = xas_load(&xas);
		if (xas_retry(&xas, folio))
			continue;

		/*
		 * Nothing there, or a value entry (shadow entry of a recently
		 * evicted page, or a swap entry from shmem/tmpfs): hand it
		 * back as is, without touching any refcount.
		 */
		if (!folio || xa_is_value(folio))
			break;

		/* Could not take a reference: look the entry up again. */
		if (!folio_try_get(folio))
			continue;

		/* The reference only counts if the slot still holds this folio. */
		if (likely(folio == xas_reload(&xas)))
			break;

		folio_put(folio);
	}
	rcu_read_unlock();

	return folio;
}

/**
* __filemap_get_folio - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
* @fgp_flags: %FGP flags modify how the folio is returned.
* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
*
* Looks up the page cache entry at @mapping & @index.
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If this function returns a folio, it is returned with an increased refcount.
*
* Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
  fgf_t fgp_flags, gfp_t gfp)
{
struct folio *folio;

repeat:
folio = filemap_get_entry(mapping, index);
/* Value entries (shadow/swap) are treated like "nothing found" here. */
if (xa_is_value(folio))
  folio = NULL;
if (!folio)
  goto no_page;

if (fgp_flags & FGP_LOCK) {
  if (fgp_flags & FGP_NOWAIT) {
   /* Must not sleep: give the reference back and report -EAGAIN. */
   if (!folio_trylock(folio)) {
    folio_put(folio);
    return ERR_PTR(-EAGAIN);
   }
  } else {
   folio_lock(folio);
  }

  /* Has the page been truncated? */
  if (unlikely(folio->mapping != mapping)) {
   folio_unlock(folio);
   folio_put(folio);
   goto repeat;
  }
  VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
}

if (fgp_flags & FGP_ACCESSED)
  folio_mark_accessed(folio);
else if (fgp_flags & FGP_WRITE) {
  /* Clear idle flag for buffer write */
  if (folio_test_idle(folio))
   folio_clear_idle(folio);
}

if (fgp_flags & FGP_STABLE)
  folio_wait_stable(folio);
no_page:
/* Nothing in the cache: allocate and insert a folio if asked to. */
if (!folio && (fgp_flags & FGP_CREAT)) {
  unsigned int min_order = mapping_min_folio_order(mapping);
  unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
  int err;
  index = mapping_align_index(mapping, index);

  /* Adjust the allocation flags according to the FGP flags. */
  if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
   gfp |= __GFP_WRITE;
  if (fgp_flags & FGP_NOFS)
   gfp &= ~__GFP_FS;
  if (fgp_flags & FGP_NOWAIT) {
   gfp &= ~GFP_KERNEL;
   gfp |= GFP_NOWAIT | __GFP_NOWARN;
  }
  /* FGP_CREAT without FGP_LOCK or FGP_FOR_MMAP warns once and falls back to FGP_LOCK. */
  if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
   fgp_flags |= FGP_LOCK;

  if (order > mapping_max_folio_order(mapping))
   order = mapping_max_folio_order(mapping);
  /* If we're not aligned, allocate a smaller folio */
  if (index & ((1UL << order) - 1))
   order = __ffs(index);

  /*
* Try the chosen order first and fall back to smaller orders, down
* to min_order. Attempts above min_order carry __GFP_NORETRY and
* __GFP_NOWARN.
*/
  do {
   gfp_t alloc_gfp = gfp;

   err = -ENOMEM;
   if (order > min_order)
    alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
   folio = filemap_alloc_folio(alloc_gfp, order);
   /* "continue" goes to the loop condition, i.e. the next smaller order. */
   if (!folio)
    continue;

   /* Init accessed so avoid atomic mark_page_accessed later */
   if (fgp_flags & FGP_ACCESSED)
    __folio_set_referenced(folio);
   if (fgp_flags & FGP_DONTCACHE)
    __folio_set_dropbehind(folio);

   err = filemap_add_folio(mapping, folio, index, gfp);
   if (!err)
    break;
   folio_put(folio);
   folio = NULL;
  } while (order-- > min_order);

  /* An entry appeared at this index in the meantime: look it up again. */
  if (err == -EEXIST)
   goto repeat;
  if (err) {
   /*
* When NOWAIT I/O fails to allocate folios this could
* be due to a nonblocking memory allocation and not
* because the system actually is out of memory.
* Return -EAGAIN so that the caller retries in a
* blocking fashion instead of propagating -ENOMEM
* to the application.
*/
   if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
    err = -EAGAIN;
   return ERR_PTR(err);
  }
  /*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
*/
  if (folio && (fgp_flags & FGP_FOR_MMAP))
   folio_unlock(folio);
}

/* Plain lookup (no FGP_CREAT) that found nothing. */
if (!folio)
  return ERR_PTR(-ENOENT);
/* not an uncached lookup, clear uncached if set */
if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
  folio_clear_dropbehind(folio);
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);

static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
		xa_mark_t mark)
{
	struct folio *folio;

	for (;;) {
		/* XA_PRESENT means "any present entry"; otherwise search by mark. */
		if (mark == XA_PRESENT)
			folio = xas_find(xas, max);
		else
			folio = xas_find_marked(xas, max, mark);

		if (xas_retry(xas, folio))
			continue;

		/*
		 * End of the range, or a value entry (shadow entry of a
		 * recently evicted page, swap entry from shmem/tmpfs, or a
		 * DAX entry): returned without raising any page count.
		 */
		if (!folio || xa_is_value(folio))
			return folio;

		if (folio_try_get(folio)) {
			/* Keep the reference only if the slot is unchanged. */
			if (likely(folio == xas_reload(xas)))
				return folio;
			folio_put(folio);
		}

		/* Lost a race with removal: reset the cursor and search again. */
		xas_reset(xas);
	}
}

/**
* find_get_entries - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page cache index
* @end: The final page index (inclusive).
* @fbatch: Where the resulting entries are placed.
* @indices: The cache indices corresponding to the entries in @fbatch
*
* find_get_entries() will search for and return a batch of entries in
* the mapping.  The entries are placed in @fbatch.  find_get_entries()
* takes a reference on any actual folios it returns.
*
* The entries have ascending indexes.  The indices may not be consecutive
* due to not-present entries or large folios.
*
* Any shadow entries of evicted folios, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
* Return: The number of entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	/*
	 * Collect present entries (folios and value entries alike) from
	 * *start up to @end.  indices[] is filled in step with the batch;
	 * the loop also stops as soon as folio_batch_add() returns 0.
	 */
	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
	}

	/*
	 * Move *start past the last entry in the batch so that the caller
	 * can continue the walk from there.
	 */
	if (folio_batch_count(fbatch)) {
		unsigned long nr;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
		if (!xa_is_value(folio)) {
			nr = folio_nr_pages(folio);
		} else {
			/*
			 * Shift an unsigned long: a plain "1 << order" is
			 * evaluated as int and overflows for large orders
			 * before it is widened into nr.
			 */
			nr = 1UL << xa_get_order(&mapping->i_pages, indices[idx]);
		}
		*start = round_down(indices[idx] + nr, nr);
	}
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}

/**
* find_lock_entries - Find a batch of pagecache entries.
* @mapping: The address_space to search.
* @start: The starting page cache index.
* @end: The final page index (inclusive).
* @fbatch: Where the resulting entries are placed.
* @indices: The cache indices of the entries in @fbatch.
*
* find_lock_entries() will return a batch of entries from @mapping.
* Swap, shadow and DAX entries are included.  Folios are returned
* locked and with an incremented refcount.  Folios which are locked
* by somebody else or under writeback are skipped.  Folios which are
* partially outside the range are not returned.
*
* The entries have ascending indexes.  The indices may not be consecutive
* due to not-present entries, large folios, folios which could not be
* locked or folios under writeback.
*
* Return: The number of entries which were found.
*/
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
		unsigned long base;
		unsigned long nr;

		if (!xa_is_value(folio)) {
			/*
			 * A real folio: find_get_entry() took a reference, so
			 * every rejection below has to drop it again (put),
			 * and additionally unlock once the lock was taken.
			 */
			nr = folio_nr_pages(folio);
			base = folio->index;
			/* Omit large folio which begins before the start */
			if (base < *start)
				goto put;
			/* Omit large folio which extends beyond the end */
			if (base + nr - 1 > end)
				goto put;
			/* Never sleep here: skip folios locked by somebody else. */
			if (!folio_trylock(folio))
				goto put;
			/* Skip folios that left the mapping or are under writeback. */
			if (folio->mapping != mapping ||
			    folio_test_writeback(folio))
				goto unlock;
			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
					folio);
		} else {
			/*
			 * A value entry; no reference is held on it.  Shift an
			 * unsigned long: a plain "1 << order" is evaluated as
			 * int and overflows for large orders before it is
			 * widened into nr.
			 */
			nr = 1UL << xas_get_order(&xas);
			base = xas.xa_index & ~(nr - 1);
			/* Omit order>0 value which begins before the start */
			if (base < *start)
				continue;
			/* Omit order>0 value which extends beyond the end */
			if (base + nr - 1 > end)
				break;
		}

		/* Update start now so that last update is correct on return */
		*start = base + nr;
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
		continue;
unlock:
		folio_unlock(folio);
put:
		folio_put(folio);
	}
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}

/**
* filemap_get_folios - Get a batch of folios
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
* @fbatch: The batch to fill.
*
* Search for and return a batch of folios in the mapping starting at
* index @start and up to index @end (inclusive).  The folios are returned
* in @fbatch with an elevated reference count.
*
* Return: The number of folios which were found.
* We also update @start to index the next folio for the traversal.
*/
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
  pgoff_t end, struct folio_batch *fbatch)
{
/*
* Thin wrapper: delegate to the tagged lookup with XA_PRESENT as the tag.
* NOTE(review): presumably XA_PRESENT selects every present folio - confirm
* against filemap_get_folios_tag().
*/
return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
}
EXPORT_SYMBOL(filemap_get_folios);

/**
* filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
* @fbatch: The batch to fill
*
* filemap_get_folios_contig() works exactly like filemap_get_folios(),
* except the returned folios are guaranteed to be contiguous. This may
* not return all contiguous folios if the batch gets filled up.
*
* Return: The number of folios found.
* Also update @start to be positioned for traversal of the next folio.
*/

--> --------------------

--> maximum size reached

--> --------------------

quality87%

¤ Dauer der Verarbeitung: 0.72 Sekunden (vorverarbeitet) ¤

Normalansicht

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung ist noch experimentell.