/* * If we didn't get into open_ctree our allocated_ebs will not be * initialized, so just skip this.
*/ if (!fs_info->allocated_ebs.next) return;
WARN_ON(!list_empty(&fs_info->allocated_ebs));
spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list);
btrfs_err(fs_info, "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
} #else #define btrfs_leak_debug_add_eb(eb) do {} while (0) #define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif
/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	/* The bio currently being assembled; NULL when none is in flight. */
	struct btrfs_bio *bbio;
	/* Last byte contained in bbio + 1 . */
	loff_t next_file_offset;
	enum btrfs_compression_type compress_type;
	/* How many bytes are left before the stripe/ordered extent boundary. */
	u32 len_to_oe_boundary;
	/* Block layer operation flags used for newly allocated bios. */
	blk_opf_t opf;
	btrfs_bio_end_io_t end_io_func;
	/* Non-NULL only for the writeback path. */
	struct writeback_control *wbc;

	/*
	 * The sectors of the page which are going to be submitted by
	 * extent_writepage_io().
	 * This is to avoid touching ranges covered by compression/inline.
	 */
	unsigned long submit_bitmap;
	struct readahead_control *ractl;

	/*
	 * The start offset of the last used extent map by a read operation.
	 *
	 * This is for proper compressed read merge.
	 * U64_MAX means we are starting the read and have made no progress yet.
	 *
	 * The current btrfs_bio_is_contig() only uses disk_bytenr as
	 * the condition to check if the read can be merged with previous
	 * bio, which is not correct. E.g. two file extents pointing to the
	 * same extent but with different offset.
	 *
	 * So here we need to do extra checks to only merge reads that are
	 * covered by the same extent map.
	 * Just extent_map::start will be enough, as they are unique
	 * inside the same inode.
	 */
	u64 last_em_start;
};
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
}
/*
 * Submit or fail the current bio in the bio_ctrl structure.
 *
 * @ret: 0 to submit the pending bio normally, a negative errno to fail it.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
	struct btrfs_bio *bbio = bio_ctrl->bbio;

	/* Nothing pending, nothing to do. */
	if (!bbio)
		return;

	if (!ret) {
		submit_one_bio(bio_ctrl);
		return;
	}

	ASSERT(ret < 0);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
	/* The bio is owned by the end_io handler now */
	bio_ctrl->bbio = NULL;
}
/*
 * Create the slab cache backing all struct extent_buffer allocations.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init extent_buffer_init_cachep(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
						sizeof(struct extent_buffer),
						0, 0, NULL);

	return extent_buffer_cache ? 0 : -ENOMEM;
}
/* Tear down the extent buffer slab cache. */
void __cold extent_buffer_free_cachep(void)
{
	/*
	 * Make sure all delayed RCU frees have been flushed before we
	 * destroy the cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
}
/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes.
 *
 * @start:	The original start bytenr to search.
 *		Will store the extent range start bytenr.
 * @end:	The original end bytenr of the search range
 *		Will store the extent range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct folio *locked_folio,
u64 *start, u64 *end)
{ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; /* The sanity tests may not set a valid fs_info. */
u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end; bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0;
/* Caller should pass a valid @end to indicate the search range end */
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
ASSERT(!(orig_start >= folio_end(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again: /* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
/*
 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
 * return early without handling any dirty ranges.
 */
ASSERT(max_bytes >= fs_info->sectorsize);
/*
 * NOTE(review): the btrfs_find_delalloc_range() call and its "not found"
 * check appear to be missing here -- the early-return block below dangles.
 * Also note the mangled token "returnfalse" below. TODO: restore from the
 * full source file.
 */
/* @delalloc_end can be -1, never go beyond @orig_end */
*end = min(delalloc_end, orig_end);
btrfs_free_extent_state(cached_state); returnfalse;
}
/*
 * start comes from the offset of locked_folio. We have to lock
 * folios in order, so we can't process delalloc bytes before
 * locked_folio
 */
if (delalloc_start < *start)
delalloc_start = *start;
/*
 * make sure to limit the number of folios we try to lock down
 */
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
/* step two, lock all the folios after the folio that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /*
 * Some of the folios are gone, lets avoid looping by
 * shortening the size of the delalloc range we're searching.
 */
btrfs_free_extent_state(cached_state);
cached_state = NULL; if (!loops) {
max_bytes = fs_info->sectorsize;
loops = 1; goto again;
} else {
found = false; goto out_failed;
}
}
/* step three, lock the state bits for the whole range */
btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
/*
 * NOTE(review): the tail of this function (handling of a failed re-check,
 * storing *start/*end, the out_failed label and the final return) is not
 * visible in this chunk -- confirm against the full file.
 */
if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio); else
btrfs_folio_end_lock(fs_info, folio, start, len);
}
/* * After a write IO is done, we need to: * * - clear the uptodate bits on error * - clear the writeback bits in the extent tree for the range * - filio_end_writeback() if there is no more pending io for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO.
*/ staticvoid end_bbio_data_write(struct btrfs_bio *bbio)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize;
/* * After a data read IO is done, we need to: * * - clear the uptodate bits on error * - set the uptodate bits if things worked * - set the folio up to date if all extents in the tree are uptodate * - clear the lock bit in the extent tree * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO.
*/ staticvoid end_bbio_data_read(struct btrfs_bio *bbio)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi;
if (likely(uptodate)) {
u64 end = start + fi.length - 1;
loff_t i_size = i_size_read(inode);
/* * Zero out the remaining part if this range straddles * i_size. * * Here we should only zero the range inside the folio, * not touch anything else. * * NOTE: i_size is exclusive while end is inclusive and * folio_contains() takes PAGE_SIZE units.
*/ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
zero_start;
/* Update page status and unlock. */
end_folio_read(folio, uptodate, start, fi.length);
}
bio_put(bio);
}
/* * Populate every free slot in a provided array with folios using GFP_NOFS. * * @nr_folios: number of folios to allocate * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * * Return: 0 if all folios were able to be allocated; * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed
*/ int btrfs_alloc_folio_array(unsignedint nr_folios, struct folio **folio_array)
{ for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue;
folio_array[i] = folio_alloc(GFP_NOFS, 0); if (!folio_array[i]) goto error;
} return 0;
error: for (int i = 0; i < nr_folios; i++) { if (folio_array[i])
folio_put(folio_array[i]);
} return -ENOMEM;
}
/*
 * Populate every free slot in a provided array with pages, using GFP_NOFS.
 *
 * @nr_pages:   number of pages to allocate
 * @page_array: the array to fill with pages; any existing non-null entries in
 *              the array will be skipped
 * @nofail:     whether using __GFP_NOFAIL flag
 *
 * Return: 0 if all pages were able to be allocated;
 *         -ENOMEM otherwise, the partially allocated pages would be freed and
 *         the array slots zeroed
 */
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, bool nofail)
{
	const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
	unsigned int filled = 0;

	while (filled < nr_pages) {
		const unsigned int before = filled;

		filled = alloc_pages_bulk(gfp, nr_pages, page_array);
		/* No forward progress means allocation failed: clean up. */
		if (unlikely(filled == before)) {
			for (unsigned int i = 0; i < filled; i++) {
				__free_page(page_array[i]);
				page_array[i] = NULL;
			}
			return -ENOMEM;
		}
	}
	return 0;
}
/*
 * Populate needed folios for the extent buffer.
 *
 * For now, the folios populated are always in order 0 (aka, single page).
 */
static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
{
	struct page *pages[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
	const int nr_pages = num_extent_pages(eb);
	int ret = btrfs_alloc_page_array(nr_pages, pages, nofail);

	if (ret < 0)
		return ret;

	/* Each backing page becomes one order-0 folio of the eb. */
	for (int i = 0; i < nr_pages; i++)
		eb->folios[i] = page_folio(pages[i]);
	eb->folio_size = PAGE_SIZE;
	eb->folio_shift = PAGE_SHIFT;
	return 0;
}
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* * For compression, all IO should have its logical bytenr set * to the starting bytenr of the compressed extent.
*/ return bio->bi_iter.bi_sector == sector;
}
/* * To merge into a bio both the disk sector and the logical offset in * the file need to be contiguous.
*/ return bio_ctrl->next_file_offset == file_offset &&
bio_end_sector(bio) == sector;
}
/* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have * to always be set on the last added/replaced device. * This is a bit odd but has been like that for a long time.
*/
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
}
}
/*
 * Add a range of a folio to the bio being assembled.
 *
 * @bio_ctrl:    bio build context (holds the bio being assembled)
 * @disk_bytenr: logical bytenr where the write will be
 * @folio:       folio to add to the bio
 * @size:        portion of the folio that we want to submit
 * @pg_offset:   offset inside the folio where the range starts
 *
 * This will either add the range into the existing @bio_ctrl->bbio, or
 * allocate a new one in @bio_ctrl->bbio.
 * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
				u64 disk_bytenr, struct folio *folio,
				size_t size, unsigned long pg_offset)
{
	struct btrfs_inode *inode = folio_to_inode(folio);
	loff_t file_offset = folio_pos(folio) + pg_offset;

	/* A non-contiguous range must go into a fresh bio. */
	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
		submit_one_bio(bio_ctrl);

	do {
		u32 len = size;

		/* Allocate new bio if needed */
		if (!bio_ctrl->bbio)
			alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);

		/* Cap to the current ordered extent boundary if there is one. */
		if (len > bio_ctrl->len_to_oe_boundary) {
			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
			ASSERT(is_data_inode(inode));
			len = bio_ctrl->len_to_oe_boundary;
		}

		if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
			/* bio full: move on to a new one */
			submit_one_bio(bio_ctrl);
			continue;
		}
		bio_ctrl->next_file_offset += len;

		if (bio_ctrl->wbc)
			wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);

		/*
		 * FIX: advance all the cursors by the amount just added.
		 * Without this the do-while below can never terminate
		 * (@size would stay unchanged) and the same range would be
		 * added to the bio over and over again.
		 */
		size -= len;
		pg_offset += len;
		disk_bytenr += len;
		file_offset += len;

		/*
		 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
		 * sector aligned. alloc_new_bio() then sets it to the end of
		 * our ordered extent for writes into zoned devices.
		 *
		 * When len_to_oe_boundary is tracking an ordered extent, we
		 * trust the ordered extent code to align things properly, and
		 * the check above to cap our write to the ordered extent
		 * boundary is correct.
		 *
		 * When len_to_oe_boundary is U32_MAX, the cap above would
		 * result in a 4095 byte IO for the last folio right before
		 * we hit the bio limit of UINT_MAX. bio_add_folio() has all
		 * the checks required to make sure we don't overflow the bio,
		 * and we should just ignore len_to_oe_boundary completely
		 * unless we're using it to track an ordered extent.
		 *
		 * It's pretty hard to make a bio sized U32_MAX, but it can
		 * happen when the page cache is able to feed us contiguous
		 * folios for large extents.
		 */
		if (bio_ctrl->len_to_oe_boundary != U32_MAX)
			bio_ctrl->len_to_oe_boundary -= len;

		/* Ordered extent boundary: move on to a new bio. */
		if (bio_ctrl->len_to_oe_boundary == 0)
			submit_one_bio(bio_ctrl);
	} while (size);
}
/*
 * Attach @eb (or its preallocated subpage state) as the private data of
 * @folio so the folio can be tracked as holding extent buffer metadata.
 *
 * Returns 0 on success, or a negative errno if the subpage state allocation
 * fails.
 */
static int attach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio,
				      struct btrfs_folio_state *prealloc)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	/*
	 * If the folio is mapped to the btree inode, we should hold the
	 * private lock to prevent races.
	 * For cloned or dummy extent buffers, their folios are not mapped and
	 * will not race with any other ebs.
	 */
	if (folio->mapping)
		lockdep_assert_held(&folio->mapping->i_private_lock);

	if (!btrfs_meta_is_subpage(fs_info)) {
		if (folio_test_private(folio))
			WARN_ON(folio_get_private(folio) != eb);
		else
			folio_attach_private(folio, eb);
		return 0;
	}

	/* Already mapped, just free prealloc */
	if (folio_test_private(folio)) {
		btrfs_free_folio_state(prealloc);
		return 0;
	}

	/* Has preallocated memory for subpage */
	if (prealloc) {
		folio_attach_private(folio, prealloc);
		return 0;
	}

	/* Do new allocation to attach subpage */
	return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
}
/*
 * Ensure @folio has btrfs private state attached so per-block status can be
 * tracked. The folio must already be inserted into an address space.
 */
int set_folio_extent_mapped(struct folio *folio)
{ struct btrfs_fs_info *fs_info;
/* Folio must belong to a mapping before we can attach state to it. */
ASSERT(folio->mapping);
/* Private state already attached, nothing to do. */
if (folio_test_private(folio)) return 0;
fs_info = folio_to_fs_info(folio);
/* Subpage case: attach a btrfs_folio_state to track per-block status. */
if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
/*
 * NOTE(review): the non-subpage tail of this function (attaching the plain
 * private marker and the final return) is not visible in this chunk -- as
 * shown, control falls off the end. Confirm against the full file.
 */
if (zero_offset)
folio_zero_range(folio, zero_offset,
folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio); for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; unsignedlong pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) {
folio_zero_range(folio, pg_offset, end - cur + 1);
end_folio_read(folio, true, cur, end - cur + 1); break;
} if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
end_folio_read(folio, true, cur, blocksize); continue;
}
em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em);
}
extent_offset = cur - em->start;
BUG_ON(btrfs_extent_map_end(em) <= cur);
BUG_ON(end < cur);
compress_type = btrfs_extent_map_compression(em);
/* * Only expand readahead for extents which are already creating * the pages anyway in add_ra_bio_pages, which is compressed * extents in the non subpage case.
*/ if (bio_ctrl->ractl &&
!btrfs_is_subpage(fs_info, folio) &&
compress_type != BTRFS_COMPRESS_NONE)
btrfs_readahead_expand(bio_ctrl->ractl, em);
/* * If we have a file range that points to a compressed extent * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a * single bio to populate the folios for the 2 ranges because * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout * [0 - 8K] [8K - 24K] * | | * | | * points to extent X, points to extent X, * offset 4K, length of 8K offset 0, length 16K * * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the * uncompressed data of our extent and different lengths. This * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent).
*/ if (compress_type != BTRFS_COMPRESS_NONE &&
bio_ctrl->last_em_start != U64_MAX &&
bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
bio_ctrl->last_em_start = em->start;
btrfs_free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) {
folio_zero_range(folio, pg_offset, blocksize);
end_folio_read(folio, true, cur, blocksize); continue;
} /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) {
end_folio_read(folio, true, cur, blocksize); continue;
}
if (bio_ctrl->compress_type != compress_type) {
submit_one_bio(bio_ctrl);
bio_ctrl->compress_type = compress_type;
}
/* * Check if we can skip waiting the @ordered extent covering the block at @fileoff. * * @fileoff: Both input and output. * Input as the file offset where the check should start at. * Output as where the next check should start at, * if the function returns true. * * Return true if we can skip to @fileoff. The caller needs to check the new * @fileoff value to make sure it covers the full range, before skipping the * full OE. * * Return false if we must wait for the ordered extent.
*/ staticbool can_skip_one_ordered_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered,
u64 *fileoff)
{ conststruct btrfs_fs_info *fs_info = inode->root->fs_info; struct folio *folio; const u32 blocksize = fs_info->sectorsize;
u64 cur = *fileoff; bool ret;
folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
/* * We should have locked the folio(s) for range [start, end], thus * there must be a folio and it must be locked.
*/
ASSERT(!IS_ERR(folio));
ASSERT(folio_test_locked(folio));
/* * There are several cases for the folio and OE combination: * * 1) Folio has no private flag * The OE has all its IO done but not yet finished, and folio got * invalidated. * * Have we have to wait for the OE to finish, as it may contain the * to-be-inserted data checksum. * Without the data checksum inserted into the csum tree, read will * just fail with missing csum.
*/ if (!folio_test_private(folio)) {
ret = false; goto out;
}
/* * 2) The first block is DIRTY. * * This means the OE is created by some other folios whose file pos is * before this one. And since we are holding the folio lock, the writeback * of this folio cannot start. * * We must skip the whole OE, because it will never start until we * finished our folio read and unlocked the folio.
*/ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
u64 range_len = min(folio_end(folio),
ordered->file_offset + ordered->num_bytes) - cur;
ret = true; /* * At least inside the folio, all the remaining blocks should * also be dirty.
*/
ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
*fileoff = ordered->file_offset + ordered->num_bytes; goto out;
}
/* * 3) The first block is uptodate. * * At least the first block can be skipped, but we are still not fully * sure. E.g. if the OE has some other folios in the range that cannot * be skipped. * So we return true and update @next_ret to the OE/folio boundary.
*/ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
u64 range_len = min(folio_end(folio),
ordered->file_offset + ordered->num_bytes) - cur;
/* * The whole range to the OE end or folio boundary should also * be uptodate.
*/
ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
ret = true;
*fileoff = cur + range_len; goto out;
}
/* * 4) The first block is not uptodate. * * This means the folio is invalidated after the writeback was finished, * but by some other operations (e.g. block aligned buffered write) the * folio is inserted into filemap. * Very much the same as case 1).
*/
ret = false;
out:
folio_put(folio); return ret;
}
/*
 * Locking helper to make sure we get a stable view of extent maps for the
 * involved range.
 *
 * This is for folio read paths (read and readahead), thus the involved range
 * should have all the folios locked.
 */
staticvoid lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state)
{
u64 cur_pos;
/* Caller must provide a valid @cached_state. */
ASSERT(cached_state);
/* The range must at least be page aligned, as all read paths are folio based. */
ASSERT(IS_ALIGNED(start, PAGE_SIZE));
ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
/*
 * NOTE(review): the retry label, the btrfs_lock_extent() call, the
 * initialization of @cur_pos, and the while-loop head (including the
 * declaration of @ordered used below) appear to be missing here. Also note
 * the mangled "staticvoid" token above. TODO: restore from the full file.
 */
ordered = btrfs_lookup_ordered_range(inode, cur_pos,
end - cur_pos + 1); /*
 * No ordered extents in the range, and we hold the extent lock,
 * no one can modify the extent maps in the range, we're safe to return.
 */
if (!ordered) break;
/* Check if we can skip waiting for the whole OE. */ if (can_skip_ordered_extent(inode, ordered, start, end)) {
cur_pos = min(ordered->file_offset + ordered->num_bytes,
end + 1);
btrfs_put_ordered_extent(ordered); continue;
}
/* Now wait for the OE to finish. */
btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
btrfs_put_ordered_extent(ordered); /* We have unlocked the whole range, restart from the beginning. */ goto again;
}
}
/*
 * Do all of the delayed allocation setup.
 *
 * Return >0 if all the dirty blocks are submitted async (compression) or inlined.
 * The @folio should no longer be touched (treat it as already unlocked).
 *
 * Return 0 if there is still dirty block that needs to be submitted through
 * extent_writepage_io().
 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
 * submitted, and @folio is still kept locked.
 *
 * Return <0 if there is any error hit.
 * Any allocated ordered extent range covering this folio will be marked
 * finished (IOERR), and @folio is still kept locked.
 */
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{ struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; constbool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; constunsignedint blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsignedlong delalloc_bitmap = 0; /*
 * Save the last found delalloc end. As the delalloc end can go beyond
 * page boundary, thus we cannot rely on subpage bitmap to locate the
 * last delalloc end.
 */
u64 last_delalloc_end = 0; /*
 * The range end (exclusive) of the last successfully finished delalloc
 * range.
 * Any range covered by ordered extent must either be manually marked
 * finished (error handling), or has IO submitted (and finish the
 * ordered extent normally).
 *
 * This records the end of ordered extent cleanup if we hit an error.
 */
u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0; int ret = 0; int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, folio)) {
ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
/*
 * NOTE(review): the loop that finds and locks the delalloc ranges of this
 * folio (populating @delalloc_bitmap and @last_delalloc_end, and resetting
 * @delalloc_start) appears to be missing here. Mangled tokens such as
 * "constbool"/"unsignedlong" above also suggest extraction damage.
 * TODO: restore from the full file.
 */
/* Run the delalloc ranges for the above locked ranges. */ while (delalloc_start < page_end) {
u64 found_start;
u32 found_len; bool found;
if (!is_subpage) { /*
 * For non-subpage case, the found delalloc range must
 * cover this folio and there must be only one locked
 * delalloc range.
 */
found_start = page_start;
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len);
} if (!found) break; /*
 * The subpage range covers the last sector, the delalloc range may
 * end beyond the folio boundary, use the saved delalloc_end
 * instead.
 */
if (found_start + found_len >= page_end)
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) { /*
 * Some delalloc range may be created by previous folios.
 * Thus we still need to clean up this range during error
 * handling.
 */
last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc); if (ret >= 0)
last_finished_delalloc_end = found_start + found_len; if (unlikely(ret < 0))
btrfs_err_rl(fs_info, "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
btrfs_root_id(inode->root),
btrfs_ino(inode),
folio_pos(folio),
blocks_per_folio,
&bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else { /*
 * We've hit an error during previous delalloc range,
 * have to cleanup the remaining locked ranges.
 */
btrfs_unlock_extent(&inode->io_tree, found_start,
found_start + found_len - 1, NULL);
unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
}
/*
 * We have some ranges that's going to be submitted asynchronously
 * (compression or inline). These range have their own control
 * on when to unlock the pages. We should not touch them
 * anymore, so clear the range from the submission bitmap.
 */
if (ret > 0) { unsignedint start_bit = (found_start - page_start) >>
fs_info->sectorsize_bits; unsignedint end_bit = (min(page_end + 1, found_start + found_len) -
page_start) >> fs_info->sectorsize_bits;
bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
} /*
 * Above btrfs_run_delalloc_range() may have unlocked the folio,
 * thus for the last range, we cannot touch the folio anymore.
 */
if (found_start + found_len >= last_delalloc_end + 1) break;
delalloc_start = found_start + found_len;
} /*
 * It's possible we had some ordered extents created before we hit
 * an error, cleanup non-async successfully created delalloc ranges.
 */
if (unlikely(ret < 0)) { unsignedint bitmap_size = min(
(last_finished_delalloc_end - page_start) >>
fs_info->sectorsize_bits,
blocks_per_folio);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
page_start + (bit << fs_info->sectorsize_bits),
fs_info->sectorsize, false); return ret;
}
/* NOTE(review): no goto targeting this label is visible in this chunk. */
out: if (last_delalloc_end)
delalloc_end = last_delalloc_end; else
delalloc_end = page_end; /*
 * delalloc_end is already one less than the total length, so
 * we don't subtract one from PAGE_SIZE.
 */
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
/*
 * If all ranges are submitted asynchronously, we just need to account
 * for them here.
 */
if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write; return 1;
}
if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192;
/*
 * NOTE(review): the function is truncated here mid-statement; the rest of
 * the nr_to_write clamping and the final return are not visible in this
 * chunk.
 */
/*
 * Submit one sector (block) of the folio for writeback.
 *
 * Return 0 if we have submitted or queued the sector for submission.
 * Return <0 for critical errors, and the sector will have its dirty flag cleared.
 *
 * Caller should make sure filepos < i_size and handle filepos >= i_size case.
 */
staticint submit_one_sector(struct btrfs_inode *inode, struct folio *folio,
u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em;
u64 block_start;
u64 disk_bytenr;
u64 extent_offset;
u64 em_end; const u32 sectorsize = fs_info->sectorsize;
/* Submission is always block aligned. */
ASSERT(IS_ALIGNED(filepos, sectorsize));
/* @filepos >= i_size case should be handled by the caller. */
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) { /*
 * When submission failed, we should still clear the folio dirty.
 * Or the folio will be written back again but without any
 * ordered extent.
 */
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); return PTR_ERR(em);
}
/*
 * Although the PageDirty bit is cleared before entering this
 * function, subpage dirty bit is not cleared.
 * So clear subpage dirty bit here so next time we won't submit
 * a folio for a range already written to disk.
 */
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /*
 * Above call should set the whole folio with writeback flag, even
 * just for a single subpage sector.
 * As long as the folio is properly locked and the range is correct,
 * we should always get the folio with writeback flag.
 */
ASSERT(folio_test_writeback(folio));
/*
 * NOTE(review): the tail of this function (reading the extent map fields
 * into @block_start/@disk_bytenr/@extent_offset/@em_end, freeing the em,
 * the submit_extent_folio() call and the final return) is not visible in
 * this chunk. Also note the mangled "staticint" token above. Confirm
 * against the full file.
 */
/*
 * Helper for extent_writepage(). This calls the writepage start hooks,
 * and does the loop to map the page into extents and bios.
 *
 * We return 1 if the IO is started and the page is unlocked,
 * 0 if all went well (page still locked)
 * < 0 if there were errors (page still locked)
 */
static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct folio *folio,
u64 start, u32 len, struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; unsignedlong range_bitmap = 0; bool submitted_io = false; int found_error = 0; const u64 folio_start = folio_pos(folio); constunsignedint blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur; int bit; int ret = 0;
/*
 * NOTE(review): the setup between the declarations above and the loop body
 * below (building @range_bitmap, intersecting it with the submit bitmap,
 * setting bio_ctrl->end_io_func, and the for_each_set_bit() loop head that
 * derives @cur from @bit) appears to be missing from this chunk. Mangled
 * tokens such as "unsignedlong" also indicate extraction damage. TODO:
 * restore from the full file.
 */
if (cur >= i_size) {
btrfs_mark_ordered_io_finished(inode, folio, cur,
start + len - cur, true); /*
 * This range is beyond i_size, thus we don't need to
 * bother writing back.
 * But we still need to clear the dirty subpage bit, or
 * the next time the folio gets dirtied, we will try to
 * writeback the sectors with subpage dirty bits,
 * causing writeback without ordered extent.
 */
btrfs_folio_clear_dirty(fs_info, folio, cur,
start + len - cur); break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (unlikely(ret < 0)) { /*
 * bio_ctrl may contain a bio crossing several folios.
 * Submit it immediately so that the bio has a chance
 * to finish normally, other than marked as error.
 */
submit_one_bio(bio_ctrl); /*
 * Failed to grab the extent map which should be very rare.
 * Since there is no bio submitted to finish the ordered
 * extent, we have to manually finish this sector.
 */
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false); if (!found_error)
found_error = ret; continue;
}
submitted_io = true;
}
/*
 * If we didn't submitted any sector (>= i_size), folio dirty get
 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
 * by folio_start_writeback() if the folio is not dirty).
 *
 * Here we set writeback and clear for the range. If the full folio
 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
 *
 * If we hit any error, the corresponding sector will have its dirty
 * flag cleared and writeback finished, thus no need to handle the error case.
 */
if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
} return found_error;
}
/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 *
 * Return 0 if everything goes well.
 * Return <0 for error.
 */
static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;
	size_t pg_offset;
	loff_t i_size = i_size_read(&inode->vfs_inode);
	const pgoff_t end_index = i_size >> PAGE_SHIFT;
	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);

	/*
	 * Zero the part of the folio beyond i_size.  That range is never
	 * written back and must not keep stale data around.
	 */
	pg_offset = offset_in_folio(folio, i_size);
	if (folio_contains(folio, end_index))
		folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);

	/*
	 * Default to unlock the whole folio.
	 * The proper bitmap can only be initialized until writepage_delalloc().
	 */
	bio_ctrl->submit_bitmap = (unsigned long)-1;

	/*
	 * If the page is dirty but without private set, it's marked dirty
	 * without informing the fs.
	 * Nowadays that is a bug, since the introduction of
	 * pin_user_pages*().
	 *
	 * So here we check if the page has private set to rule out such
	 * case.
	 * But we also have a long history of relying on the COW fixup,
	 * so here we only enable this check for experimental builds until
	 * we're sure it's safe.
	 */
	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
	    unlikely(!folio_test_private(folio))) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		btrfs_err_rl(fs_info,
	"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
			     btrfs_root_id(inode->root),
			     btrfs_ino(inode), folio_pos(folio));
		ret = -EUCLEAN;
		goto done;
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0)
		goto done;

	/* Returns 1 when async submission took ownership of the folio. */
	ret = writepage_delalloc(inode, folio, bio_ctrl);
	if (ret == 1)
		return 0;
	if (ret)
		goto done;

	ret = extent_writepage_io(inode, folio, folio_pos(folio),
				  folio_size(folio), bio_ctrl, i_size);
	if (ret == 1)
		return 0;
	if (ret < 0)
		btrfs_err_rl(fs_info,
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
			     btrfs_root_id(inode->root), btrfs_ino(inode),
			     folio_pos(folio), blocks_per_folio,
			     &bio_ctrl->submit_bitmap, ret);

	bio_ctrl->wbc->nr_to_write--;

done:
	if (ret < 0)
		mapping_set_error(folio->mapping, ret);
	/*
	 * Only unlock ranges that are submitted. As there can be some async
	 * submitted ranges inside the folio.
	 */
	btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
	ASSERT(ret <= 0);
	return ret;
}
/*
 * NOTE(review): this span appears damaged by extraction: the code below
 * fuses lock_extent_buffer_for_io() with what looks like the body of a
 * separate btree write-error helper (clearing EXTENT_BUFFER_UPTODATE and
 * setting BTRFS_FS_*_ERR flags), and the function's unlock/return tail
 * seems to be missing.  Restore from upstream before relying on it;
 * only comments were added here.
 */
/* * Lock extent buffer status and pages for writeback. * * Return %false if the extent buffer doesn't need to be submitted (e.g. the * extent buffer is not dirty) * Return %true is the extent buffer is submitted to bio.
*/ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, struct writeback_control *wbc)
{ struct btrfs_fs_info *fs_info = eb->fs_info; bool ret = false;
btrfs_tree_lock(eb); while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb); if (wbc->sync_mode != WB_SYNC_ALL) returnfalse;
wait_on_extent_buffer_writeback(eb);
btrfs_tree_lock(eb);
}
/* Past this point the eb is locked and not under writeback. */
/* * We need to do this to prevent races in people who check if the eb is * under IO since we can end up having no IO bits set for a short period * of time.
*/
spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
/*
 * NOTE(review): upstream at this point sets EXTENT_BUFFER_WRITEBACK and
 * re-tags the buffer_tree xarray entry; those statements appear to be
 * missing from this extracted copy — TODO confirm against upstream.
 */
XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsignedlong flags;
/* * A read may stumble upon this buffer later, make sure that it gets an * error and knows there was an error.
*/
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
/* * We need to set the mapping with the io error as well because a write * error will flip the file system readonly, and then syncfs() will * return a 0 because we are readonly if we don't modify the err seq for * the superblock.
*/
mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);
/* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's * committing (when we call filemap_fdata[write|wait]_range against * the btree inode), we might have * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it * returns an error or an error happens during writeback, when we're * committing the transaction we wouldn't know about it, since the pages * can be no longer dirty nor marked anymore for writeback (if a * subsequent modification to the extent buffer didn't happen before the * transaction commit), which makes filemap_fdata[write|wait]_range not * able to find the pages which contain errors at transaction * commit time. So if this happens we must abort the transaction, * otherwise we commit a super block with btree roots that point to * btree nodes/leafs whose content on disk is invalid - either garbage * or the content of some node/leaf from a past generation that got * cowed or deleted and is no longer valid. * * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would * not be enough - we need to distinguish between log tree extents vs * non-log tree extents, and the next filemap_fdatawait_range() call * will catch and clear such errors in the mapping - and that call might * be from a log sync and not from a transaction commit. Also, checking * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is * not done and would not be reliable - the eb might have been released * from memory and reading it back again means that flag would not be * set (since it's a runtime flag, not persisted on disk). 
* * Using the flags below in the btree inode also makes us achieve the * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started * writeback for all dirty pages and before filemap_fdatawait_range() * is called, the writeback for all dirty pages had already finished * with errors - because we were not using AS_EIO/AS_ENOSPC, * filemap_fdatawait_range() would return success, as it could not know * that writeback errors happened (the pages were no longer tagged for * writeback).
*/ switch (eb->log_index) { case -1:
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); break; case 0:
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); break; case 1:
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); break; default:
BUG(); /* unexpected, logic error */
}
}
/*
 * NOTE(review): upstream ends the function with btrfs_tree_unlock(eb)
 * and "return ret;" — not visible in this copy; verify before use.
 */
rcu_read_lock(); while ((eb = find_get_eb(&xas, end, tag)) != NULL) { if (!eb_batch_add(batch, eb)) {
*start = ((eb->start + eb->len) >> fs_info->nodesize_bits); goto out;
}
} if (end == ULONG_MAX)
*start = ULONG_MAX; else
*start = end + 1;
out:
rcu_read_unlock();
return batch->nr;
}
/* * The endio specific version which won't touch any unsafe spinlock in endio * context.
*/ staticstruct extent_buffer *find_extent_buffer_nolock( struct btrfs_fs_info *fs_info, u64 start)
{ struct extent_buffer *eb; unsignedlong index = (start >> fs_info->nodesize_bits);
/*
 * NOTE(review): extraneous trailing text (a German website disclaimer)
 * that does not belong in this source file.  English translation kept
 * for reference:
 * "The information on this website has been compiled carefully to the
 * best of our knowledge.  However, neither completeness, correctness,
 * nor quality of the provided information is guaranteed.
 * Remark: the colored syntax rendering and the measurement are still
 * experimental."
 */