/*
 * Context carried through a rename operation so the callee can report
 * information about the old directory entry back to the caller.
 */
struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */
u64 index;
};
/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */ struct data_reloc_warn { struct btrfs_path path; struct btrfs_fs_info *fs_info;
/* Size of the extent item being inspected — not read in the visible code. */
u64 extent_item_size;
/* Logical address and mirror number reported in the warning message. */
u64 logical; int mirror_num;
};
/* * For the file_extent_tree, we want to hold the inode lock when we lookup and * update the disk_i_size, but lockdep will complain because our io_tree we hold * the tree lock and get the inode lock when setting delalloc. These two things * are unrelated, so make a class for the file_extent_tree so we don't get the * two locking patterns mixed up.
*/ staticstruct lock_class_key file_extent_tree_class;
nofs_flag = memalloc_nofs_save();
ipath = init_ipath(4096, local_root, &warn->path);
memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) {
btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL; /* * -ENOMEM, not a critical error, just output an generic error * without filename.
*/
btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
warn->logical, warn->mirror_num, root, inum, offset); return ret;
}
ret = paths_from_inode(inum, ipath); if (ret < 0) {
btrfs_put_root(local_root); goto err;
}
/* * We deliberately ignore the bit ipath might have been too small to * hold all of the paths here
*/ for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
warn->logical, warn->mirror_num, root, inum, offset,
fs_info->sectorsize, nlink,
(char *)(unsignedlong)ipath->fspath->val[i]);
}
ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); if (ret < 0) {
btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
logical, ret); return;
}
eb = path.nodes[0];
ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size(eb, path.slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsignedlong ptr = 0;
u64 ref_root;
u8 ref_level;
while (true) {
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level); if (ret < 0) {
btrfs_warn_rl(fs_info, "failed to resolve tree backref for logical %llu: %d",
logical, ret); break;
} if (ret > 0) break;
/* For data reloc tree, it's better to do a backref lookup instead. */ if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/ int btrfs_inode_lock(struct btrfs_inode *inode, unsignedint ilock_flags)
{ if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0;
}
inode_lock_shared(&inode->vfs_inode);
} else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0;
}
inode_lock(&inode->vfs_inode);
} if (ilock_flags & BTRFS_ILOCK_MMAP)
down_write(&inode->i_mmap_lock); return 0;
}
/* * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive.
*/ void btrfs_inode_unlock(struct btrfs_inode *inode, unsignedint ilock_flags)
{ if (ilock_flags & BTRFS_ILOCK_MMAP)
up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(&inode->vfs_inode); else
inode_unlock(&inode->vfs_inode);
}
/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 u64 offset, u64 bytes)
{
	pgoff_t index = offset >> PAGE_SHIFT;
	const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct folio *folio;

	while (index <= end_index) {
		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
		if (IS_ERR(folio)) {
			/* Folio not present in the page cache, skip it. */
			index++;
			continue;
		}
		/* Advance past this (possibly large) folio. */
		index = folio_end(folio) >> PAGE_SHIFT;
		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then btrfs_mark_ordered_io_finished() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
						offset, bytes);
		folio_put(folio);
	}
	/*
	 * NOTE(review): this function body ended without a closing brace in
	 * this file; restored it here. Per the comment above, upstream
	 * presumably also finishes the ordered extents for the range after
	 * this loop — TODO confirm against the original source.
	 */
}
/*
 * Initialize security attributes for a newly created inode: apply the
 * default and access ACLs (if present) and then the security xattrs.
 *
 * Returns 0 on success or a negative errno from ACL/xattr setup.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct btrfs_new_inode_args *args)
{
	int err;

	if (args->default_acl) {
		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
				      ACL_TYPE_DEFAULT);
		if (err)
			return err;
	}

	if (args->acl) {
		err = __btrfs_set_acl(trans, args->inode, args->acl,
				      ACL_TYPE_ACCESS);
		if (err)
			return err;
	}

	/* No ACLs at all: remember that so future lookups skip ACL xattrs. */
	if (!args->default_acl && !args->acl)
		cache_no_acl(args->inode);

	return btrfs_xattr_security_init(trans, args->inode, args->dir,
					 &args->dentry->d_name);
}
/*
 * This does all the hard work for inserting an inline extent into the btree.
 * The caller should have done a btrfs_drop_extents so that no overlapping
 * inline items exist in the btree.
 *
 * NOTE(review): the body below appears to be a garbled splice — several
 * declared locals (leaf, kaddr, ptr, ei, key, datasize) are never used in the
 * visible code, the "fail" label targeted by the goto is missing, and the
 * trailing checks (offset/fs_info/data_len, return true/false) reference
 * variables not declared here and look like they belong to a separate
 * boolean eligibility helper. TODO: compare against the original source.
 */ staticint insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size)
{ struct btrfs_root *root = inode->root; struct extent_buffer *leaf; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsignedlong ptr; struct btrfs_file_extent_item *ei; int ret;
size_t cur_size = size;
u64 i_size;
/* * The decompressed size must still be no larger than a sector. Under * heavy race, we can have size == 0 passed in, but that shouldn't be a * big deal and we can continue the insertion.
*/
ASSERT(size <= sectorsize);
/* * The compressed size also needs to be no larger than a sector. * That's also why we only need one page as the parameter.
*/ if (compressed_folio)
ASSERT(compressed_size <= sectorsize); else
ASSERT(compressed_size == 0);
/* Prefer the compressed length when the data is actually compressed. */
if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) { struct btrfs_key key;
size_t datasize;
/* * We align size to sectorsize for inline extents just for simplicity * sake.
*/
ret = btrfs_inode_set_file_extent_range(inode, 0,
ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail;
/* * We're an inline extent, so nobody can extend the file past i_size * without locking a page we already have locked. * * We must do any i_size and inode updates before we unlock the pages. * Otherwise we could end up racing with unlink.
*/
i_size = i_size_read(&inode->vfs_inode); if (update_i_size && size > i_size) {
i_size_write(&inode->vfs_inode, size);
i_size = size;
}
inode->disk_i_size = i_size;
/*
 * NOTE(review): from here on, the code references variables (offset,
 * fs_info, data_len) that are not declared in this function — this block
 * likely belongs to can_cow_file_range_inline().
 */
/* Inline extents must start at offset 0. */ if (offset != 0) returnfalse;
/* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) returnfalse;
/* We do not allow a non-compressed extent to be as large as block size. */ if (data_len >= fs_info->sectorsize) returnfalse;
/* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) returnfalse;
/* We cannot exceed the user specified max_inline size. */ if (data_len > fs_info->max_inline) returnfalse;
/* Inline extents must be the entirety of the file. */ if (size < i_size_read(&inode->vfs_inode)) returnfalse;
returntrue;
}
/*
 * conditionally insert an inline extent into the file. This does the checks
 * required to make sure the data is small enough to fit as an inline extent.
 *
 * If being used directly, you must have already checked we're allowed to cow
 * the range by getting true from can_cow_file_range_inline().
 *
 * Returns 0 on success, 1 when the insertion hit -ENOSPC (signal to fall
 * back to a regular extent), or a negative errno on other failures.
 *
 * NOTE(review): drop_args is zero-initialized but never passed to
 * btrfs_drop_extents() in the visible code, and data_len is computed but
 * unused — the dropping of overlapping extents appears to be missing here.
 * TODO: confirm against the original source.
 */ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size)
{ struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans;
u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
trans = btrfs_join_transaction(root); if (IS_ERR(trans)) {
btrfs_free_path(path); return PTR_ERR(trans);
}
trans->block_rsv = &inode->block_rsv;
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
compressed_folio, update_i_size); if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret); goto out;
} elseif (ret == -ENOSPC) {
ret = 1; goto out;
}
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret); goto out;
} elseif (ret == -ENOSPC) {
ret = 1; goto out;
}
/* NOTE(review): presumably flags the inode for a full sync on fsync — confirm. */
btrfs_set_inode_full_sync(inode);
out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. * And at reserve time, it's always aligned to page size, so * just free one page here.
*/
btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans); return ret;
}
/* * In the successful case (ret == 0 here), cow_file_range will return 1. * * Quite a bit further up the callstack in extent_writepage(), ret == 1 * is treated as a short circuited success and does not unlock the folio, * so we must do it here. * * In the failure case, the locked_folio does get unlocked by * btrfs_folio_end_all_writers, which asserts that it is still locked * at that point, so we must *not* unlock it here. * * The other two callsites in compress_file_range do not have a * locked_folio, so they are not relevant to this logic.
*/ if (ret == 0)
locked_folio = NULL;
/*
 * Decide whether the range [start, end] of @inode should be submitted to
 * compression, based on mount options, defragmentation requests, inode
 * flags/properties, or the compression heuristic.
 *
 * Returns 1 to compress, 0 to write out uncompressed.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!btrfs_inode_can_compress(inode)) {
		DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
		return 0;
	}

	/* Defrag ioctl takes precedence over mount options and properties. */
	if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
		return 0;
	if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
	    inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
		return 1;

	/* compress-force mount option wins over everything below. */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;

	/* The inode was flagged incompressible by earlier bad ratios. */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;

	/* Mount option, inode flag or property asks for it: run the heuristic. */
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}
/*
 * Queue the inode for auto-defrag when this is a small write inside EOF
 * (i.e. not a write starting at offset 0 that also reaches disk_i_size).
 */
static inline void inode_should_defrag(struct btrfs_inode *inode,
				       u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	if (num_bytes >= small_write)
		return;
	/* A write from offset 0 up to (or past) disk_i_size is not "inside eof". */
	if (start == 0 && end + 1 >= inode->disk_i_size)
		return;
	btrfs_add_inode_defrag(inode, small_write);
}
/*
 * Clear the dirty bits of every folio backing the range [start, end] so
 * mmap writers can't change the contents while we process them.
 *
 * Returns 0 on success, or the first folio-lookup error encountered
 * (remaining folios are still processed).
 */
static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
	const pgoff_t last = end >> PAGE_SHIFT;
	pgoff_t idx = start >> PAGE_SHIFT;
	int err = 0;

	while (idx <= last) {
		struct folio *f = filemap_get_folio(inode->vfs_inode.i_mapping, idx);

		idx++;
		if (IS_ERR(f)) {
			/* Remember only the first failure, keep going. */
			if (!err)
				err = PTR_ERR(f);
			continue;
		}
		btrfs_folio_clamp_clear_dirty(inode->root->fs_info, f, start,
					      end + 1 - start);
		folio_put(f);
	}
	return err;
}
/*
 * Work queue call back to started compression on a file and pages.
 *
 * This is done inside an ordered work queue, and the compression is spread
 * across many cpus. The actual IO submission is step two, and the ordered work
 * queue takes care of making sure that happens in the same order things were
 * put onto the queue by writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an entry onto the
 * work queue to write the uncompressed bytes. This makes sure that both
 * compressed inodes and uncompressed inodes are written in the same order that
 * the flusher thread sent them down.
 */ staticvoid compress_file_range(struct btrfs_work *work)
{ struct async_chunk *async_chunk =
container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping;
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size; int ret = 0; struct folio **folios; unsignedlong nr_folios; unsignedlong total_compressed = 0; unsignedlong total_in = 0; unsignedint poff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level;
/* Small writes inside EOF may be queued for auto-defrag. */
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/* * We need to call clear_page_dirty_for_io on each page in the range. * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them.
*/
ret = extent_range_clear_dirty_for_io(inode, start, end);
/* * All the folios should have been locked thus no failure. * * And even if some folios are missing, btrfs_compress_folios() * would handle them correctly, so here just do an ASSERT() check for * early logic errors.
*/
ASSERT(ret == 0);
/* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us.
*/
barrier();
i_size = i_size_read(&inode->vfs_inode);
barrier();
actual_end = min_t(u64, i_size, end + 1);
/* Restart point: each pass handles at most BTRFS_MAX_COMPRESSED_PAGES. */
again:
folios = NULL;
nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
nr_folios = min_t(unsignedlong, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner.
*/ if (actual_end <= start) goto cleanup_and_bail_uncompressed;
total_compressed = actual_end - start;
/* * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all.
*/ if (total_compressed <= blocksize &&
(start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed;
/* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios.
*/ if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed;
folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code.
*/ goto cleanup_and_bail_uncompressed;
}
/* Compression level is applied here. */
ret = btrfs_compress_folios(compress_type, compress_level,
mapping, start, folios, &nr_folios, &total_in,
&total_compressed); if (ret) goto mark_incompressible;
/* * Zero the tail end of the last page, as we might be sending it down * to disk.
*/
poff = offset_in_page(total_compressed); if (poff)
folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/* * Try to create an inline extent. * * If we didn't compress the entire range, try to create an uncompressed * inline extent, else a compressed one. * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case.
*/ if (total_in < actual_end)
ret = cow_file_range_inline(inode, NULL, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false); else
ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
compress_type, folios[0], false); if (ret <= 0) { if (ret < 0)
mapping_set_error(mapping, -EIO); goto free_pages;
}
/* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things.
*/
total_compressed = ALIGN(total_compressed, blocksize);
/* * One last check to make sure the compression is really a win, compare * the page count read with the blocks on disk, compression must free at * least one sector.
*/
total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) goto mark_incompressible;
/* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios.
*/
ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
nr_folios, compress_type);
BUG_ON(ret); if (start + total_in < end) {
start += total_in;
cond_resched(); goto again;
} return;
mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
/* Queue the whole remaining range as one uncompressed async extent. */
ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
BUG_ON(ret);
free_pages: if (folios) { for (i = 0; i < nr_folios; i++) {
WARN_ON(folios[i]->mapping);
btrfs_free_compr_folio(folios[i]);
}
kfree(folios);
}
}
/*
 * Release the compressed folios attached to @async_extent (if any) and
 * reset the folio bookkeeping so the extent no longer owns them.
 */
static void free_async_extent_pages(struct async_extent *async_extent)
{
	if (!async_extent->folios)
		return;

	for (int i = 0; i < async_extent->nr_folios; i++) {
		/* Compressed folios must not be attached to any mapping. */
		WARN_ON(async_extent->folios[i]->mapping);
		btrfs_free_compr_folio(async_extent->folios[i]);
	}

	kfree(async_extent->folios);
	async_extent->folios = NULL;
	async_extent->nr_folios = 0;
}
if (async_chunk->blkcg_css)
kthread_associate_blkcg(async_chunk->blkcg_css);
/* * If async_chunk->locked_folio is in the async_extent range, we need to * handle it.
*/ if (async_chunk->locked_folio) {
u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
u64 locked_folio_end = locked_folio_start +
folio_size(async_chunk->locked_folio) - 1;
if (!(start >= locked_folio_end || end <= locked_folio_start))
locked_folio = async_chunk->locked_folio;
}
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
0, *alloc_hint, &ins, 1, 1); if (ret) { /* * We can't reserve contiguous space for the compressed size. * Unlikely, but it's possible that we could have enough * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
free_pages = true; goto done;
}
btrfs_lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset;
file_extent.ram_bytes = async_extent->ram_size;
file_extent.num_bytes = async_extent->ram_size;
file_extent.offset = 0;
file_extent.compression = async_extent->compress_type;
em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) {
ret = PTR_ERR(em); goto out_free_reserve;
}
btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it.
*/ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
btrfs_free_extent_map(em);
em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
alloc_hint = btrfs_extent_map_block_start(em); if (em)
btrfs_free_extent_map(em);
} else {
alloc_hint = btrfs_extent_map_block_start(em);
btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
return alloc_hint;
}
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code. The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_folio is the folio that writepage had locked already. We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * When this function fails, it unlocks all pages except @locked_folio.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_folio and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_folio is
 * the only page handled anyway).
 *
 * When this function succeed and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_folio are unlocked.
 *
 * When a failure happens in the second or later iteration of the
 * while-loop, the ordered extents created in previous iterations are cleaned up.
 *
 * NOTE(review): the body below is visibly incomplete/garbled — num_bytes is
 * read before ever being assigned, there is a "continue" and a "break" with
 * no enclosing loop in sight, and file_extent/ordered are used without being
 * declared. The allocation loop that normally surrounds this code appears to
 * have been lost. TODO: restore from the original source before building.
 */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset, bool keep_locked, bool no_inline)
{ struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsignedlong page_ops; int ret = 0;
/* The free space inode must never take this COW path. */
if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL; goto out_unlock;
}
if (!no_inline) { /* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done * with this page and already handled the IO. * * If there was an error then cow_file_range_inline() has * already done the cleanup.
*/ if (ret == 0)
ret = 1; goto done;
}
}
/* * We're not doing compressed IO, don't unlock the first page (which * the caller expects to stay locked), don't clear any dirty bits and * don't set any writeback bits. * * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items.
*/ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes; else
min_alloc_size = fs_info->sectorsize;
ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned * file systems, which is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at * least one zone to finish before retrying the * allocation. Otherwise ask the caller to write out * the already allocated blocks before coming back to * us, or return -ENOSPC if it can't handle retries.
*/
ASSERT(btrfs_is_zoned(fs_info)); if (start == orig_start) {
wait_on_bit_io(&inode->root->fs_info->flags,
BTRFS_FS_NEED_ZONE_FINISH,
TASK_UNINTERRUPTIBLE); continue;
} if (done_offset) { /* * Move @end to the end of the processed range, * and exit the loop to unlock the processed extents.
*/
end = start - 1;
ret = 0; break;
}
ret = -ENOSPC;
} if (ret < 0) goto out_unlock;
cur_alloc_size = ins.offset;
/* * Locked range will be released either during error clean up or * after the whole range is finished.
*/
btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) {
btrfs_unlock_extent(&inode->io_tree, start,
start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em); goto out_reserve;
}
btrfs_free_extent_map(em);
if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(ordered);
/* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent.
*/ if (ret)
btrfs_drop_extent_map_range(inode, start,
start + cur_alloc_size - 1, false);
}
btrfs_put_ordered_extent(ordered);
/* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit.
*/ if (ret) goto out_unlock;
}
extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done: if (done_offset)
*done_offset = end; return ret;
out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below.
*/
/* * For the range (1). We have already instantiated the ordered extents * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV * are also handled by the ordered extents cleanup. * * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted.
*/ if (orig_start < start) {
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
/* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
}
/* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space().
*/ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
end, locked_folio,
&cached, clear_bits, page_ops);
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
btrfs_err_rl(fs_info, "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret;
}
/* * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. * * If called with @do_free == true then it'll try to finish the work and free * the work struct eventually.
*/ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); struct async_extent *async_extent; unsignedlong nr_pages;
u64 alloc_hint = 0;
if (do_free) { struct async_cow *async_cow;
btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
async_cow = async_chunk->async_cow; if (atomic_dec_and_test(&async_cow->num_chunks))
kvfree(async_cow); return;
}
for (i = 0; i < num_chunks; i++) {
u64 cur_end = min(end, start + SZ_512K - 1);
/* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
async_chunk[i].async_cow = ctx;
async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
/* * The locked_folio comes all the way from writepage and its * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk * structs, only the first struct needs a pointer to * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it.
*/ if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page.
*/
wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
} else {
async_chunk[i].locked_folio = NULL;
}
/* * Run the delalloc range from start to end, and write back any dirty pages * covered by the range.
*/ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc, bool pages_dirty)
{
u64 done_offset = end; int ret;
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
&done_offset, true, false); if (ret) return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
/* * If EXTENT_NORESERVE is set it means that when the buffered write was * made we had not enough available data space and therefore we did not * reserve data space for it, since we though we could do NOCOW for the * respective file range (either there is prealloc extent or the inode * has the NOCOW bit set). * * However when we need to fallback to COW mode (because for example the * block group for the corresponding extent was turned to RO mode by a * scrub or relocation) we need to do the following: * * 1) We increment the bytes_may_use counter of the data space info. * If COW succeeds, it allocates a new data extent and after doing * that it decrements the space info's bytes_may_use counter and * increments its bytes_reserved counter by the same amount (we do * this at btrfs_add_reserved_bytes()). So we need to increment the * bytes_may_use counter to compensate (when space is reserved at * buffered write time, the bytes_may_use counter is incremented); * * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so * that if the COW path fails for any reason, it decrements (through * extent_clear_unlock_delalloc()) the bytes_may_use counter of the * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free * space cache inode or an inode of the data relocation tree, we must * also increment bytes_may_use of the data space_info for the same * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block * group that contains that extent to RO mode and therefore force COW * when starting writeback.
*/
btrfs_lock_extent(io_tree, start, end, &cached_state);
count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo;
if (is_space_ino || is_reloc_ino)
bytes = range_bytes;
/* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work.
*/
ret = cow_file_range(inode, locked_folio, start, end, NULL, false, true);
ASSERT(ret != 1); return ret;
}
/* Start file offset of the range we want to NOCOW. */
u64 start; /* End file offset (inclusive) of the range we want to NOCOW. */
u64 end; bool writeback_path; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore.
*/ bool free_path;
/* * Output fields. Only set when can_nocow_file_extent() returns 1. * The expected file extent for the NOCOW write.
*/ struct btrfs_file_extent file_extent;
};
/*
 * Check if we can NOCOW the file extent that the path points to.
 * This function may return with the path released, so the caller should check
 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
 *
 * @path:  positioned at the file extent item to examine; may be released (and,
 *         when args->free_path is set, freed) before returning.
 * @key:   key of that file extent item; key->offset is the file offset.
 * @inode: inode the write targets.
 * @args:  control block (see struct can_nocow_file_extent_args); free_path and
 *         writeback_path are inputs, file_extent is consumed below.
 *
 * Returns: < 0 on error
 *            0 if we can not NOCOW
 *            1 if we can NOCOW
 *
 * NOTE(review): this extract looks incomplete — csum_root, io_start,
 * extent_end and nowait are declared but never used, and
 * args->file_extent.{offset,disk_bytenr} are read below without any visible
 * initialization. The section of the function that fills args->file_extent
 * from the file extent item appears to have been lost; confirm against the
 * full source before relying on this copy. "staticint"/"constbool" look like
 * whitespace lost in extraction ("static int"/"const bool"); the code is
 * otherwise kept byte-identical.
 */ staticint can_nocow_file_extent(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_inode *inode, struct can_nocow_file_extent_args *args)
{ constbool is_freespace_inode = btrfs_is_free_space_inode(inode); struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; struct btrfs_root *csum_root;
u64 io_start;
u64 extent_end;
u8 extent_type; int can_nocow = 0; int ret = 0; bool nowait = path->nowait;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
/* Inline extents store data in the leaf itself; NOCOW does not apply. */
if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out;
/*
 * A regular extent on an inode without the NODATACOW flag must be COWed.
 * (Prealloc extents fall through: they may be written in place.)
 */
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG) goto out;
/*
 * If the extent was created before the generation where the last snapshot
 * for its subvolume was created, then this implies the extent is shared,
 * hence we must COW.
 */ if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item)) goto out;
/* An explicit hole, must COW. */ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out;
/* Compressed/encrypted/encoded extents must be COWed. */ if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi)) goto out;
/*
 * The following checks can be expensive, as they need to take other
 * locks and do btree or rbtree searches, so release the path to avoid
 * blocking other tasks for too long.
 */
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
args->file_extent.disk_bytenr, path);
/*
 * A free space inode's extents should never be cross-referenced; a
 * positive result from the check above would indicate corruption.
 * Any non-zero result (shared extent or error) forbids NOCOW.
 */
WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out;
if (args->free_path) { /*
 * We don't need the path anymore, plus through the
 * btrfs_lookup_csums_list() call below we will end up allocating
 * another path. So free the path to avoid unnecessary extra
 * memory usage.
 */
btrfs_free_path(path);
path = NULL;
}
/* If there are pending snapshots for this root, we must COW. */ if (args->writeback_path && !is_freespace_inode &&
atomic_read(&root->snapshot_force_cow)) goto out;
can_nocow = 1;
out: if (args->free_path && path)
btrfs_free_path(path);
/* Negative ret wins over the NOCOW verdict. */
return ret < 0 ? ret : can_nocow;
}
/*
 * Cleanup the dirty folios which will never be submitted due to error.
 *
 * When running a delalloc range, we may need to split the ranges (due to
 * fragmentation or NOCOW). If we hit an error in the later part, we will error
 * out and previously successfully executed range will never be submitted, thus
 * we have to cleanup those folios by clearing their dirty flag, starting and
 * finishing the writeback.
 *
 * @inode:        inode owning the range.
 * @locked_folio: folio already locked by the caller; handled separately and
 *                left locked for the caller to unlock.
 * @start/@end:   inclusive byte range to clean up; must be sector aligned.
 * @error:        errno to record on the mapping.
 */
static void cleanup_dirty_folios(struct btrfs_inode *inode,
				 struct folio *locked_folio,
				 u64 start, u64 end, int error)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	u32 len;

	/*
	 * Fix: @len was previously used uninitialized.  The range must fit in
	 * a u32 and be sector aligned for the clamp helpers below.
	 */
	ASSERT(end + 1 - start < U32_MAX);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(end + 1, fs_info->sectorsize));
	len = end + 1 - start;

	/*
	 * Handle the locked folio first.
	 * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
	 */
	btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);

	for (pgoff_t index = start_index; index <= end_index; index++) {
		struct folio *folio;

		/* Already handled at the beginning. */
		if (index == locked_folio->index)
			continue;
		folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
		/* Cache already dropped, no need to do any cleanup. */
		if (IS_ERR(folio))
			continue;
		/*
		 * Fix: operate on the folio we just looked up, not on
		 * @locked_folio, which was already handled above.
		 */
		btrfs_folio_clamp_finish_io(fs_info, folio, start, len);
		folio_unlock(folio);
		folio_put(folio);
	}
	mapping_set_error(mapping, error);
}
/*
 * Run a NOCOW (or prealloc) write for one contiguous range described by
 * @nocow_args->file_extent: lock the extent range, create the I/O extent map
 * for prealloc writes, allocate the ordered extent, clone csums for data
 * relocation roots, then release and unlock the delalloc range.
 *
 * Fix: the previous version used @ordered without ever assigning it — the
 * extent locking, prealloc extent-map creation and ordered extent allocation
 * were missing and are restored here.
 *
 * @file_pos:    file offset the write starts at.
 * @is_prealloc: true when writing into a preallocated extent.
 *
 * Returns 0 on success or a negative errno.  On failure after the ordered
 * extent was created, the ordered extents for the range are cleaned up; the
 * folio Dirty flags are left for the caller to handle.
 */
static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
			   struct extent_state **cached,
			   struct can_nocow_file_extent_args *nocow_args,
			   u64 file_pos, bool is_prealloc)
{
	struct btrfs_ordered_extent *ordered;
	u64 len = nocow_args->file_extent.num_bytes;
	u64 end = file_pos + len - 1;
	int ret = 0;

	btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);

	if (is_prealloc) {
		struct extent_map *em;

		/* Pin the mapping so the write goes to the prealloc extent. */
		em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
					BTRFS_ORDERED_PREALLOC);
		if (IS_ERR(em)) {
			btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
			return PTR_ERR(em);
		}
		btrfs_free_extent_map(em);
	}

	ordered = btrfs_alloc_ordered_extent(inode, file_pos,
					     &nocow_args->file_extent,
					     is_prealloc
					     ? (1U << BTRFS_ORDERED_PREALLOC)
					     : (1U << BTRFS_ORDERED_NOCOW));
	if (IS_ERR(ordered)) {
		if (is_prealloc)
			btrfs_drop_extent_map_range(inode, file_pos, end, false);
		btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
		return PTR_ERR(ordered);
	}

	if (btrfs_is_data_reloc_root(inode->root))
		/*
		 * Errors are handled later, as we must prevent
		 * extent_clear_unlock_delalloc() in error handler from freeing
		 * metadata of the created ordered extent.
		 */
		ret = btrfs_reloc_clone_csums(ordered);
	btrfs_put_ordered_extent(ordered);

	extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_CLEAR_DATA_RESV,
				     PAGE_UNLOCK | PAGE_SET_ORDERED);
	/*
	 * On error, we need to cleanup the ordered extents we created.
	 *
	 * We do not clear the folio Dirty flags because they are set and
	 * cleared by the caller.
	 */
	if (ret < 0)
		btrfs_cleanup_ordered_extents(inode, file_pos, len);
	return ret;
}
/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk
*/ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct folio *locked_folio, const u64 start, const u64 end)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path;
u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling.
*/
u64 cow_end = 0;
u64 cur_offset = start; int ret; bool check_prev = true;
u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 };
/* * Normally on a zoned device we're only doing COW writes, but in case * of relocation on a zoned filesystem serializes I/O so that we're only * writing sequentially and can end up here as well.
*/
ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto error;
}
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0); if (ret < 0) goto error;
/* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset
*/ if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
path->slots[0] - 1); if (found_key.objectid == ino &&
found_key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
check_prev = false;
next_slot: /* Go to next leaf if we have exhausted the current one */
leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path); if (ret < 0) goto error; if (ret > 0) break;
leaf = path->nodes[0];
}
/* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode
*/ if (WARN_ON_ONCE(found_key.objectid < ino) ||
found_key.type < BTRFS_EXTENT_DATA_KEY) {
/*
 * NOTE(review): the source extract was cut off here ("maximum size reached").
 * The remainder of run_delalloc_nocow() and everything after it is missing;
 * the trailing lines were website boilerplate from the extraction tool, not
 * source code. Restore the rest of the file from the original source.
 */