/*
 * Magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};
/* * directory trouble cases * * 1) on rename or unlink, if the inode being unlinked isn't in the fsync * log, we must force a full commit before doing an fsync of the directory * where the unlink was done. * ---> record transid of last unlink/rename per directory * * mkdir foo/some_dir * normal commit * rename foo/some_dir foo2/some_dir * mkdir foo/some_dir * fsync foo/some_dir/some_file * * The fsync above will unlink the original some_dir without recording * it in its new location (foo2). After a crash, some_dir will be gone * unless the fsync of some_file forces a full commit * * 2) we must log any new names for any file or dir that is in the fsync * log. ---> check inode while renaming/linking. * * 2a) we must log any new names for any file or dir during rename * when the directory they are being removed from was logged. * ---> check inode and old parent dir during rename * * 2a is actually the more important variant. With the extra logging * a crash might unlink the old name without recreating the new one * * 3) after a crash, we must go through any directories with a link count * of zero and redo the rm -rf * * mkdir f1/foo * normal commit * rm -rf f1/foo * fsync(f1) * * The directory f1 was fully removed from the FS, but fsync was never * called on f1, only its parent dir. After a crash the rm -rf must * be replayed. This must be able to recurse down the entire * directory tree. The inode link count fixup code takes care of the * ugly details.
*/
/*
 * Stages for the tree walking. The first stage (0) is to only pin down the
 * blocks we find, the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents and all
 * the other fun semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
/*
 * Tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
/* Only meant to be called for subvolume roots and not for log roots. */
ASSERT(btrfs_is_fstree(btrfs_root_id(root)));
/* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL * to allocate an inode, which can recurse back into the filesystem and * attempt a transaction commit, resulting in a deadlock.
*/
nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(objectid, root);
memalloc_nofs_restore(nofs_flag);
return inode;
}
/* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people * syncing the tree wait for us to finish
*/ staticint start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; constbool zoned = btrfs_is_zoned(fs_info); int ret = 0; bool created = false;
/* * First check if the log root tree was already created. If not, create * it before locking the root's log_mutex, just to keep lockdep happy.
*/ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, fs_info); if (!ret) {
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
created = true;
}
}
mutex_unlock(&tree_root->log_mutex); if (ret) return ret;
}
mutex_lock(&root->log_mutex);
again: if (root->log_root) { int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
ret = BTRFS_LOG_FORCE_COMMIT; goto out;
}
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
} elseif (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else { /* * This means fs_info->log_root_tree was already created * for some other FS trees. Do the full commit not to mix * nodes from multiple log transactions to do sequential * writing.
*/ if (zoned && !created) {
ret = BTRFS_LOG_FORCE_COMMIT; goto out;
}
ret = btrfs_add_log_tree(trans, root); if (ret) goto out;
/* * returns 0 if there was a log transaction running and we were able * to join, or returns -ENOENT if there were not transactions * in progress
*/ staticint join_running_log_trans(struct btrfs_root *root)
{ constbool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT;
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret;
mutex_lock(&root->log_mutex);
again: if (root->log_root) { int index = (root->log_transid + 1) % 2;
/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	/*
	 * A non-zero log_writers count keeps log syncing from completing
	 * until the matching btrfs_end_log_trans() drops it back down.
	 */
	atomic_inc(&root->log_writers);
}
/*
 * Indicate we're done making changes to the log tree and wake up anyone
 * waiting to do a sync.
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	/* Pairs with the increment done in btrfs_pin_log_trans(). */
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part.
 */
struct walk_control {
	/*
	 * Should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree.
	 */
	int free;

	/*
	 * Pin only walk, we record which extents on disk belong to the
	 * log trees.
	 */
	int pin;

	/* What stage of the replay code we're currently in (LOG_WALK_*). */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY.
	 */
	bool ignore_cur_inode;

	/* The root we are currently replaying into. */
	struct btrfs_root *replay_dest;

	/* The trans handle for the current replay. */
	struct btrfs_trans_handle *trans;

	/*
	 * The function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it.
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
/*
 * process_func used to pin down extents, write them or wait on them.
 *
 * Returns 0 on success and a negative errno on failure; on failure the
 * transaction is aborted (or a fs error is recorded when there is no
 * transaction handle).
 */
static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	struct btrfs_trans_handle *trans = wc->trans;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		struct btrfs_tree_parent_check check = {
			.level = level,
			.transid = gen
		};

		ret = btrfs_read_extent_buffer(eb, &check);
		if (ret) {
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	}

	if (!wc->pin)
		return 0;

	ASSERT(trans != NULL);
	ret = btrfs_pin_extent_for_log_replay(trans, eb);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	/* Only leaves (level 0) that are up to date carry logged extents. */
	if (btrfs_buffer_uptodate(eb, gen, 0) && btrfs_header_level(eb) == 0) {
		ret = btrfs_exclude_logged_extents(eb);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	return ret;
}
/* * Item overwrite used by log replay. The given eb, slot and key all refer to * the source data we are copying out. * * The given root is for the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and will be * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. * If it is too large, it is truncated. * * If the key isn't in the destination yet, a new item is inserted.
*/ staticint overwrite_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key)
{ int ret;
u32 item_size;
u64 saved_i_size = 0; int save_old_i_size = 0; unsignedlong src_ptr; unsignedlong dst_ptr; struct extent_buffer *dst_eb; int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
/* * This is only used during log replay, so the root is always from a * fs/subvolume tree. In case we ever need to support a log root, then * we'll have to clone the leaf in the path, release the path and use * the leaf before writing into the log tree. See the comments at * copy_items() for more details.
*/
ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
kfree(src_copy); /* * they have the same contents, just return, this saves * us from cowing blocks in the destination tree and doing * extra writes that may not have been done by a previous * sync
*/ if (ret == 0) {
btrfs_release_path(path); return 0;
}
/* * We need to load the old nbytes into the inode so when we * replay the extents we've logged we get the right nbytes.
*/ if (inode_item) { struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
/* * If this is a directory we need to reset the i_size to * 0 so that we can set it up properly when replaying * the rest of the items in this log.
*/
mode = btrfs_inode_mode(eb, item); if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
} elseif (inode_item) { struct btrfs_inode_item *item;
u32 mode;
/* * New inode, set nbytes to 0 so that the nbytes comes out * properly when we replay the extents.
*/
item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, 0);
/* * If this is a directory we need to reset the i_size to 0 so * that we can set it up properly when replaying the rest of * the items in this log.
*/
mode = btrfs_inode_mode(eb, item); if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
insert:
btrfs_release_path(path); /* try to insert the key into the destination tree */
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number * was logged as zero. This is done when the tree logging code * is just logging an inode to make sure it exists after recovery. * * Also, don't overwrite i_size on directories during replay. * log replay inserts and removes directory items based on the * state of the tree found in the subvolume, and i_size is modified * as it goes
*/ if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { struct btrfs_inode_item *src_item; struct btrfs_inode_item *dst_item;
/* * For regular files an ino_size == 0 is used only when * logging that an inode exists, as part of a directory * fsync, and the inode wasn't fsynced before. In this * case don't set the size of the inode in the fs/subvol * tree, otherwise we would be throwing valid data away.
*/ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. * * extents in the log tree have not been allocated out of the extent * tree yet. So, this completes the allocation, taking a reference * as required if the extent already exists or creating a new extent * if it isn't in the extent allocation tree yet. * * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one.
*/ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key)
{ struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type;
u64 extent_end;
u64 start = key->offset;
u64 nbytes = 0; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; unsignedlong size; int ret = 0;
/* * We don't add to the inodes nbytes if we are prealloc or a * hole.
*/ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} elseif (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_ram_bytes(eb, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
btrfs_err(fs_info, "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
found_type, btrfs_root_id(root), key->objectid, key->offset); return -EUCLEAN;
}
inode = btrfs_iget_logging(key->objectid, root); if (IS_ERR(inode)) return PTR_ERR(inode);
/* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent.
*/
ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
/* * we already have a pointer to this exact extent, * we don't have to do anything
*/ if (memcmp_extent_buffer(eb, &existing, (unsignedlong)item, sizeof(existing)) == 0) {
btrfs_release_path(path); goto out;
}
}
btrfs_release_path(path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out;
/* * Manually record dirty extent, as here we did a shallow * file extent item copy and skip normal backref update, * but modifying extent tree all by ourselves. * So need to manually record dirty extent for qgroup, * as the owner of the file extent changed from log tree * (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out;
if (ins.objectid > 0) {
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
/* * is this extent already allocated in the extent * allocation tree? If so, just add a reference
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset); if (ret < 0) { goto out;
} elseif (ret == 0) { struct btrfs_ref ref = {
.action = BTRFS_ADD_DELAYED_REF,
.bytenr = ins.objectid,
.num_bytes = ins.offset,
.owning_root = btrfs_root_id(root),
.ref_root = btrfs_root_id(root),
};
btrfs_init_data_ref(&ref, key->objectid, offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out;
} else { /* * insert the extent pointer in the extent * allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
btrfs_root_id(root),
key->objectid, offset, &ins); if (ret) goto out;
}
btrfs_release_path(path);
ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
&ordered_sums, false); if (ret < 0) goto out;
ret = 0; /* * Now delete all existing cums in the csum root that * cover our range. We do this because we can have an * extent that is completely referenced by one file * extent item and partially referenced by another * file extent item (like after using the clone or * extent_same ioctls). In this case if we end up doing * the replay of the one that partially references the * extent first, and we do not do the csum deletion * below, we can get 2 csum items in the csum tree that * overlap each other. For example, imagine our log has * the two following file extent items: * * key (257 EXTENT_DATA 409600) * extent data disk byte 12845056 nr 102400 * extent data offset 20480 nr 20480 ram 102400 * * key (257 EXTENT_DATA 819200) * extent data disk byte 12845056 nr 102400 * extent data offset 0 nr 102400 ram 102400 * * Where the second one fully references the 100K extent * that starts at disk byte 12845056, and the log tree * has a single csum item that covers the entire range * of the extent: * * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 * * After the first file extent item is replayed, the * csum tree gets the following csum item: * * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 * * Which covers the 20K sub-range starting at offset 20K * of our extent. Now when we replay the second file * extent item, if we do not delete existing csum items * that cover any of its blocks, we end up getting two * csum items in our csum tree that overlap each other: * * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 * * Which is a problem, because after this anyone trying * to lookup up for the checksum of any block of our * extent starting at an offset of 40K or higher, will * end up looking at the second csum item only, which * does not contain the checksum for any block starting * at offset 40K or higher of our extent.
*/ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; struct btrfs_root *csum_root;
sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum,
list);
csum_root = btrfs_csum_root(fs_info,
sums->logical); if (!ret)
ret = btrfs_del_csums(trans, csum_root,
sums->logical,
sums->len); if (!ret)
ret = btrfs_csum_file_blocks(trans,
csum_root,
sums);
list_del(&sums->list);
kfree(sums);
} if (ret) goto out;
} else {
btrfs_release_path(path);
}
} elseif (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */
ret = overwrite_item(trans, root, path, eb, slot, key); if (ret) goto out;
}
ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out;
ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* * Whenever we need to check if a name exists or not, we check the * fs/subvolume tree. So after an unlink we must run delayed items, so * that future checks for a name during log replay see that the name * does not exists anymore.
*/ return btrfs_run_delayed_items(trans);
}
/*
 * When cleaning up conflicts between the directory names in the subvolume,
 * directory names in the log and directory names in the inode back
 * references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory item.
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_inode *victim = NULL;
	struct btrfs_key location;
	struct fscrypt_str name;
	int ret;

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret)
		return -ENOMEM;

	btrfs_release_path(path);

	victim = btrfs_iget_logging(location.objectid, root);
	if (IS_ERR(victim)) {
		ret = PTR_ERR(victim);
		victim = NULL;
		goto out;
	}

	/* Record the inode in the fixup dir so its link count is rechecked. */
	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, victim, &name);
out:
	kfree(name.name);
	if (victim)
		iput(&victim->vfs_inode);
	return ret;
}
/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_dir_item *di;
	struct btrfs_key found_key;
	int ret = 0;

	/* First look for a matching dir index item. */
	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}
	if (!di)
		goto out;
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &found_key);
	if (found_key.objectid != objectid)
		goto out;

	btrfs_release_path(path);

	/* Then check the dir item as well; both must point at our inode. */
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}
	if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &found_key);
		if (found_key.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}
/*
 * Helper function to check a log tree for a named back reference in
 * an inode. This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * Inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 *
 * Returns 1 if the name is present in the log, 0 if not, < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 1) {
		/* Key not found in the log tree, so the name is not logged. */
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid, name);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0], name);
out:
	btrfs_free_path(path);
	return ret;
}
/* * Check all the names in this back reference to see if they are in the * log. If so, we allow them to stay otherwise they must be unlinked as * a conflict.
*/
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; int ret;
victim_ref = (struct btrfs_inode_ref *)ptr;
ret = read_alloc_one_name(leaf, (victim_ref + 1),
btrfs_inode_ref_name_len(leaf, victim_ref),
&victim_name); if (ret) return ret;
ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); if (ret) {
kfree(victim_name.name); if (ret < 0) return ret;
ptr = (unsignedlong)(victim_ref + 1) + victim_name.len; continue;
}
again: /* Search old style refs */
search_key.objectid = inode_objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) { return ret;
} elseif (ret == 0) { /* * Are we trying to overwrite a back ref for the root directory? * If so, we're done.
*/ if (search_key.objectid == search_key.offset) return 1;
ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
name); if (ret) return ret;
if (index)
*index = btrfs_inode_ref_index(eb, ref);
return 0;
}
/* * Take an inode reference item from the log tree and iterate all names from the * inode reference item in the subvolume tree with the same key (if it exists). * For any name that is not in the inode reference item from the log tree, do a * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys).
*/ staticint unlink_old_inode_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_inode *inode, struct extent_buffer *log_eb, int log_slot, struct btrfs_key *key)
{ int ret; unsignedlong ref_ptr; unsignedlong ref_end; struct extent_buffer *eb;
again:
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret > 0) {
ret = 0; goto out;
} if (ret < 0) goto out;
/* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp * use by this function. (it should be released on return).
*/ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key)
{ struct btrfs_inode *dir = NULL; struct btrfs_inode *inode = NULL; unsignedlong ref_ptr; unsignedlong ref_end; struct fscrypt_str name = { 0 }; int ret; constbool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0; int ref_struct_size;
/* * it is possible that we didn't log all the parent directories * for a given inode. If we don't find the dir, just don't * copy the back ref in. The link count fixup code will take * care of the rest
*/
dir = btrfs_iget_logging(parent_objectid, root); if (IS_ERR(dir)) {
ret = PTR_ERR(dir); if (ret == -ENOENT)
ret = 0;
dir = NULL; goto out;
}
inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL; goto out;
}
while (ref_ptr < ref_end) { if (is_extref_item) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid); if (ret) goto out; /* * parent object can change from one array * item to another.
*/ if (!dir) {
dir = btrfs_iget_logging(parent_objectid, root); if (IS_ERR(dir)) {
ret = PTR_ERR(dir);
dir = NULL; /* * A new parent dir may have not been * logged and not exist in the subvolume * tree, see the comment above before * the loop when getting the first * parent dir.
*/ if (ret == -ENOENT) { /* * The next extref may refer to * another parent dir that * exists, so continue.
*/
ret = 0; goto next;
} goto out;
}
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); if (ret) goto out;
}
ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
ref_index, &name); if (ret < 0) { goto out;
} elseif (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name * of the file before we add our new link. Later on, we * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory.
*/
ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name); if (ret) { if (ret == 1)
ret = 0; goto out;
}
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out;
ret = btrfs_update_inode(trans, inode); if (ret) goto out;
} /* Else, ret == 1, we already have a perfect match, we're done. */
/* * Before we overwrite the inode reference item in the subvolume tree * with the item from the log tree, we must unlink all names from the * parent directory that are in the subvolume's tree inode reference * item, otherwise we end up with an inconsistent subvolume tree where * dir index entries exist for a name but there is no inode reference * item with the same name.
*/
ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out;
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
kfree(name.name); if (dir)
iput(&dir->vfs_inode); if (inode)
iput(&inode->vfs_inode); return ret;
}
if (key.offset == 0) break; if (path->slots[0] > 0) {
path->slots[0]--; goto process_slot;
}
key.offset--;
btrfs_release_path(path);
}
btrfs_release_path(path);
return nlink;
}
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Count names from old-style INODE_REF items. */
	ret = count_inode_refs(inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	/* Add names from extended ref (INODE_EXTREF) items. */
	ret = count_inode_extrefs(inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	/* Sync the in-memory link count and persist it if it changed. */
	if (nlink != inode->vfs_inode.i_nlink) {
		set_nlink(&inode->vfs_inode, nlink);
		ret = btrfs_update_inode(trans, inode);
		if (ret)
			goto out;
	}
	/* For directories, force the next dir index to be recomputed. */
	if (S_ISDIR(inode->vfs_inode.i_mode))
		inode->index_cnt = (u64)-1;

	if (inode->vfs_inode.i_nlink == 0) {
		if (S_ISDIR(inode->vfs_inode.i_mode)) {
			/* An unlinked dir must have its entries removed too. */
			ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
			if (ret)
				goto out;
		}
		/*
		 * Track the zero-link inode as an orphan; -EEXIST means it is
		 * already tracked, which is fine.
		 */
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}
out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Walk all fixup items recorded during log replay and correct the link
 * count of each inode they reference, deleting each fixup item as it is
 * processed.
 *
 * NOTE(review): the body below appears truncated in this extraction — the
 * fixup-key initialization and the enclosing search loop (which the bare
 * `break` statements and the uses of `key` and the undeclared `inode`
 * clearly require) are missing.  Compare against the canonical
 * fs/btrfs/tree-log.c before relying on this text.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path)
{ int ret; struct btrfs_key key;
/* Delete the fixup item we just found so it is processed only once. */
ret = btrfs_del_item(trans, root, path); if (ret) break;
btrfs_release_path(path);
/* key.offset holds the inode number the fixup item refers to. */
inode = btrfs_iget_logging(key.offset, root); if (IS_ERR(inode)) {
ret = PTR_ERR(inode); break;
}
/* Recount backrefs and set the inode's i_nlink accordingly. */
ret = fixup_inode_link_count(trans, inode);
iput(&inode->vfs_inode); if (ret) break;
/* * fixup on a directory may create new entries, * make sure we always look for the highest possible * offset
 */
key.offset = (u64)-1;
}
btrfs_release_path(path); return ret;
}
/*
 * Record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it.
 *
 * Returns 0 on success (including when the fixup item already exists)
 * or a negative errno.
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_inode *inode;
	struct inode *vfs_inode;

	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	vfs_inode = &inode->vfs_inode;
	/*
	 * Fixup items live under a dedicated objectid so they can all be
	 * found (and deleted) in one pass once replay has finished.
	 */
	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		/* Pin the inode with at least one link until the recount. */
		if (!vfs_inode->i_nlink)
			set_nlink(vfs_inode, 1);
		else
			inc_nlink(vfs_inode);
		ret = btrfs_update_inode(trans, inode);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	} else if (ret == -EEXIST) {
		/* Already queued for fixup; nothing more to do. */
		ret = 0;
	}
	iput(vfs_inode);

	return ret;
}
/*
 * When replaying the log for a directory, we only insert names for inodes
 * that actually exist.  This means an fsync on a directory does not
 * implicitly fsync all the new files in it.
 *
 * Returns the result of btrfs_add_link() (0 on success, negative errno
 * otherwise), or a negative errno if either inode lookup fails.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct btrfs_inode *inode;
	struct btrfs_inode *dir;
	int ret;

	inode = btrfs_iget_logging(location->objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	dir = btrfs_iget_logging(dirid, root);
	if (IS_ERR(dir)) {
		iput(&inode->vfs_inode);
		return PTR_ERR(dir);
	}

	ret = btrfs_add_link(trans, dir, inode, name, 1, index);

	/* Drop both references taken above regardless of the link result. */
	iput(&inode->vfs_inode);
	iput(&dir->vfs_inode);
	return ret;
}
/* * take a single entry in a log directory item and replay it into * the subvolume. * * if a conflicting item exists in the subdirectory already, * the inode it points to is unlinked and put into the link count * fix up tree. * * If a name from the log points to a file or directory that does * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. * * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. * * NOTE(review): this extraction looks truncated — dir_dst_matches and * index_dst_matches are declared and tested below but never assigned, and * the conflicting-entry lookups (dir_dst_di / index_dst_di) are missing. * The "elseif" tokens also look like a mangled "else if".  Compare against * the canonical fs/btrfs/tree-log.c before relying on this text.
 */ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, struct btrfs_dir_item *di, struct btrfs_key *key)
{ struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; struct btrfs_inode *dir;
u8 log_flags; bool exists; int ret; bool update_size = true; bool name_added = false;
/* The directory containing the entry being replayed. */
dir = btrfs_iget_logging(key->objectid, root); if (IS_ERR(dir)) return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) goto out;
log_flags = btrfs_dir_flags(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
/* Does the inode the log entry points to exist in the subvolume? */
ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path); if (ret < 0) goto out;
exists = (ret == 0);
ret = 0;
/* Both the dir item and dir index already match the log: nothing to do. */
if (dir_dst_matches && index_dst_matches) {
ret = 0;
update_size = false; goto out;
}
/* * Check if the inode reference exists in the log for the given name, * inode and parent inode
 */
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = key->objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { goto out;
} elseif (ret) { /* The dentry will be added later. */
ret = 0;
update_size = false; goto out;
}
/* Same check, but for extended refs (hashed parent + name). */
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len);
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out;
} elseif (ret) { /* The dentry will be added later. */
ret = 0;
update_size = false; goto out;
}
btrfs_release_path(path);
/* Insert the name now; -ENOENT and -EEXIST are tolerated here. */
ret = insert_one_name(trans, root, key->objectid, key->offset,
&name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; if (!ret)
name_added = true;
update_size = false;
ret = 0;
out: if (!ret && update_size) {
/* Each name accounts for twice its length in the dir's i_size. */
btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
iput(&dir->vfs_inode); if (!ret && name_added)
ret = 1; return ret;
}
/*
 * Replay one dir item from a BTRFS_DIR_INDEX_KEY key.
 *
 * Returns < 0 on error, otherwise the result of replay_one_name() folded
 * down to 0 (after any needed link-count fixup bookkeeping).
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in a transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to.  Otherwise something like the following would leave a
	 * directory pointing to an inode whose link count does not account
	 * for this dir entry:
	 *
	 *   mkdir testdir ; touch testdir/foo ; touch testdir/bar ; sync
	 *   ln testdir/bar testdir/bar_link
	 *   ln testdir/foo testdir/foo_link
	 *   xfs_io -c "fsync" testdir/bar
	 *   <power failure> ; mount, log replay happens
	 *
	 * File foo would remain with a link count of 1 while two entries in
	 * testdir point to it, making testdir impossible to ever delete
	 * (stale dentries that can never be removed).
	 */
	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
		struct btrfs_path *fixup_path;
		struct btrfs_key di_key;

		fixup_path = btrfs_alloc_path();
		if (!fixup_path)
			return -ENOMEM;

		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
		ret = link_to_fixup_dir(trans, root, fixup_path,
					di_key.objectid);
		btrfs_free_path(fixup_path);
	}

	return ret;
}
/* * directory replay has two parts. There are the standard directory * items in the log copied from the subvolume, and range items * created in the log while the subvolume was logged. * * The range items tell us which parts of the key space the log * is authoritative for. During replay, if a key in the subvolume * directory is in a logged range item, but not actually in the log * that means it was deleted from the directory before the fsync * and should be removed. * * NOTE(review): this extraction is heavily truncated — the initial tree * search that would populate `key` and `found_end` is missing, as is * everything after the btrfs_next_leaf() call (including the `out:` label * that the gotos below reference).  Compare against the canonical * fs/btrfs/tree-log.c before relying on this text.
 */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path,
u64 dirid,
u64 *start_ret, u64 *end_ret)
{ struct btrfs_key key;
u64 found_end; struct btrfs_dir_log_item *item; int ret; int nritems;
/* Report the logged range [key.offset, found_end] if it covers *start_ret. */
if (*start_ret >= key.offset && *start_ret <= found_end) {
ret = 0;
*start_ret = key.offset;
*end_ret = found_end; goto out;
}
ret = 1;
next: /* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
path->slots[0]++; if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path); if (ret) goto out;
}
/* * this looks for a given directory item in the log. If the directory * item is not in the log, the item is removed and the inode it points * to is unlinked. * * NOTE(review): this extraction looks truncated — `location` and `inode` * are used below but never assigned in the visible text (the dir item key * decode and the inode lookup appear to be missing), and "elseif" looks * like a mangled "else if".  Compare against the canonical * fs/btrfs/tree-log.c before relying on this text.
 */ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, struct btrfs_inode *dir, struct btrfs_key *dir_key)
{ struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; struct btrfs_inode *inode = NULL; struct btrfs_key location;
/* * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
 */
ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
eb = path->nodes[0];
slot = path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) goto out;
/* A NULL log means the caller wants unconditional deletion (del_all). */
if (log) { struct btrfs_dir_item *log_di;
log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0); if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di); goto out;
} elseif (log_di) { /* The dentry exists in the log, we have nothing to do. */
ret = 0; goto out;
}
}
/* Not in the log: queue the inode for link-count fixup, then unlink it. */
ret = link_to_fixup_dir(trans, root, path, location.objectid); if (ret) goto out;
/* Extra link keeps the inode alive across the unlink below. */
inc_nlink(&inode->vfs_inode);
ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset * (an index number), so we're done.
 */
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name); if (inode)
iput(&inode->vfs_inode); return ret;
}
staticint replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, const u64 ino)
{ struct btrfs_key search_key; struct btrfs_path *log_path; int i; int nritems; int ret;
log_path = btrfs_alloc_path(); if (!log_path) return -ENOMEM;
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto out;
process_leaf:
nritems = btrfs_header_nritems(path->nodes[0]); for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; struct btrfs_dir_item *di; struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
btrfs_item_key_to_cpu(path->nodes[0], &key, i); if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0; goto out;
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
total_size = btrfs_item_size(path->nodes[0], i);
cur = 0; while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len; char *name;
name = kmalloc(name_len, GFP_NOFS); if (!name) {
ret = -ENOMEM; goto out;
}
read_extent_buffer(path->nodes[0], name,
(unsignedlong)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
name, name_len, 0);
btrfs_release_path(log_path); if (!log_di) { /* Doesn't exist in log tree, so delete it. */
btrfs_release_path(path);
di = btrfs_lookup_xattr(trans, root, path, ino,
name, name_len, -1);
kfree(name); if (IS_ERR(di)) {
ret = PTR_ERR(di); goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
path, di); if (ret) goto out;
btrfs_release_path(path);
search_key = key; goto again;
}
kfree(name); if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di); goto out;
}
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
ret = btrfs_next_leaf(root, path); if (ret > 0)
ret = 0; elseif (ret == 0) goto process_leaf;
out:
btrfs_free_path(log_path);
btrfs_release_path(path); return ret;
}
/* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. It * scans the log to find ranges of keys that log is authoritative for, * and then scans the directory to find items in those ranges that are * not present in the log. * * Anything we don't find in the log is unlinked and removed from the * directory.
*/ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path,
u64 dirid, bool del_all)
{
u64 range_start;
u64 range_end; int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; struct btrfs_inode *dir;
dir = btrfs_iget_logging(dirid, root); /* * It isn't an error if the inode isn't there, that can happen because * we replay the deletes before we copy in the inode item from the log.
*/ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
ret = PTR_ERR(dir); if (ret == -ENOENT)
ret = 0; return ret;
}
range_start = 0;
range_end = 0; while (1) { if (del_all)
range_end = (u64)-1; else {
ret = find_dir_range(log, path, dirid,
&range_start, &range_end); if (ret < 0) goto out; elseif (ret > 0) break;
}
dir_key.offset = range_start; while (1) { int nritems;
ret = btrfs_search_slot(NULL, root, &dir_key, path,
0, 0); if (ret < 0) goto out;
nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path); if (ret == 1) break; elseif (ret < 0) goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.28 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.