/*
 * Magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};
/* * directory trouble cases * * 1) on rename or unlink, if the inode being unlinked isn't in the fsync * log, we must force a full commit before doing an fsync of the directory * where the unlink was done. * ---> record transid of last unlink/rename per directory * * mkdir foo/some_dir * normal commit * rename foo/some_dir foo2/some_dir * mkdir foo/some_dir * fsync foo/some_dir/some_file * * The fsync above will unlink the original some_dir without recording * it in its new location (foo2). After a crash, some_dir will be gone * unless the fsync of some_file forces a full commit * * 2) we must log any new names for any file or dir that is in the fsync * log. ---> check inode while renaming/linking. * * 2a) we must log any new names for any file or dir during rename * when the directory they are being removed from was logged. * ---> check inode and old parent dir during rename * * 2a is actually the more important variant. With the extra logging * a crash might unlink the old name without recreating the new one * * 3) after a crash, we must go through any directories with a link count * of zero and redo the rm -rf * * mkdir f1/foo * normal commit * rm -rf f1/foo * fsync(f1) * * The directory f1 was fully removed from the FS, but fsync was never * called on f1, only its parent dir. After a crash the rm -rf must * be replayed. This must be able to recurse down the entire * directory tree. The inode link count fixup code takes care of the * ugly details.
*/
/*
 * Stages for the tree walking. The first stage (0) is to only pin down the
 * blocks we find, the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents and all
 * the other fun semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
/*
 * Tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
 */
/* Only meant to be called for subvolume roots and not for log roots. */
ASSERT(btrfs_is_fstree(btrfs_root_id(root)));
/* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL * to allocate an inode, which can recurse back into the filesystem and * attempt a transaction commit, resulting in a deadlock.
*/
nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(objectid, root);
memalloc_nofs_restore(nofs_flag);
return inode;
}
/* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people * syncing the tree wait for us to finish
*/ staticint start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; constbool zoned = btrfs_is_zoned(fs_info); int ret = 0; bool created = false;
/* * First check if the log root tree was already created. If not, create * it before locking the root's log_mutex, just to keep lockdep happy.
*/ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, fs_info); if (!ret) {
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
created = true;
}
}
mutex_unlock(&tree_root->log_mutex); if (ret) return ret;
}
mutex_lock(&root->log_mutex);
again: if (root->log_root) { int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
ret = BTRFS_LOG_FORCE_COMMIT; goto out;
}
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
} elseif (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else { /* * This means fs_info->log_root_tree was already created * for some other FS trees. Do the full commit not to mix * nodes from multiple log transactions to do sequential * writing.
*/ if (zoned && !created) {
ret = BTRFS_LOG_FORCE_COMMIT; goto out;
}
ret = btrfs_add_log_tree(trans, root); if (ret) goto out;
/* * returns 0 if there was a log transaction running and we were able * to join, or returns -ENOENT if there were not transactions * in progress
*/ staticint join_running_log_trans(struct btrfs_root *root)
{ constbool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT;
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret;
mutex_lock(&root->log_mutex);
again: if (root->log_root) { int index = (root->log_transid + 1) % 2;
/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	/*
	 * A non-zero log_writers count keeps log syncing from completing
	 * until the matching btrfs_end_log_trans() drops it back down.
	 */
	atomic_inc(&root->log_writers);
}
/*
 * Indicate we're done making changes to the log tree and wake up anyone
 * waiting to do a sync.
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	/* Pairs with the increment done in btrfs_pin_log_trans(). */
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}
/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part.
 */
struct walk_control {
	/*
	 * Should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree.
	 */
	int free;

	/*
	 * Pin only walk, we record which extents on disk belong to the
	 * log trees.
	 */
	int pin;

	/* What stage of the replay code we're currently in (LOG_WALK_*). */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY.
	 */
	bool ignore_cur_inode;

	/* The root we are currently replaying into. */
	struct btrfs_root *replay_dest;

	/* The trans handle for the current replay. */
	struct btrfs_trans_handle *trans;

	/*
	 * The function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it.
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};
/*
 * process_func used to pin down extents, write them or wait on them.
 *
 * Returns 0 on success and a negative errno on failure; on failure the
 * transaction is aborted (or a fs error is recorded when there is no
 * transaction handle).
 */
static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	struct btrfs_trans_handle *trans = wc->trans;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		struct btrfs_tree_parent_check check = {
			.level = level,
			.transid = gen
		};

		ret = btrfs_read_extent_buffer(eb, &check);
		if (ret) {
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	}

	if (!wc->pin)
		return 0;

	ASSERT(trans != NULL);
	ret = btrfs_pin_extent_for_log_replay(trans, eb);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	/* Only leaves (level 0) that are up to date carry logged extents. */
	if (btrfs_buffer_uptodate(eb, gen, 0) && btrfs_header_level(eb) == 0) {
		ret = btrfs_exclude_logged_extents(eb);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	return ret;
}
/* * Item overwrite used by log replay. The given eb, slot and key all refer to * the source data we are copying out. * * The given root is for the tree we are copying into, and path is a scratch * path for use in this function (it should be released on entry and will be * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. * If it is too large, it is truncated. * * If the key isn't in the destination yet, a new item is inserted.
*/ staticint overwrite_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key)
{ int ret;
u32 item_size;
u64 saved_i_size = 0; int save_old_i_size = 0; unsignedlong src_ptr; unsignedlong dst_ptr; struct extent_buffer *dst_eb; int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
/* * This is only used during log replay, so the root is always from a * fs/subvolume tree. In case we ever need to support a log root, then * we'll have to clone the leaf in the path, release the path and use * the leaf before writing into the log tree. See the comments at * copy_items() for more details.
*/
ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
kfree(src_copy); /* * they have the same contents, just return, this saves * us from cowing blocks in the destination tree and doing * extra writes that may not have been done by a previous * sync
*/ if (ret == 0) {
btrfs_release_path(path); return 0;
}
/* * We need to load the old nbytes into the inode so when we * replay the extents we've logged we get the right nbytes.
*/ if (inode_item) { struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
/* * If this is a directory we need to reset the i_size to * 0 so that we can set it up properly when replaying * the rest of the items in this log.
*/
mode = btrfs_inode_mode(eb, item); if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
} elseif (inode_item) { struct btrfs_inode_item *item;
u32 mode;
/* * New inode, set nbytes to 0 so that the nbytes comes out * properly when we replay the extents.
*/
item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, 0);
/* * If this is a directory we need to reset the i_size to 0 so * that we can set it up properly when replaying the rest of * the items in this log.
*/
mode = btrfs_inode_mode(eb, item); if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
insert:
btrfs_release_path(path); /* try to insert the key into the destination tree */
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number * was logged as zero. This is done when the tree logging code * is just logging an inode to make sure it exists after recovery. * * Also, don't overwrite i_size on directories during replay. * log replay inserts and removes directory items based on the * state of the tree found in the subvolume, and i_size is modified * as it goes
*/ if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { struct btrfs_inode_item *src_item; struct btrfs_inode_item *dst_item;
/* * For regular files an ino_size == 0 is used only when * logging that an inode exists, as part of a directory * fsync, and the inode wasn't fsynced before. In this * case don't set the size of the inode in the fs/subvol * tree, otherwise we would be throwing valid data away.
*/ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. * * extents in the log tree have not been allocated out of the extent * tree yet. So, this completes the allocation, taking a reference * as required if the extent already exists or creating a new extent * if it isn't in the extent allocation tree yet. * * The extent is inserted into the file, dropping any existing extents * from the file that overlap the new one.
*/ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key)
{ struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type;
u64 extent_end;
u64 start = key->offset;
u64 nbytes = 0; struct btrfs_file_extent_item *item; struct btrfs_inode *inode = NULL; unsignedlong size; int ret = 0;
/* * We don't add to the inodes nbytes if we are prealloc or a * hole.
*/ if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} elseif (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_ram_bytes(eb, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
btrfs_err(fs_info, "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
found_type, btrfs_root_id(root), key->objectid, key->offset); return -EUCLEAN;
}
inode = btrfs_iget_logging(key->objectid, root); if (IS_ERR(inode)) return PTR_ERR(inode);
/* * first check to see if we already have this extent in the * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent.
*/
ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
/* * we already have a pointer to this exact extent, * we don't have to do anything
*/ if (memcmp_extent_buffer(eb, &existing, (unsignedlong)item, sizeof(existing)) == 0) {
btrfs_release_path(path); goto out;
}
}
btrfs_release_path(path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out;
/* * Manually record dirty extent, as here we did a shallow * file extent item copy and skip normal backref update, * but modifying extent tree all by ourselves. * So need to manually record dirty extent for qgroup, * as the owner of the file extent changed from log tree * (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out;
if (ins.objectid > 0) {
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
/* * is this extent already allocated in the extent * allocation tree? If so, just add a reference
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset); if (ret < 0) { goto out;
} elseif (ret == 0) { struct btrfs_ref ref = {
.action = BTRFS_ADD_DELAYED_REF,
.bytenr = ins.objectid,
.num_bytes = ins.offset,
.owning_root = btrfs_root_id(root),
.ref_root = btrfs_root_id(root),
};
btrfs_init_data_ref(&ref, key->objectid, offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out;
} else { /* * insert the extent pointer in the extent * allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
btrfs_root_id(root),
key->objectid, offset, &ins); if (ret) goto out;
}
btrfs_release_path(path);
ret = btrfs_lookup_csums_list(root->log_root,
csum_start, csum_end - 1,
&ordered_sums, false); if (ret < 0) goto out;
ret = 0; /* * Now delete all existing cums in the csum root that * cover our range. We do this because we can have an * extent that is completely referenced by one file * extent item and partially referenced by another * file extent item (like after using the clone or * extent_same ioctls). In this case if we end up doing * the replay of the one that partially references the * extent first, and we do not do the csum deletion * below, we can get 2 csum items in the csum tree that * overlap each other. For example, imagine our log has * the two following file extent items: * * key (257 EXTENT_DATA 409600) * extent data disk byte 12845056 nr 102400 * extent data offset 20480 nr 20480 ram 102400 * * key (257 EXTENT_DATA 819200) * extent data disk byte 12845056 nr 102400 * extent data offset 0 nr 102400 ram 102400 * * Where the second one fully references the 100K extent * that starts at disk byte 12845056, and the log tree * has a single csum item that covers the entire range * of the extent: * * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 * * After the first file extent item is replayed, the * csum tree gets the following csum item: * * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 * * Which covers the 20K sub-range starting at offset 20K * of our extent. Now when we replay the second file * extent item, if we do not delete existing csum items * that cover any of its blocks, we end up getting two * csum items in our csum tree that overlap each other: * * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 * * Which is a problem, because after this anyone trying * to lookup up for the checksum of any block of our * extent starting at an offset of 40K or higher, will * end up looking at the second csum item only, which * does not contain the checksum for any block starting * at offset 40K or higher of our extent.
*/ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; struct btrfs_root *csum_root;
sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum,
list);
csum_root = btrfs_csum_root(fs_info,
sums->logical); if (!ret)
ret = btrfs_del_csums(trans, csum_root,
sums->logical,
sums->len); if (!ret)
ret = btrfs_csum_file_blocks(trans,
csum_root,
sums);
list_del(&sums->list);
kfree(sums);
} if (ret) goto out;
} else {
btrfs_release_path(path);
}
} elseif (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */
ret = overwrite_item(trans, root, path, eb, slot, key); if (ret) goto out;
}
ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out;
ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* * Whenever we need to check if a name exists or not, we check the * fs/subvolume tree. So after an unlink we must run delayed items, so * that future checks for a name during log replay see that the name * does not exists anymore.
*/ return btrfs_run_delayed_items(trans);
}
/*
 * When cleaning up conflicts between the directory names in the subvolume,
 * directory names in the log and directory names in the inode back
 * references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory item.
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_inode *victim = NULL;
	struct btrfs_key location;
	struct fscrypt_str name;
	int ret;

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret)
		return -ENOMEM;

	btrfs_release_path(path);

	victim = btrfs_iget_logging(location.objectid, root);
	if (IS_ERR(victim)) {
		ret = PTR_ERR(victim);
		victim = NULL;
		goto out;
	}

	/* Record the inode in the fixup dir so its link count is rechecked. */
	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, victim, &name);
out:
	kfree(name.name);
	if (victim)
		iput(&victim->vfs_inode);
	return ret;
}
/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_dir_item *di;
	struct btrfs_key found_key;
	int ret = 0;

	/* First look for a matching dir index item. */
	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}
	if (!di)
		goto out;
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &found_key);
	if (found_key.objectid != objectid)
		goto out;

	btrfs_release_path(path);

	/* Then check the dir item as well; both must point at our inode. */
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}
	if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &found_key);
		if (found_key.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}
/*
 * Helper function to check a log tree for a named back reference in
 * an inode. This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * Inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 *
 * Returns 1 if the name is present in the log, 0 if not, < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret == 1) {
		/* Key not found in the log tree, so the name is not logged. */
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid, name);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0], name);
out:
	btrfs_free_path(path);
	return ret;
}
/* * Check all the names in this back reference to see if they are in the * log. If so, we allow them to stay otherwise they must be unlinked as * a conflict.
*/
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { struct fscrypt_str victim_name; struct btrfs_inode_ref *victim_ref; int ret;
victim_ref = (struct btrfs_inode_ref *)ptr;
ret = read_alloc_one_name(leaf, (victim_ref + 1),
btrfs_inode_ref_name_len(leaf, victim_ref),
&victim_name); if (ret) return ret;
ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); if (ret) {
kfree(victim_name.name); if (ret < 0) return ret;
ptr = (unsignedlong)(victim_ref + 1) + victim_name.len; continue;
}
again: /* Search old style refs */
search_key.objectid = inode_objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) { return ret;
} elseif (ret == 0) { /* * Are we trying to overwrite a back ref for the root directory? * If so, we're done.
*/ if (search_key.objectid == search_key.offset) return 1;
ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
name); if (ret) return ret;
if (index)
*index = btrfs_inode_ref_index(eb, ref);
return 0;
}
/* * Take an inode reference item from the log tree and iterate all names from the * inode reference item in the subvolume tree with the same key (if it exists). * For any name that is not in the inode reference item from the log tree, do a * proper unlink of that name (that is, remove its entry from the inode * reference item and both dir index keys).
*/ staticint unlink_old_inode_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_inode *inode, struct extent_buffer *log_eb, int log_slot, struct btrfs_key *key)
{ int ret; unsignedlong ref_ptr; unsignedlong ref_end; struct extent_buffer *eb;
again:
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret > 0) {
ret = 0; goto out;
} if (ret < 0) goto out;
/* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp * use by this function. (it should be released on return).
*/ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct extent_buffer *eb, int slot, struct btrfs_key *key)
{ struct btrfs_inode *dir = NULL; struct btrfs_inode *inode = NULL; unsignedlong ref_ptr; unsignedlong ref_end; struct fscrypt_str name = { 0 }; int ret; constbool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0; int ref_struct_size;
/* * it is possible that we didn't log all the parent directories * for a given inode. If we don't find the dir, just don't * copy the back ref in. The link count fixup code will take * care of the rest
*/
dir = btrfs_iget_logging(parent_objectid, root); if (IS_ERR(dir)) {
ret = PTR_ERR(dir); if (ret == -ENOENT)
ret = 0;
dir = NULL; goto out;
}
inode = btrfs_iget_logging(inode_objectid, root); if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL; goto out;
}
while (ref_ptr < ref_end) { if (is_extref_item) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid); if (ret) goto out; /* * parent object can change from one array * item to another.
*/ if (!dir) {
dir = btrfs_iget_logging(parent_objectid, root); if (IS_ERR(dir)) {
ret = PTR_ERR(dir);
dir = NULL; /* * A new parent dir may have not been * logged and not exist in the subvolume * tree, see the comment above before * the loop when getting the first * parent dir.
*/ if (ret == -ENOENT) { /* * The next extref may refer to * another parent dir that * exists, so continue.
*/
ret = 0; goto next;
} goto out;
}
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); if (ret) goto out;
}
ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
ref_index, &name); if (ret < 0) { goto out;
} elseif (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name * of the file before we add our new link. Later on, we * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory.
*/
ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name); if (ret) { if (ret == 1)
ret = 0; goto out;
}
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out;
ret = btrfs_update_inode(trans, inode); if (ret) goto out;
} /* Else, ret == 1, we already have a perfect match, we're done. */
/* * Before we overwrite the inode reference item in the subvolume tree * with the item from the log tree, we must unlink all names from the * parent directory that are in the subvolume's tree inode reference * item, otherwise we end up with an inconsistent subvolume tree where * dir index entries exist for a name but there is no inode reference * item with the same name.
*/
ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out;
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
kfree(name.name); if (dir)
iput(&dir->vfs_inode); if (inode)
iput(&inode->vfs_inode); return ret;
}
if (key.offset == 0) break; if (path->slots[0] > 0) {
path->slots[0]--; goto process_slot;
}
key.offset--;
btrfs_release_path(path);
}
btrfs_release_path(path);
return nlink;
}
/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Count names from old-style INODE_REF items. */
	ret = count_inode_refs(inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	/* Add names from extended ref (INODE_EXTREF) items. */
	ret = count_inode_extrefs(inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	/* Sync the in-memory link count and persist it if it changed. */
	if (nlink != inode->vfs_inode.i_nlink) {
		set_nlink(&inode->vfs_inode, nlink);
		ret = btrfs_update_inode(trans, inode);
		if (ret)
			goto out;
	}
	/* For directories, force the next dir index to be recomputed. */
	if (S_ISDIR(inode->vfs_inode.i_mode))
		inode->index_cnt = (u64)-1;

	if (inode->vfs_inode.i_nlink == 0) {
		if (S_ISDIR(inode->vfs_inode.i_mode)) {
			/* An unlinked dir must have its entries removed too. */
			ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
			if (ret)
				goto out;
		}
		/*
		 * Track the zero-link inode as an orphan; -EEXIST means it is
		 * already tracked, which is fine.
		 */
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}
out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Walk all fixup items recorded during log replay and correct the link
 * count of each inode they reference, deleting each fixup item as it is
 * processed.
 *
 * NOTE(review): the body below appears truncated in this extraction — the
 * fixup-key initialization and the enclosing search loop (which the bare
 * `break` statements and the uses of `key` and the undeclared `inode`
 * clearly require) are missing.  Compare against the canonical
 * fs/btrfs/tree-log.c before relying on this text.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path)
{ int ret; struct btrfs_key key;
/* Delete the fixup item we just found so it is processed only once. */
ret = btrfs_del_item(trans, root, path); if (ret) break;
btrfs_release_path(path);
/* key.offset holds the inode number the fixup item refers to. */
inode = btrfs_iget_logging(key.offset, root); if (IS_ERR(inode)) {
ret = PTR_ERR(inode); break;
}
/* Recount backrefs and set the inode's i_nlink accordingly. */
ret = fixup_inode_link_count(trans, inode);
iput(&inode->vfs_inode); if (ret) break;
/* * fixup on a directory may create new entries, * make sure we always look for the highest possible * offset
 */
key.offset = (u64)-1;
}
btrfs_release_path(path); return ret;
}
/*
 * Record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it.
 *
 * Returns 0 on success (including when the fixup item already exists)
 * or a negative errno.
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_inode *inode;
	struct inode *vfs_inode;

	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	vfs_inode = &inode->vfs_inode;
	/*
	 * Fixup items live under a dedicated objectid so they can all be
	 * found (and deleted) in one pass once replay has finished.
	 */
	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		/* Pin the inode with at least one link until the recount. */
		if (!vfs_inode->i_nlink)
			set_nlink(vfs_inode, 1);
		else
			inc_nlink(vfs_inode);
		ret = btrfs_update_inode(trans, inode);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	} else if (ret == -EEXIST) {
		/* Already queued for fixup; nothing more to do. */
		ret = 0;
	}
	iput(vfs_inode);

	return ret;
}
/*
 * When replaying the log for a directory, we only insert names for inodes
 * that actually exist.  This means an fsync on a directory does not
 * implicitly fsync all the new files in it.
 *
 * Returns the result of btrfs_add_link() (0 on success, negative errno
 * otherwise), or a negative errno if either inode lookup fails.
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct btrfs_inode *inode;
	struct btrfs_inode *dir;
	int ret;

	inode = btrfs_iget_logging(location->objectid, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	dir = btrfs_iget_logging(dirid, root);
	if (IS_ERR(dir)) {
		iput(&inode->vfs_inode);
		return PTR_ERR(dir);
	}

	ret = btrfs_add_link(trans, dir, inode, name, 1, index);

	/* Drop both references taken above regardless of the link result. */
	iput(&inode->vfs_inode);
	iput(&dir->vfs_inode);
	return ret;
}
/* * take a single entry in a log directory item and replay it into * the subvolume. * * if a conflicting item exists in the subdirectory already, * the inode it points to is unlinked and put into the link count * fix up tree. * * If a name from the log points to a file or directory that does * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. * * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a * non-existing inode) and 1 if the name was replayed. * * NOTE(review): this extraction looks truncated — dir_dst_matches and * index_dst_matches are declared and tested below but never assigned, and * the conflicting-entry lookups (dir_dst_di / index_dst_di) are missing. * The "elseif" tokens also look like a mangled "else if".  Compare against * the canonical fs/btrfs/tree-log.c before relying on this text.
 */ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb, struct btrfs_dir_item *di, struct btrfs_key *key)
{ struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; struct btrfs_inode *dir;
u8 log_flags; bool exists; int ret; bool update_size = true; bool name_added = false;
/* The directory containing the entry being replayed. */
dir = btrfs_iget_logging(key->objectid, root); if (IS_ERR(dir)) return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) goto out;
log_flags = btrfs_dir_flags(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
/* Does the inode the log entry points to exist in the subvolume? */
ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path); if (ret < 0) goto out;
exists = (ret == 0);
ret = 0;
/* Both the dir item and dir index already match the log: nothing to do. */
if (dir_dst_matches && index_dst_matches) {
ret = 0;
update_size = false; goto out;
}
/* * Check if the inode reference exists in the log for the given name, * inode and parent inode
 */
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = key->objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { goto out;
} elseif (ret) { /* The dentry will be added later. */
ret = 0;
update_size = false; goto out;
}
/* Same check, but for extended refs (hashed parent + name). */
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len);
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out;
} elseif (ret) { /* The dentry will be added later. */
ret = 0;
update_size = false; goto out;
}
btrfs_release_path(path);
/* Insert the name now; -ENOENT and -EEXIST are tolerated here. */
ret = insert_one_name(trans, root, key->objectid, key->offset,
&name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; if (!ret)
name_added = true;
update_size = false;
ret = 0;
out: if (!ret && update_size) {
/* Each name accounts for twice its length in the dir's i_size. */
btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
iput(&dir->vfs_inode); if (!ret && name_added)
ret = 1; return ret;
}
/*
 * Replay one dir item from a BTRFS_DIR_INDEX_KEY key.
 *
 * Returns < 0 on error, otherwise the result of replay_one_name() folded
 * down to 0 (after any needed link-count fixup bookkeeping).
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in a transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to.  Otherwise something like the following would leave a
	 * directory pointing to an inode whose link count does not account
	 * for this dir entry:
	 *
	 *   mkdir testdir ; touch testdir/foo ; touch testdir/bar ; sync
	 *   ln testdir/bar testdir/bar_link
	 *   ln testdir/foo testdir/foo_link
	 *   xfs_io -c "fsync" testdir/bar
	 *   <power failure> ; mount, log replay happens
	 *
	 * File foo would remain with a link count of 1 while two entries in
	 * testdir point to it, making testdir impossible to ever delete
	 * (stale dentries that can never be removed).
	 */
	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
		struct btrfs_path *fixup_path;
		struct btrfs_key di_key;

		fixup_path = btrfs_alloc_path();
		if (!fixup_path)
			return -ENOMEM;

		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
		ret = link_to_fixup_dir(trans, root, fixup_path,
					di_key.objectid);
		btrfs_free_path(fixup_path);
	}

	return ret;
}
/* * directory replay has two parts. There are the standard directory * items in the log copied from the subvolume, and range items * created in the log while the subvolume was logged. * * The range items tell us which parts of the key space the log * is authoritative for. During replay, if a key in the subvolume * directory is in a logged range item, but not actually in the log * that means it was deleted from the directory before the fsync * and should be removed. * * NOTE(review): this extraction is heavily truncated — the initial tree * search that would populate `key` and `found_end` is missing, as is * everything after the btrfs_next_leaf() call (including the `out:` label * that the gotos below reference).  Compare against the canonical * fs/btrfs/tree-log.c before relying on this text.
 */ static noinline int find_dir_range(struct btrfs_root *root, struct btrfs_path *path,
u64 dirid,
u64 *start_ret, u64 *end_ret)
{ struct btrfs_key key;
u64 found_end; struct btrfs_dir_log_item *item; int ret; int nritems;
/* Report the logged range [key.offset, found_end] if it covers *start_ret. */
if (*start_ret >= key.offset && *start_ret <= found_end) {
ret = 0;
*start_ret = key.offset;
*end_ret = found_end; goto out;
}
ret = 1;
next: /* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
path->slots[0]++; if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path); if (ret) goto out;
}
/* * this looks for a given directory item in the log. If the directory * item is not in the log, the item is removed and the inode it points * to is unlinked. * * NOTE(review): this extraction looks truncated — `location` and `inode` * are used below but never assigned in the visible text (the dir item key * decode and the inode lookup appear to be missing), and "elseif" looks * like a mangled "else if".  Compare against the canonical * fs/btrfs/tree-log.c before relying on this text.
 */ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, struct btrfs_inode *dir, struct btrfs_key *dir_key)
{ struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; struct btrfs_inode *inode = NULL; struct btrfs_key location;
/* * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
 */
ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
eb = path->nodes[0];
slot = path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) goto out;
/* A NULL log means the caller wants unconditional deletion (del_all). */
if (log) { struct btrfs_dir_item *log_di;
log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0); if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di); goto out;
} elseif (log_di) { /* The dentry exists in the log, we have nothing to do. */
ret = 0; goto out;
}
}
/* Not in the log: queue the inode for link-count fixup, then unlink it. */
ret = link_to_fixup_dir(trans, root, path, location.objectid); if (ret) goto out;
/* Extra link keeps the inode alive across the unlink below. */
inc_nlink(&inode->vfs_inode);
ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset * (an index number), so we're done.
 */
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name); if (inode)
iput(&inode->vfs_inode); return ret;
}
staticint replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, const u64 ino)
{ struct btrfs_key search_key; struct btrfs_path *log_path; int i; int nritems; int ret;
log_path = btrfs_alloc_path(); if (!log_path) return -ENOMEM;
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto out;
process_leaf:
nritems = btrfs_header_nritems(path->nodes[0]); for (i = path->slots[0]; i < nritems; i++) { struct btrfs_key key; struct btrfs_dir_item *di; struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
btrfs_item_key_to_cpu(path->nodes[0], &key, i); if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0; goto out;
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
total_size = btrfs_item_size(path->nodes[0], i);
cur = 0; while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len; char *name;
name = kmalloc(name_len, GFP_NOFS); if (!name) {
ret = -ENOMEM; goto out;
}
read_extent_buffer(path->nodes[0], name,
(unsignedlong)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
name, name_len, 0);
btrfs_release_path(log_path); if (!log_di) { /* Doesn't exist in log tree, so delete it. */
btrfs_release_path(path);
di = btrfs_lookup_xattr(trans, root, path, ino,
name, name_len, -1);
kfree(name); if (IS_ERR(di)) {
ret = PTR_ERR(di); goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
path, di); if (ret) goto out;
btrfs_release_path(path);
search_key = key; goto again;
}
kfree(name); if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di); goto out;
}
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
ret = btrfs_next_leaf(root, path); if (ret > 0)
ret = 0; elseif (ret == 0) goto process_leaf;
out:
btrfs_free_path(log_path);
btrfs_release_path(path); return ret;
}
/* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. It * scans the log to find ranges of keys that log is authoritative for, * and then scans the directory to find items in those ranges that are * not present in the log. * * Anything we don't find in the log is unlinked and removed from the * directory.
*/ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path,
u64 dirid, bool del_all)
{
u64 range_start;
u64 range_end; int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; struct btrfs_inode *dir;
dir = btrfs_iget_logging(dirid, root); /* * It isn't an error if the inode isn't there, that can happen because * we replay the deletes before we copy in the inode item from the log.
*/ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
ret = PTR_ERR(dir); if (ret == -ENOENT)
ret = 0; return ret;
}
range_start = 0;
range_end = 0; while (1) { if (del_all)
range_end = (u64)-1; else {
ret = find_dir_range(log, path, dirid,
&range_start, &range_end); if (ret < 0) goto out; elseif (ret > 0) break;
}
dir_key.offset = range_start; while (1) { int nritems;
ret = btrfs_search_slot(NULL, root, &dir_key, path,
0, 0); if (ret < 0) goto out;
nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path); if (ret == 1) break; elseif (ret < 0) goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.28 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.