staticint split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level); staticint split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, conststruct btrfs_key *ins_key, struct btrfs_path *path, int data_size, int extend); staticint push_node_left(struct btrfs_trans_handle *trans, struct extent_buffer *dst, struct extent_buffer *src, int empty); staticint balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack.
*/ staticunsignedint leaf_data_end(conststruct extent_buffer *leaf)
{
u32 nr = btrfs_header_nritems(leaf);
if (nr == 0) return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); return btrfs_item_offset(leaf, nr - 1);
}
/* * Move data in a @leaf (using memmove, safe for overlapping ranges). * * @leaf: leaf that we're doing a memmove on * @dst_offset: item data offset we're moving to * @src_offset: item data offset were' moving from * @len: length of the data we're moving * * Wrapper around memmove_extent_buffer() that takes into account the header on * the leaf. The btrfs_item offset's start directly after the header, so we * have to adjust any offsets to account for the header in the leaf. This * handles that math to simplify the callers.
*/ staticinlinevoid memmove_leaf_data(conststruct extent_buffer *leaf, unsignedlong dst_offset, unsignedlong src_offset, unsignedlong len)
{
memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}
/* * Copy item data from @src into @dst at the given @offset. * * @dst: destination leaf that we're copying into * @src: source leaf that we're copying from * @dst_offset: item data offset we're copying to * @src_offset: item data offset were' copying from * @len: length of the data we're copying * * Wrapper around copy_extent_buffer() that takes into account the header on * the leaf. The btrfs_item offset's start directly after the header, so we * have to adjust any offsets to account for the header in the leaf. This * handles that math to simplify the callers.
*/ staticinlinevoid copy_leaf_data(conststruct extent_buffer *dst, conststruct extent_buffer *src, unsignedlong dst_offset, unsignedlong src_offset, unsignedlong len)
{
copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
btrfs_item_nr_offset(src, 0) + src_offset, len);
}
/* * Move items in a @leaf (using memmove). * * @dst: destination leaf for the items * @dst_item: the item nr we're copying into * @src_item: the item nr we're copying from * @nr_items: the number of items to copy * * Wrapper around memmove_extent_buffer() that does the math to get the * appropriate offsets into the leaf from the item numbers.
*/ staticinlinevoid memmove_leaf_items(conststruct extent_buffer *leaf, int dst_item, int src_item, int nr_items)
{
memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
btrfs_item_nr_offset(leaf, src_item),
nr_items * sizeof(struct btrfs_item));
}
/* * Copy items from @src into @dst at the given @offset. * * @dst: destination leaf for the items * @src: source leaf for the items * @dst_item: the item nr we're copying into * @src_item: the item nr we're copying from * @nr_items: the number of items to copy * * Wrapper around copy_extent_buffer() that does the math to get the * appropriate offsets into the leaf from the item numbers.
*/ staticinlinevoid copy_leaf_items(conststruct extent_buffer *dst, conststruct extent_buffer *src, int dst_item, int src_item, int nr_items)
{
copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
btrfs_item_nr_offset(src, src_item),
nr_items * sizeof(struct btrfs_item));
}
/* this also releases the path */ void btrfs_free_path(struct btrfs_path *p)
{ if (!p) return;
btrfs_release_path(p);
kmem_cache_free(btrfs_path_cachep, p);
}
/* * path release drops references on the extent buffers in the path * and it drops any locks held by this path * * It is safe to call this on paths that no locks or extent buffers held.
*/
noinline void btrfs_release_path(struct btrfs_path *p)
{ int i;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
p->slots[i] = 0; if (!p->nodes[i]) continue; if (p->locks[i]) {
btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
p->locks[i] = 0;
}
free_extent_buffer(p->nodes[i]);
p->nodes[i] = NULL;
}
}
/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the * looping required. * * The extent buffer returned by this has a reference taken, so * it won't disappear. It may stop being the root of the tree * at any time because there are no locks held.
*/ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{ struct extent_buffer *eb;
while (1) {
rcu_read_lock();
eb = rcu_dereference(root->node);
/* * RCU really hurts here, we could free up the root node because * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again.
*/ if (refcount_inc_not_zero(&eb->refs)) {
rcu_read_unlock(); break;
}
rcu_read_unlock();
synchronize_rcu();
} return eb;
}
/* * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), * just get put onto a simple dirty list. Transaction walks this list to make * sure they get properly updated on disk.
*/ staticvoid add_root_to_dirty_list(struct btrfs_root *root)
{ struct btrfs_fs_info *fs_info = root->fs_info;
if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
!test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) return;
spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots); else
list_move(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
}
spin_unlock(&fs_info->trans_lock);
}
/* * used by snapshot creation to make a copy of a root for a tree with * a given objectid. The buffer with the new root node is returned in * cow_ret, and this func returns zero on success or a negative error code.
*/ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cow; int ret = 0; int level; struct btrfs_disk_key disk_key;
u64 reloc_src_root = 0;
/* * check if the tree block can be shared by multiple trees
*/ bool btrfs_block_can_be_shared(conststruct btrfs_trans_handle *trans, conststruct btrfs_root *root, conststruct extent_buffer *buf)
{ const u64 buf_gen = btrfs_header_generation(buf);
/* * Tree blocks not in shareable trees and tree roots are never shared. * If a block was allocated after the last snapshot and the block was * not allocated by tree relocation, we know the block is not shared.
*/
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) returnfalse;
if (buf == root->node) returnfalse;
if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) returnfalse;
if (buf != root->commit_root) returntrue;
/* * An extent buffer that used to be the commit root may still be shared * because the tree height may have increased and it became a child of a * higher level root. This can happen when snapshotting a subvolume * created in the current transaction.
*/ if (buf_gen == trans->transid) returntrue;
/* * Backrefs update rules: * * Always use full backrefs for extent pointers in tree block * allocated by tree relocation. * * If a shared tree block is no longer referenced by its owner * tree (btrfs_header_owner(buf) == root->root_key.objectid), * use full backrefs for extent pointers in tree block. * * If a tree block is been relocating * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), * use full backrefs for extent pointers in tree block. * The reason for this is some operations (such as drop tree) * are only allowed for blocks use full backrefs.
*/
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
&refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) {
btrfs_crit(fs_info, "found 0 references for tree block at bytenr %llu level %d root %llu",
buf->start, btrfs_header_level(buf),
btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret); return ret;
}
} else {
refs = 1; if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else
flags = 0;
}
owner = btrfs_header_owner(buf); if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
btrfs_crit(fs_info, "found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
buf->start, btrfs_header_level(buf),
btrfs_root_id(root), refs, flags);
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret); return ret;
}
if (refs > 1) { if ((owner == btrfs_root_id(root) ||
btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret;
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0); if (ret) return ret;
ret = btrfs_inc_ref(trans, root, cow, 1); if (ret) return ret;
}
ret = btrfs_set_disk_extent_flags(trans, buf,
BTRFS_BLOCK_FLAG_FULL_BACKREF); if (ret) return ret;
} else {
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1); else
ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret;
}
} else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1); else
ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret;
ret = btrfs_dec_ref(trans, root, buf, 1); if (ret) return ret;
}
btrfs_clear_buffer_dirty(trans, buf);
*last_ref = 1;
} return 0;
}
/* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked * dirty and returned locked. If you modify the block it needs to be marked * dirty again. * * search_start -- an allocation hint for the new block * * empty_size -- a hint that you plan on doing more cow. This is the size in * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator.
*/ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret,
u64 search_start, u64 empty_size, enum btrfs_lock_nesting nest)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level, ret; int last_ref = 0; int unlock_orig = 0;
u64 parent_start = 0;
u64 reloc_src_root = 0;
/* Ensure we can see the FORCE_COW bit */
smp_mb__before_atomic();
/* * We do not need to cow a block if * 1) this block is not created or changed in this transaction; * 2) this block does not belong to TREE_RELOC tree; * 3) the root is not forced COW. * * What is forced COW: * when we create snapshot during committing the transaction, * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency.
*/ if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
!test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1;
}
/* * COWs a single block, see btrfs_force_cow_block() for the real work. * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet
*/ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest)
{ struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
btrfs_abort_transaction(trans, -EUCLEAN);
btrfs_crit(fs_info, "attempt to COW block %llu on root %llu that is being deleted",
buf->start, btrfs_root_id(root)); return -EUCLEAN;
}
/* * COWing must happen through a running transaction, which always * matches the current fs generation (it's a transaction with a state * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs * into error state to prevent the commit of any transaction.
*/ if (unlikely(trans->transaction != fs_info->running_transaction ||
trans->transid != fs_info->generation)) {
btrfs_abort_transaction(trans, -EUCLEAN);
btrfs_crit(fs_info, "unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
buf->start, btrfs_root_id(root), trans->transid,
fs_info->running_transaction->transid,
fs_info->generation); return -EUCLEAN;
}
/* * Before CoWing this block for later modification, check if it's * the subtree root and do the delayed subtree trace if needed. * * Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/* * same as comp_keys only with two btrfs_key's
*/ int __pure btrfs_comp_cpu_keys(conststruct btrfs_key *k1, conststruct btrfs_key *k2)
{ if (k1->objectid > k2->objectid) return 1; if (k1->objectid < k2->objectid) return -1; if (k1->type > k2->type) return 1; if (k1->type < k2->type) return -1; if (k1->offset > k2->offset) return 1; if (k1->offset < k2->offset) return -1; return 0;
}
/* * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. * Use a value of 0 to search over the whole extent buffer. Works for both * leaves and nodes. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise * it points to the slot where you would insert the key. * * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer.
*/ int btrfs_bin_search(conststruct extent_buffer *eb, int first_slot, conststruct btrfs_key *key, int *slot)
{ unsignedlong p; int item_size; /* * Use unsigned types for the low and high slots, so that we get a more * efficient division in the search loop below.
*/
u32 low = first_slot;
u32 high = btrfs_header_nritems(eb); int ret; constint key_size = sizeof(struct btrfs_disk_key);
/* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked).
*/ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot)
{ int level = btrfs_header_level(parent); struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb;
if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
&check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb); return ERR_PTR(-EIO);
}
return eb;
}
/* * node level balancing, used to make sure nodes are in proper order for * item deletion. We balance from the top down, so we have to make sure * that a deletion won't leave an node completely empty later on.
*/ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; struct extent_buffer *parent = NULL; int ret = 0; int wret; int pslot; int orig_slot = path->slots[level];
u64 orig_ptr;
/* * deal with the case where there is only one pointer in the root * by promoting the node below to a root
*/ if (!parent) { struct extent_buffer *child;
if (btrfs_header_nritems(mid) != 1) return 0;
/* promote the child to a root */
child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) {
ret = PTR_ERR(child); goto out;
}
btrfs_tree_lock(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
BTRFS_NESTING_COW); if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child); goto out;
}
ret = btrfs_tree_mod_log_insert_root(root->node, child, true); if (ret < 0) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
btrfs_abort_transaction(trans, ret); goto out;
}
rcu_assign_pointer(root->node, child);
if (pslot + 1 < btrfs_header_nritems(parent)) {
right = btrfs_read_node_slot(parent, pslot + 1); if (IS_ERR(right)) {
ret = PTR_ERR(right);
right = NULL; goto out;
}
/* first, try to make some room in the middle buffer */ if (left) {
orig_slot += btrfs_header_nritems(left);
wret = push_node_left(trans, left, mid, 1); if (wret < 0)
ret = wret;
}
/* * then try to empty the right most buffer into the middle
*/ if (right) {
wret = push_node_left(trans, mid, right, 1); if (wret < 0 && wret != -ENOSPC)
ret = wret; if (btrfs_header_nritems(right) == 0) {
btrfs_clear_buffer_dirty(trans, right);
btrfs_tree_unlock(right);
ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); if (ret < 0) {
free_extent_buffer_stale(right);
right = NULL; goto out;
}
root_sub_used_bytes(root);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
right, 0, 1);
free_extent_buffer_stale(right);
right = NULL; if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out;
}
} else { struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out;
}
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(trans, parent);
}
} if (btrfs_header_nritems(mid) == 1) { /* * we're not allowed to leave a node with one item in the * tree during a delete. A deletion from lower in the tree * could try to delete the only pointer in this node. * So, pull some keys from the left. * There has to be a left pointer at this point because * otherwise we would have pulled some pointers from the * right
*/ if (unlikely(!left)) {
btrfs_crit(fs_info, "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
parent->start, btrfs_header_level(parent),
mid->start, btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret); goto out;
}
wret = balance_node_right(trans, mid, left); if (wret < 0) {
ret = wret; goto out;
} if (wret == 1) {
wret = push_node_left(trans, left, mid, 1); if (wret < 0)
ret = wret;
}
BUG_ON(wret == 1);
} if (btrfs_header_nritems(mid) == 0) {
btrfs_clear_buffer_dirty(trans, mid);
btrfs_tree_unlock(mid);
ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); if (ret < 0) {
free_extent_buffer_stale(mid);
mid = NULL; goto out;
}
root_sub_used_bytes(root);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL; if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out;
}
} else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out;
}
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(trans, parent);
}
/* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) {
refcount_inc(&left->refs); /* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
path->slots[level] = orig_slot; if (mid) {
btrfs_tree_unlock(mid);
free_extent_buffer(mid);
}
} else {
orig_slot -= btrfs_header_nritems(left);
path->slots[level] = orig_slot;
}
} /* double check we haven't messed things up */ if (orig_ptr !=
btrfs_node_blockptr(path->nodes[level], path->slots[level]))
BUG();
out: if (right) {
btrfs_tree_unlock(right);
free_extent_buffer(right);
} if (left) { if (path->nodes[level] != left)
btrfs_tree_unlock(left);
free_extent_buffer(left);
} return ret;
}
/* Node balancing for insertion. Here we only split or push nodes around * when they are completely full. This is also done top down, so we * have to be pessimistic.
*/ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; struct extent_buffer *parent = NULL; int ret = 0; int wret; int pslot; int orig_slot = path->slots[level];
/* * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them.
*/ staticvoid reada_for_search(struct btrfs_fs_info *fs_info, conststruct btrfs_path *path, int level, int slot, u64 objectid)
{ struct extent_buffer *node; struct btrfs_disk_key disk_key;
u32 nritems;
u64 search;
u64 target;
u64 nread = 0;
u64 nread_max;
u32 nr;
u32 blocksize;
u32 nscan = 0;
if (level != 1 && path->reada != READA_FORWARD_ALWAYS) return;
if (!path->nodes[level]) return;
node = path->nodes[level];
/* * Since the time between visiting leaves is much shorter than the time * between visiting nodes, limit read ahead of nodes to 1, to avoid too * much IO at once (possibly random).
*/ if (path->reada == READA_FORWARD_ALWAYS) { if (level > 1)
nread_max = node->fs_info->nodesize; else
nread_max = SZ_128K;
} else {
nread_max = SZ_64K;
}
if (slot > 0)
btrfs_readahead_node_child(parent, slot - 1); if (slot + 1 < nritems)
btrfs_readahead_node_child(parent, slot + 1);
}
/* * when we walk down the tree, it is usually safe to unlock the higher layers * in the tree. The exceptions are when our path goes through slot 0, because * operations on the tree might require changing key pointers higher up in the * tree. * * callers might also have set path->keep_locks, which tells this code to keep * the lock if the path points to the last slot in the block. This is part of * walking through the tree, and selecting the next slot in the higher block. * * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so * if lowest_unlock is 1, level 0 won't be unlocked
*/ static noinline void unlock_up(struct btrfs_path *path, int level, int lowest_unlock, int min_write_lock_level, int *write_lock_level)
{ int i; int skip_level = level; bool check_skip = true;
for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; if (!path->locks[i]) break;
if (check_skip) { if (path->slots[i] == 0) {
skip_level = i + 1; continue;
}
if (i >= lowest_unlock && i > skip_level) {
check_skip = false;
btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0; if (write_lock_level &&
i > min_write_lock_level &&
i <= *write_lock_level) {
*write_lock_level = i - 1;
}
}
}
}
/* * Helper function for btrfs_search_slot() and other functions that do a search * on a btree. The goal is to find a tree block in the cache (the radix tree at * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read * its pages from disk. * * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the * whole btree search, starting again from the current root node.
*/ staticint
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int slot, conststruct btrfs_key *key)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 };
u64 blocknr; struct extent_buffer *tmp = NULL; int ret = 0; int ret2; int parent_level; bool read_tmp = false; bool tmp_locked = false; bool path_released = false;
/* * If we need to read an extent buffer from disk and we are holding locks * on upper level nodes, we unlock all the upper nodes before reading the * extent buffer, and then return -EAGAIN to the caller as it needs to * restart the search. We don't release the lock on the current level * because we need to walk this node to figure out which blocks to read.
*/
tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS)
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks).
*/ if (btrfs_verify_level_key(tmp, &check)) {
ret = -EUCLEAN; goto out;
}
*eb_ret = tmp;
tmp = NULL;
ret = 0; goto out;
}
/* Now we're allowed to do a blocking uptodate check. */
ret2 = btrfs_read_extent_buffer(tmp, &check); if (ret2) {
ret = ret2; goto out;
}
/* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs.
*/ if (!extent_buffer_uptodate(tmp)) {
ret = -EIO; goto out;
}
if (ret == 0) {
ASSERT(!tmp_locked);
*eb_ret = tmp;
tmp = NULL;
}
out: if (tmp) { if (tmp_locked)
btrfs_tree_read_unlock(tmp); if (read_tmp && ret && ret != -EAGAIN)
free_extent_buffer_stale(tmp); else
free_extent_buffer(tmp);
} if (ret && !path_released)
btrfs_release_path(p);
return ret;
}
/* * helper function for btrfs_search_slot. This does all of the checks * for node-level blocks and does any balancing required based on * the ins_len. * * If no extra work was required, zero is returned. If we had to * drop the path, -EAGAIN is returned and btrfs_search_slot must * start over
*/ staticint
setup_nodes_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer *b, int level, int ins_len, int *write_lock_level)
{ struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0;
staticstruct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level)
{ struct extent_buffer *b; int root_lock = 0; int level = 0;
if (p->search_commit_root) {
b = root->commit_root;
refcount_inc(&b->refs);
level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1.
*/
ASSERT(p->skip_locking == 1);
goto out;
}
if (p->skip_locking) {
b = btrfs_root_node(root);
level = btrfs_header_level(b); goto out;
}
/* We try very hard to do read locks on the root */
root_lock = BTRFS_READ_LOCK;
/* * If the level is set to maximum, we can skip trying to get the read * lock.
*/ if (write_lock_level < BTRFS_MAX_LEVEL) { /* * We don't know the level of the root node until we actually * have it read locked
*/ if (p->nowait) {
b = btrfs_try_read_lock_root_node(root); if (IS_ERR(b)) return b;
} else {
b = btrfs_read_lock_root_node(root);
}
level = btrfs_header_level(b); if (level > write_lock_level) goto out;
/* Whoops, must trade for write lock */
btrfs_tree_read_unlock(b);
free_extent_buffer(b);
}
b = btrfs_lock_root_node(root);
root_lock = BTRFS_WRITE_LOCK;
/* The level might have changed, check again */
level = btrfs_header_level(b);
out: /* * The root may have failed to write out at some point, and thus is no * longer valid, return an error in this case.
*/ if (!extent_buffer_uptodate(b)) { if (root_lock)
btrfs_tree_unlock_rw(b, root_lock);
free_extent_buffer(b); return ERR_PTR(-EIO);
}
p->nodes[level] = b; if (!p->skip_locking)
p->locks[level] = root_lock; /* * Callers are responsible for dropping b's references.
*/ return b;
}
/* * Replace the extent buffer at the lowest level of the path with a cloned * version. The purpose is to be able to use it safely, after releasing the * commit root semaphore, even if relocation is happening in parallel, the * transaction used for relocation is committed and the extent buffer is * reallocated in the next transaction. * * This is used in a context where the caller does not prevent transaction * commits from happening, either by holding a transaction handle or holding * some lock, while it's doing searches through a commit root. * At the moment it's only used for send operations.
*/ staticint finish_need_commit_sem_search(struct btrfs_path *path)
{ constint i = path->lowest_level; constint slot = path->slots[i]; struct extent_buffer *lowest = path->nodes[i]; struct extent_buffer *clone;
staticinlineint search_for_key_slot(conststruct extent_buffer *eb, int search_low_slot, conststruct btrfs_key *key, int prev_cmp, int *slot)
{ /* * If a previous call to btrfs_bin_search() on a parent node returned an * exact match (prev_cmp == 0), we can safely assume the target key will * always be at slot 0 on lower levels, since each key pointer * (struct btrfs_key_ptr) refers to the lowest key accessible from the * subtree it points to. Thus we can skip searching lower levels.
*/ if (prev_cmp == 0) {
*slot = 0; return 0;
}
staticint search_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, conststruct btrfs_key *key, struct btrfs_path *path, int ins_len, int prev_cmp)
{ struct extent_buffer *leaf = path->nodes[0]; int leaf_free_space = -1; int search_low_slot = 0; int ret; bool do_bin_search = true;
/* * If we are doing an insertion, the leaf has enough free space and the * destination slot for the key is not slot 0, then we can unlock our * write lock on the parent, and any other upper nodes, before doing the * binary search on the leaf (with search_for_key_slot()), allowing other * tasks to lock the parent and any other upper nodes.
*/ if (ins_len > 0) { /* * Cache the leaf free space, since we will need it later and it * will not change until then.
*/
leaf_free_space = btrfs_leaf_free_space(leaf);
/* * !path->locks[1] means we have a single node tree, the leaf is * the root of the tree.
*/ if (path->locks[1] && leaf_free_space >= ins_len) { struct btrfs_disk_key first_key;
/* * Doing the extra comparison with the first key is cheap, * taking into account that the first key is very likely * already in a cache line because it immediately follows * the extent buffer's header and we have recently accessed * the header's level field.
*/
ret = btrfs_comp_keys(&first_key, key); if (ret < 0) { /* * The first key is smaller than the key we want * to insert, so we are safe to unlock all upper * nodes and we have to do the binary search. * * We do use btrfs_unlock_up_safe() and not * unlock_up() because the later does not unlock * nodes with a slot of 0 - we can safely unlock * any node even if its slot is 0 since in this * case the key does not end up at slot 0 of the * leaf and there's no need to split the leaf.
*/
btrfs_unlock_up_safe(path, 1);
search_low_slot = 1;
} else { /* * The first key is >= then the key we want to * insert, so we can skip the binary search as * the target key will be at slot 0. * * We can not unlock upper nodes when the key is * less than the first key, because we will need * to update the key at slot 0 of the parent node * and possibly of other upper nodes too. * If the key matches the first key, then we can * unlock all the upper nodes, using * btrfs_unlock_up_safe() instead of unlock_up() * as stated above.
*/ if (ret == 0)
btrfs_unlock_up_safe(path, 1); /* * ret is already 0 or 1, matching the result of * a btrfs_bin_search() call, so there is no need * to adjust it.
*/
do_bin_search = false;
path->slots[0] = 0;
}
}
}
if (do_bin_search) {
ret = search_for_key_slot(leaf, search_low_slot, key,
prev_cmp, &path->slots[0]); if (ret < 0) return ret;
}
if (ins_len > 0) { /* * Item key already exists. In this case, if we are allowed to * insert the item (for example, in dir_item case, item key * collision is allowed), it will be merged with the original * item. Only the item size grows, no new btrfs item will be * added. If search_for_extension is not set, ins_len already * accounts the size btrfs_item, deduct it here so leaf space * check will be correct.
*/ if (ret == 0 && !path->search_for_extension) {
ASSERT(ins_len >= sizeof(struct btrfs_item));
ins_len -= sizeof(struct btrfs_item);
}
ASSERT(leaf_free_space >= 0);
if (leaf_free_space < ins_len) { int ret2;
ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
ASSERT(ret2 <= 0); if (WARN_ON(ret2 > 0))
ret2 = -EUCLEAN; if (ret2)
ret = ret2;
}
}
return ret;
}
/* * Look for a key in a tree and perform necessary modifications to preserve * tree invariants. * * @trans: Handle of transaction, used when modifying the tree * @p: Holds all btree nodes along the search path * @root: The root node of the tree * @key: The key we are looking for * @ins_len: Indicates purpose of search: * >0 for inserts it's size of item inserted (*) * <0 for deletions * 0 for plain searches, not modifying the tree * * (*) If size of item inserted doesn't include * sizeof(struct btrfs_item), then p->search_for_extension must * be set. * @cow: boolean should CoW operations be performed. Must always be 1 * when modifying the tree. * * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) * * If @key is found, 0 is returned and you can find the item in the leaf level * of the path (level 0) * * If @key isn't found, 1 is returned and the leaf level of the path (level 0) * points to the slot where it should be inserted * * If an error is encountered while searching the tree a negative error number * is returned
*/ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, conststruct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow)
{ struct btrfs_fs_info *fs_info; struct extent_buffer *b; int slot; int ret; int level; int lowest_unlock = 1; /* everything at write_lock_level or lower must be write locked */ int write_lock_level = 0;
u8 lowest_level = 0; int min_write_lock_level; int prev_cmp;
/* * For now only allow nowait for read only operations. There's no * strict reason why we can't, we just only need it for reads so it's * only implemented for reads.
*/
ASSERT(!p->nowait || !cow);
if (ins_len < 0) {
lowest_unlock = 2;
/* when we are removing items, we might have to go up to level * two as we update tree pointers Make sure we keep write * for those levels as well
*/
write_lock_level = 2;
} elseif (ins_len > 0) { /* * for inserting items, make sure we have a write lock on * level 1 so we can update keys
*/
write_lock_level = 1;
}
if (!cow)
write_lock_level = -1;
if (cow && (p->keep_locks || p->lowest_level))
write_lock_level = BTRFS_MAX_LEVEL;
min_write_lock_level = write_lock_level;
if (p->need_commit_sem) {
ASSERT(p->search_commit_root); if (p->nowait) { if (!down_read_trylock(&fs_info->commit_root_sem)) return -EAGAIN;
} else {
down_read(&fs_info->commit_root_sem);
}
}
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level); if (IS_ERR(b)) {
ret = PTR_ERR(b); goto done;
}
/* * if we don't really need to cow this block * then we don't want to set the path blocking, * so we test it here
*/ if (!should_cow_block(trans, root, b)) goto cow_done;
/* * must have write locks on this node and the * parent
*/ if (level > write_lock_level ||
(level + 1 > write_lock_level &&
level + 1 < BTRFS_MAX_LEVEL &&
p->nodes[level + 1])) {
write_lock_level = level + 1;
btrfs_release_path(p); goto again;
}
/* * we have a lock on b and as long as we aren't changing * the tree, there is no way to for the items in b to change. * It is safe to drop the lock on our parent before we * go through the expensive btree search on b. * * If we're inserting or deleting (ins_len != 0), then we might * be changing slot zero, which may require changing the parent. * So, we can't drop the lock until after we know which slot * we're operating on.
*/ if (!ins_len && !p->keep_locks) { int u = level + 1;
if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
p->locks[u] = 0;
}
}
if (level == 0) { if (ins_len > 0)
ASSERT(write_lock_level >= 1);
ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL); goto done;
}
ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); if (ret < 0) goto done;
prev_cmp = ret;
if (ret && slot > 0) {
dec = 1;
slot--;
}
p->slots[level] = slot;
ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
&write_lock_level); if (ret2 == -EAGAIN) goto again; if (ret2) {
ret = ret2; goto done;
}
b = p->nodes[level];
slot = p->slots[level];
/* * Slot 0 is special, if we change the key we have to update * the parent pointer which means we must have a write lock on * the parent
*/ if (slot == 0 && ins_len && write_lock_level < level + 1) {
write_lock_level = level + 1;
btrfs_release_path(p); goto again;
}
/* * Like btrfs_search_slot, this looks for a key in the given tree. It uses the * current state of the tree together with the operations recorded in the tree * modification log to search for the key in a previous version of this tree, as * denoted by the time_seq parameter. * * Naturally, there is no support for insert, delete or cow operations. * * The resulting path and return value will be set up as if we called * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
*/ int btrfs_search_old_slot(struct btrfs_root *root, conststruct btrfs_key *key, struct btrfs_path *p, u64 time_seq)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; int level; int lowest_unlock = 1;
u8 lowest_level = 0;
/* * we have a lock on b and as long as we aren't changing * the tree, there is no way to for the items in b to change. * It is safe to drop the lock on our parent before we * go through the expensive btree search on b.
*/
btrfs_unlock_up_safe(p, level + 1);
ret = btrfs_bin_search(b, 0, key, &slot); if (ret < 0) goto done;
if (ret && slot > 0) {
dec = 1;
slot--;
}
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
if (level == lowest_level) { if (dec)
p->slots[level]++; goto done;
}
ret2 = read_block_for_search(root, p, &b, slot, key); if (ret2 == -EAGAIN && !p->nowait) goto again; if (ret2) {
ret = ret2; goto done;
}
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) {
ret = -ENOMEM; goto done;
}
p->locks[level] = BTRFS_READ_LOCK;
p->nodes[level] = b;
}
ret = 1;
done: if (ret < 0)
btrfs_release_path(p);
return ret;
}
/* * Search the tree again to find a leaf with smaller keys. * Returns 0 if it found something. * Returns 1 if there are no smaller keys. * Returns < 0 on error. * * This may release the path, and so you may lose any locks held at the * time you call it.
*/ staticint btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{ struct btrfs_key key; struct btrfs_key orig_key; struct btrfs_disk_key found_key; int ret;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret <= 0) return ret;
/* * Previous key not found. Even if we were at slot 0 of the leaf we had * before releasing the path and calling btrfs_search_slot(), we now may * be in a slot pointing to the same original key - this can happen if * after we released the path, one of more items were moved from a * sibling leaf into the front of the leaf we had due to an insertion * (see push_leaf_right()). * If we hit this case and our slot is > 0 and just decrement the slot * so that the caller does not process the same key again, which may or * may not break the caller, depending on its logic.
*/ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
ret = btrfs_comp_keys(&found_key, &orig_key); if (ret == 0) { if (path->slots[0] > 0) {
path->slots[0]--; return 0;
} /* * At slot 0, same key as before, it means orig_key is * the lowest, leftmost, key in the tree. We're done.
*/ return 1;
}
}
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = btrfs_comp_keys(&found_key, &key); /* * We might have had an item with the previous key in the tree right * before we released our path. And after we released our path, that * item might have been pushed to the first slot (0) of the leaf we * were holding due to a tree balance. Alternatively, an item with the * previous key can exist as the only element of a leaf (big fat item). * Therefore account for these 2 cases, so that our callers (like * btrfs_previous_item) don't miss an existing item with a key matching * the previous key we computed above.
*/ if (ret <= 0) return 0; return 1;
}
/*
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.51 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.