if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_newline(&buf);
/** * bch2_btree_node_format_fits - check if we could rewrite node with a new format * * @c: filesystem handle * @b: btree node to rewrite * @nr: number of keys for new node (i.e. b->nr) * @new_f: bkey format to translate keys to * * Returns: true if all re-packed keys will be able to fit in a new node. * * Assumes all keys will successfully pack with the new format.
*/ staticbool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, struct btree_nr_keys nr, struct bkey_format *new_f)
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
/* * The keys might expand with the new format - if they wouldn't fit in * the btree node anymore, use the old format for now:
*/ if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
format = b->format;
/* * Protects reaping from the btree node cache and using the btree node * open bucket reserve:
*/
ret = bch2_btree_cache_cannibalize_lock(trans, cl); if (ret) return ret;
/* * Have to do the wakeup with btree_interior_update_lock still held, * since being on btree_interior_update_list is our ref on @c:
*/
closure_wake_up(&c->btree_interior_update_wait);
/* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info:
*/ staticint btree_update_nodes_written_trans(struct btree_trans *trans, struct btree_update *as)
{ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret;
/* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree * node here, but it won't have been marked as allocated and we'll see * spurious disk usage inconsistencies in the transactional part below * if we don't skip it:
*/
ret = bch2_journal_error(&c->journal); if (ret) goto err;
if (!btree_update_new_nodes_marked_sb(as))
btree_update_new_nodes_mark_sb(as);
/* * Wait for any in flight writes to finish before we free the old nodes * on disk. But we haven't pinned those old nodes in the btree cache, * they might have already been evicted. * * The update we're completing deleted references to those nodes from the * btree, so we know if they've been evicted they can't be pulled back in. * We just have to check if the nodes we have pointers to are still those * old nodes, and haven't been reused. * * This can't be done locklessly because the data buffer might have been * vmalloc allocated, and they're not RCU freed. We also need the * __no_kmsan_checks annotation because even with the btree node read * lock, nothing tells us that the data buffer has been initialized (if * the btree node has been reused for a different node, and the data * buffer swapped for a new data buffer).
*/ for (i = 0; i < as->nr_old_nodes; i++) {
b = as->old_nodes[i];
if (seq_matches)
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
/* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have * been written so we can write out the update to the interior node.
*/
/* * We can't call into journal reclaim here: we'd block on the journal * reclaim lock, but we may need to release the open buckets we have * pinned in order for other btree updates to make forward progress, and * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret));
err: /* * Ensure transaction is unlocked before using btree_node_lock_nopath() * (the use of which is always suspect, we need to work on removing this * in the future) * * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() * calls bch2_path_upgrade(), before we call path_make_mut(), so we may * rarely end up with a locked path besides the one we have here:
*/
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
/* * We have to be careful because another thread might be getting ready * to free as->b and calling btree_update_reparent() on us - we'll * recheck under btree_update_lock below:
*/
b = READ_ONCE(as->b); if (b) { /* * @b is the node we did the final insert into: * * On failure to get a journal reservation, we still have to * unblock the write and allow most of the write path to happen * so that shutdown works, but the i->journal_seq mechanism * won't work to prevent the btree write from being visible (we * didn't get a journal sequence number) - instead * __bch2_btree_node_write() doesn't do the actual write if * we're in journal error state:
*/
bch2_btree_add_journal_pin(c, b, journal_seq);
} else { /* * If we didn't get a journal sequence number we * can't write this btree node, because recovery * won't know to ignore this write:
*/
set_btree_node_never_write(b);
}
}
while (1) {
mutex_lock(&c->btree_interior_update_lock);
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, struct btree_update, unwritten_list); if (as && !as->nodes_written)
as = NULL;
mutex_unlock(&c->btree_interior_update_lock);
/* * bch2_btree_update_add_new_node: * * This causes @as to wait on @b to be written, before it gets to * bch2_btree_update_nodes_written * * Additionally, it sets b->will_make_reachable to prevent any additional writes * to @b from happening besides the first until @b is reachable on disk * * And it adds @b to the list of @as's new nodes, so that we can update sector * counts in bch2_btree_update_nodes_written:
*/ staticvoid bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{ struct bch_fs *c = as->c;
/* * returns true if @b was a new node
*/ staticvoid btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{ struct btree_update *as; unsignedlong v; unsigned i;
mutex_lock(&c->btree_interior_update_lock); /* * When b->will_make_reachable != 0, it owns a ref on as->cl that's * dropped when it gets written by bch2_btree_complete_write - the * xchg() is for synchronization with bch2_btree_complete_write:
*/
v = xchg(&b->will_make_reachable, 0);
clear_btree_node_will_make_reachable(b);
as = (struct btree_update *) (v & ~1UL);
if (!as) {
mutex_unlock(&c->btree_interior_update_lock); return;
}
for (i = 0; i < as->nr_new_nodes; i++) if (as->new_nodes[i] == b) goto found;
/*
 * bch2_btree_interior_update_will_free_node - @b is being split/rewritten: it
 * may have pointers to not-yet-written btree nodes and thus outstanding
 * btree_updates - redirect @b's btree_updates to point to this btree_update.
 *
 * Also transfers @b's journal pins to @as, and drops @b from the list of new
 * nodes any other in-flight update was going to make reachable.
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	/* Fake nodes were never made reachable; nothing to redirect: */
	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);
}
if (watermark == BCH_WATERMARK_copygc)
watermark = BCH_WATERMARK_btree_copygc; if (watermark < BCH_WATERMARK_btree)
watermark = BCH_WATERMARK_btree;
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
if (watermark < BCH_WATERMARK_reclaim &&
test_bit(JOURNAL_space_low, &c->journal.flags)) { if (flags & BCH_TRANS_COMMIT_journal_reclaim) return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
ret = drop_locks_do(trans,
({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); if (ret) return ERR_PTR(ret);
}
while (1) {
nr_nodes[!!level_end] += 1 + split;
level_end++;
ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret);
if (!btree_path_node(path, level_end)) { /* Allocating new root? */
nr_nodes[1] += split;
level_end = BTREE_MAX_DEPTH; break;
}
/* * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead:
*/ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2)) break;
/* * We don't want to allocate if we're in an error state, that can cause * deadlock on emergency shutdown due to open buckets getting stuck in * the btree_reserve_cache after allocator shutdown has cleared it out. * This check needs to come after adding us to the btree_interior_update * list but before calling bch2_btree_reserve_get, to synchronize with * __bch2_fs_read_only().
*/
ret = bch2_journal_error(&c->journal); if (ret) goto err;
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
READ_ONCE(c->opts.metadata_replicas),
disk_res_flags); if (ret) goto err;
ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) ||
bch2_err_matches(ret, ENOMEM)) { struct closure cl;
/* * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag
*/ if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto err;
}
closure_init_stack(&cl);
do {
ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) break;
bch2_trans_unlock(trans);
bch2_wait_on_allocator(c, &cl);
} while (1);
}
/* * Ensure no one is using the old root while we switch to the * new root:
*/ if (nofail) {
bch2_btree_node_lock_write_nofail(trans, path, &old->c);
} else { int ret = bch2_btree_node_lock_write(trans, path, &old->c); if (ret) return ret;
}
bch2_btree_set_root_inmem(c, b);
btree_update_updated_root(as, b);
/* * Unlock old root after new root is visible: * * The new root isn't persistent, but that's ok: we still have * an intent lock on the new root, and any updates that would * depend on the new root would have to update the new root.
*/
bch2_btree_node_unlock_write(trans, path, old); return 0;
}
/* * For updates to interior nodes, we've got to do the insert before we split * because the stuff we're inserting has to be inserted atomically. Post split, * the keys might have to go in different nodes and the split would no longer be * atomic. * * Worse, if the insert is from btree node coalescing, if we do the insert after * we do the split (and pick the pivot) - the pivot we pick might be between * nodes that were coalesced, and thus in the middle of a child node post * coalescing:
*/ staticint btree_split_insert_keys(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b, struct keylist *keys)
{ struct btree_path *path = trans->paths + path_idx;
if (!bch2_keylist_empty(keys) &&
bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter;
/* * Note that on recursive parent_keys == keys, so we * can't start adding new keys to parent_keys before emptying it * out (which we did with btree_split_insert_keys() above)
*/
bch2_keylist_add(&as->parent_keys, &n1->key);
bch2_keylist_add(&as->parent_keys, &n2->key);
if (!parent) { /* Depth increases, make a new root */
n3 = __btree_root_alloc(as, trans, b->c.level + 1);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
}
/* New nodes all written, now make them visible: */
if (parent) { /* Split a non root node */
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
} elseif (n3) {
ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else { /* Root filled up but didn't need to be split */
ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
/* * The old node must be freed (in memory) _before_ unlocking the new * nodes - else another thread could re-acquire a read lock on the old * node after another thread has locked and updated the new node, thus * seeing stale data:
*/
bch2_btree_node_free_inmem(trans, trans->paths + path, b);
if (n3)
bch2_trans_node_add(trans, trans->paths + path, n3); if (n2)
bch2_trans_node_add(trans, trans->paths + path2, n2);
bch2_trans_node_add(trans, trans->paths + path1, n1);
if (n3)
six_unlock_intent(&n3->c.lock); if (n2)
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
out: if (path2) {
__bch2_btree_path_unlock(trans, trans->paths + path2);
bch2_path_put(trans, path2, true);
} if (path1) {
__bch2_btree_path_unlock(trans, trans->paths + path1);
bch2_path_put(trans, path1, true);
}
/** * bch2_btree_insert_node - insert bkeys into a given btree node * * @as: btree_update object * @trans: btree_trans object * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert * * Returns: 0 on success, typically transaction restart error on failure * * Inserts as many keys as it can into a given btree node, splitting it if full. * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic.
*/ staticint bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b, struct keylist *keys)
{ struct bch_fs *c = as->c; struct btree_path *path = trans->paths + path_idx, *linked; unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; int ret;
/*
 * bch2_btree_split_leaf - split a full leaf node, then give the ancestors we
 * still hold intent locks on a chance to merge.
 *
 * @trans:	btree transaction
 * @path:	index of the path pointing to the leaf to split
 * @flags:	transaction commit flags (watermark etc.)
 *
 * Returns: 0 on success, error code (typically a transaction restart) on
 * failure.
 */
int bch2_btree_split_leaf(struct btree_trans *trans,
			  btree_path_idx_t path,
			  unsigned flags)
{
	/* btree_split & merge may both cause paths array to be reallocated */
	struct btree *b = path_l(trans->paths + path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, trans->paths + path,
				     trans->paths[path].level,
				     true, 0, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);

	/*
	 * The split inserted new keys in the parent; walk up the levels we
	 * still have intent-locked and let them merge if now appropriate:
	 */
	for (l = trans->paths[path].level + 1;
	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
	     l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}
/* * Work around a deadlock caused by the btree write buffer not doing * merges and leaving tons of merges for us to do - we really don't need * to be doing merges at all from the interior update path, and if the * interior update path is generating too many new interior updates we * deadlock:
*/ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) return 0;
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err: if (new_path)
bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans); if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
ret = 0; if (!ret)
ret = bch2_trans_relock(trans); return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
bch2_btree_update_free(as, trans); goto out;
}
staticint get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, struct btree *b)
{
bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_intent); int ret = bch2_btree_iter_traverse(trans, iter); if (ret) goto err;
/* has node been freed? */ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { /* node has been freed: */
BUG_ON(!btree_node_dying(b));
ret = bch_err_throw(trans->c, btree_node_dying); goto err;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.