/*
 * Sort an array of wb_key_refs in place using a bottom-up heapsort.
 * No-op for arrays of fewer than two elements.
 */
static noinline void wb_sort(struct wb_key_ref *base, size_t num)
{
	size_t n = num, a = num / 2;

	if (!a)		/* num < 2 || size == 0 */
		return;

	for (;;) {
		size_t b, c, d;

		if (a)			/* Building heap: sift down --a */
			--a;
		else if (--n)		/* Sorting: Extract root to --n */
			swap(base[0], base[n]);
		else			/* Sort complete */
			break;

		/*
		 * Sift element at "a" down into heap. This is the
		 * "bottom-up" variant, which significantly reduces
		 * calls to cmp_func(): we find the sift-down path all
		 * the way to the leaves (one compare per level), then
		 * backtrack to find where to insert the target element.
		 *
		 * Because elements tend to sift down close to the leaves,
		 * this uses fewer compares than doing two per level
		 * on the way down. (A bit more than half as many on
		 * average, 3/4 worst-case.)
		 */
		for (b = a; c = 2*b + 1, (d = c + 1) < n;)
			b = wb_key_ref_cmp(base + c, base + d) ? c : d;
		if (d == n)	/* Special case last leaf with no sibling */
			b = c;

		/* Now backtrack from "b" to the correct location for "a" */
		while (b != a && wb_key_ref_cmp(base + a, base + b))
			b = (b - 1) / 2;

		c = b;			/* Where "a" belongs */
		while (b != a) {	/* Shift it into place */
			b = (b - 1) / 2;
			swap(base[b], base[c]);
		}
	}
}
ret = bch2_btree_iter_traverse(trans, iter); if (ret) return ret;
if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) { struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
if (k.k->type == KEY_TYPE_accounting)
bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
bkey_s_c_to_accounting(k));
}
*accounting_accumulated = true;
/* * We can't clone a path that has write locks: unshare it now, before * set_pos and traverse():
*/ if (btree_iter_path(trans, iter)->ref > 1)
iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
path = btree_iter_path(trans, iter);
if (!*write_locked) {
ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); if (ret) return ret;
/* * Update a btree with a write buffered key using the journal seq of the * original write buffer insert. * * It is not safe to rejournal the key once it has been inserted into the write * buffer because that may break recovery ordering. For example, the key may * have already been modified in the active write buffer in a seq that comes * before the current transaction. If we were to journal this key again and * crash, recovery would process updates in the wrong order.
*/ staticint
btree_write_buffered_insert(struct btree_trans *trans, struct btree_write_buffered_key *wb)
{ struct btree_iter iter; int ret;
prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
bch2_btree_id_to_text(&buf, btree);
prt_str(&buf, "\n");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
wb->sorted.data[i].idx = i;
wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
}
wb->sorted.nr = wb->flushing.keys.nr;
/* * We first sort so that we can detect and skip redundant updates, and * then we attempt to flush in sorted btree order, as this is most * efficient. * * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is * flushed - which means this could deadlock the journal if we weren't * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail * if it would block taking a journal reservation. * * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path.
*/
wb_sort(wb->sorted.data, wb->sorted.nr);
bool accounting_accumulated = false; do { if (race_fault()) {
ret = bch_err_throw(c, journal_reclaim_would_deadlock); break;
}
ret = wb_flush_one(trans, &iter, k, &write_locked,
&accounting_accumulated, &fast); if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
if (!ret) {
k->journal_seq = 0;
} elseif (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
ret = 0;
} else break;
}
if (slowpath) { /* * Flush in the order they were present in the journal, so that * we can release journal pins: * The fastpath zapped the seq of keys that were successfully flushed so * we can skip those here.
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue;
if (!accounting_replay_done &&
i->k.k.type == KEY_TYPE_accounting) {
could_not_insert++; continue;
}
if (!could_not_insert)
bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
ret = commit_do(trans, NULL, NULL,
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res ,
btree_write_buffered_insert(trans, i)); if (ret) goto err;
i->journal_seq = 0;
}
/* * If journal replay hasn't finished with accounting keys we * can't flush accounting keys at all - condense them and leave * them for next time. * * Q: Can the write buffer overflow? * A Shouldn't be any actual risk. It's just new accounting * updates that the write buffer can't flush, and those are only * going to be generated by interior btree node updates as * journal replay has to split/rewrite nodes to make room for * its updates. * * And for those new acounting updates, updates to the same * counters get accumulated as they're flushed from the journal * to the write buffer - see the patch for eytzingcer tree * accumulated. So we could only overflow if the number of * distinct counters touched somehow was very large.
*/ if (could_not_insert) { struct btree_write_buffered_key *dst = wb->flushing.keys.data;
/* * The write buffer requires flushing when going RO: keys in the journal for the * write buffer don't have a journal pin yet
*/ bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
{ if (bch2_journal_error(&c->journal)) returnfalse;
/*
 * Best-effort flush of the write buffer: only proceeds if no other
 * thread currently holds the flushing lock. Returns 0 when the lock
 * was contended (nothing flushed) or the result of the locked flush.
 */
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
{
	struct btree_write_buffer *wb = &trans->c->btree_write_buffer;
	int ret = 0;

	if (mutex_trylock(&wb->flushing.lock)) {
		ret = bch2_btree_write_buffer_flush_locked(trans);
		mutex_unlock(&wb->flushing.lock);
	}

	return ret;
}
/*
 * Try to flush the write buffer, first taking a write reference so the
 * filesystem can't go read-only underneath us. Fails with
 * erofs_no_writes if writes are already disallowed.
 */
int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
		return bch_err_throw(c, erofs_no_writes);

	ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
	return ret;
}
/* * In check and repair code, when checking references to write buffer btrees we * need to issue a flush before we have a definitive error: this issues a flush * if this is a key we haven't yet checked.
*/ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, struct bkey_s_c referring_k, struct bkey_buf *last_flushed)
{ struct bch_fs *c = trans->c; struct bkey_buf tmp; int ret = 0;
bch2_bkey_buf_init(&tmp);
if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { if (trace_write_buffer_maybe_flush_enabled()) { struct printbuf buf = PRINTBUF;
mutex_lock(&wb->flushing.lock); do {
ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
} while (!ret && bch2_btree_write_buffer_should_flush(c));
mutex_unlock(&wb->flushing.lock);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.