/*
 * Notes on disk accounting:
 *
 * We have two parallel sets of counters to be concerned with, and both must be
 * kept in sync.
 *
 *  - Persistent/on disk accounting, stored in the accounting btree and updated
 *    via btree write buffer updates that treat new accounting keys as deltas to
 *    apply to existing values. But reading from a write buffer btree is
 *    expensive, so we also have
 *
 *  - In memory accounting, where accounting is stored as an array of percpu
 *    counters, indexed by an eytzinger array of disk accounting keys/bpos (which
 *    are the same thing, excepting byte swabbing on big endian).
 *
 *    Cheap to read, but non-persistent.
 *
 * Disk accounting updates are generated by transactional triggers; these run as
 * keys enter and leave the btree, and can compare old and new versions of keys;
 * the outputs of these triggers are deltas to the various counters.
 *
 * Disk accounting updates are done as btree write buffer updates, where the
 * counters in the disk accounting key are deltas that will be applied to the
 * counter in the btree when the key is flushed by the write buffer (or journal
 * replay).
 *
 * To do a disk accounting update:
 *  - initialize a disk_accounting_pos, to specify which counter is being updated
 *  - initialize counter deltas, as an array of 1-3 s64s
 *  - call bch2_disk_accounting_mod()
 *
 * This queues up the accounting update to be done at transaction commit time.
 * Underneath, it's a normal btree write buffer update.
 *
 * The transaction commit path is responsible for propagating updates to the in
 * memory counters, with bch2_accounting_mem_mod().
 *
 * The commit path also assigns every disk accounting update a unique version
 * number, based on the journal sequence number and offset within that journal
 * buffer; this is used by journal replay to determine which updates have been
 * done.
 *
 * The transaction commit path also ensures that replicas entry accounting
 * updates are properly marked in the superblock (so that we know whether we can
 * mount without data being unavailable); it will update the superblock if
 * bch2_accounting_mem_mod() tells it to.
 */
staticconstchar * const disk_accounting_type_strs[] = { #define x(t, n, ...) [n] = #t,
BCH_DISK_ACCOUNTING_TYPES() #undef x
NULL
};
switch (acc_k.type) { case BCH_DISK_ACCOUNTING_nr_inodes:
end = field_end(acc_k, nr_inodes); break; case BCH_DISK_ACCOUNTING_persistent_reserved:
end = field_end(acc_k, persistent_reserved); break; case BCH_DISK_ACCOUNTING_replicas:
bkey_fsck_err_on(!acc_k.replicas.nr_devs,
c, accounting_key_replicas_nr_devs_0, "accounting key replicas entry with nr_devs=0");
bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
(acc_k.replicas.nr_required > 1 &&
acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
c, accounting_key_replicas_nr_required_bad, "accounting key replicas entry with bad nr_required");
for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
c, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs");
end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); break; case BCH_DISK_ACCOUNTING_dev_data_type:
end = field_end(acc_k, dev_data_type); break; case BCH_DISK_ACCOUNTING_compression:
end = field_end(acc_k, compression); break; case BCH_DISK_ACCOUNTING_snapshot:
end = field_end(acc_k, snapshot); break; case BCH_DISK_ACCOUNTING_btree:
end = field_end(acc_k, btree); break; case BCH_DISK_ACCOUNTING_rebalance_work:
end = field_end(acc_k, rebalance_work); break;
}
bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
c, accounting_key_junk_at_end, "junk at end of accounting key");
bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
c, accounting_key_nr_counters_wrong, "accounting key with %u counters, should be %u",
bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
fsck_err: return ret;
}
void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
{ if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
prt_printf(out, "unknown type %u", k->type); return;
}
/* * Ensure accounting keys being updated are present in the superblock, when * applicable (i.e. replicas updates)
*/ int bch2_accounting_update_sb(struct btree_trans *trans)
{ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
i != btree_trans_subbuf_top(trans, &trans->accounting);
i = bkey_next(i)) { int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); if (ret) return ret;
}
/*
 * Read out accounting keys for replicas entries, as an array of
 * bch_replicas_usage entries.
 *
 * Note: this may be deprecated/removed at some point in the future and replaced
 * with something more general, it exists to support the ioctl used by the
 * 'bcachefs fs usage' command.
*/ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
{ struct bch_accounting_mem *acc = &c->accounting; int ret = 0;
if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
printbuf_reset(&buf);
prt_str(&buf, "accounting mismatch for ");
bch2_accounting_key_to_text(&buf, &acc_k);
switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { union bch_replicas_padded r;
__accounting_to_replicas(&r.e, acc);
for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
!bch2_dev_exists(c, r.e.devs[i])) {
invalid_dev = r.e.devs[i]; goto invalid_device;
}
/* * All replicas entry checks except for invalid device are done * in bch2_accounting_validate
*/
BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n%s",
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, acc),
buf.buf))) { /* * We're not RW yet and still single threaded, dropping * and retaking lock is ok:
*/
percpu_up_write(&c->mark_lock);
ret = bch2_mark_replicas(c, &r.e); if (ret) goto fsck_err;
percpu_down_write(&c->mark_lock);
} break;
}
case BCH_DISK_ACCOUNTING_dev_data_type: if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
invalid_dev = acc->dev_data_type.dev; goto invalid_device;
} break;
}
fsck_err:
printbuf_exit(&buf); return ret;
invalid_device: if (fsck_err(trans, accounting_to_invalid_device, "accounting entry points to invalid device %i\n%s",
invalid_dev,
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, acc),
buf.buf))) { for (unsigned i = 0; i < nr; i++)
v[i] = -v[i];
/* * At startup time, initialize the in memory accounting from the btree (and * journal)
*/ int bch2_accounting_read(struct bch_fs *c)
{ struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF;
/* * We might run more than once if we rewind to start topology repair or * btree node scan - and those might cause us to get different results, * so we can't just skip if we've already run. * * Instead, zero out any accounting we have:
*/
percpu_down_write(&c->mark_lock);
darray_for_each(acc->k, e)
percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
for_each_member_device(c, ca)
percpu_memset(ca->usage, 0, sizeof(*ca->usage));
percpu_memset(c->usage, 0, sizeof(*c->usage));
percpu_up_write(&c->mark_lock);
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
iter.flags &= ~BTREE_ITER_with_journal; int ret = for_each_btree_key_continue(trans, iter,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
/* * If the entry counters are zeroed, it should be treated as * nonexistent - it might point to an invalid device. * * Remove it, so that if it's re-added it gets re-marked in the * superblock:
*/
ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
free_percpu(i->v[0]);
free_percpu(i->v[1]);
darray_remove_item(&acc->k, i);
ret = 0; continue;
}
switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved:
base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; break; case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); break; case BCH_DISK_ACCOUNTING_dev_data_type:
{
guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) continue;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.