// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
/*
 * Fragment: enclosing function signature is not visible in this chunk —
 * appears to be the tail of a uuid -> filesystem lookup.
 * Look up the fs under bch_fs_list_lock and take a closure ref on it
 * before dropping the lock, so the fs can't be freed under the caller.
 */
mutex_lock(&bch_fs_list_lock);
c = __bch2_uuid_to_fs(uuid); if (c)
closure_get(&c->cl);
mutex_unlock(&bch_fs_list_lock);
return c;
}
/* Filesystem RO/RW: */
/*
 * Fragment: interior of the read-only transition path (enclosing
 * function signature not visible in this chunk).
 *
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
/*
 * Block new foreground-end write operations from starting - any new
 * writes will return -EROFS:
 */
set_bit(BCH_FS_going_ro, &c->flags);
enumerated_ref_stop_async(&c->writes);
/*
 * If we're not doing an emergency shutdown, we want to wait on
 * outstanding writes to complete so they don't see spurious errors due
 * to shutting down the allocator:
 *
 * If we are doing an emergency shutdown outstanding writes may
 * hang until we shutdown the allocator so we don't want to wait
 * on outstanding writes before shutting everything down - but
 * we do need to wait on them before returning and signalling
 * that going RO is complete:
 */
wait_event(bch2_read_only_wait,
test_bit(BCH_FS_write_disable_complete, &c->flags) ||
test_bit(BCH_FS_emergency_ro, &c->flags));
/* record whether writes actually quiesced (vs. emergency RO forcing the wakeup) */
bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); if (writes_disabled)
bch_verbose(c, "finished waiting for writes to stop");
/*
 * Fragment: head of the read-write transition (the remainder of the
 * function body, its err label and closing brace are outside this
 * chunk).
 *
 * Refuses to go RW when the filesystem cannot safely take writes
 * (no alloc info, unfixed btree errors, unresized image), then marks
 * the superblock dirty and starts the journal before writes resume.
 *
 * Fixed here: "staticint" had lost the space between the two keywords.
 */
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	int ret;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	/* filesystems without alloc info are permanently read-only */
	if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
		return bch_err_throw(c, erofs_no_alloc_info);

	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
		bch_err(c, "cannot go rw, unfixed btree errors");
		return bch_err_throw(c, erofs_unfixed_errors);
	}

	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
		bch_err(c, "cannot go rw, filesystem is an unresized image file");
		return bch_err_throw(c, erofs_filesystem_full);
	}

	/* already read-write: nothing to do */
	if (test_bit(BCH_FS_rw, &c->flags))
		return 0;

	bch_info(c, "going read-write");

	ret = bch2_fs_init_rw(c);
	if (ret)
		goto err;

	ret = bch2_sb_members_v2_init(c);
	if (ret)
		goto err;

	/*
	 * First journal write must be a flush write: after a clean shutdown we
	 * don't read the journal, so the first journal write may end up
	 * overwriting whatever was there previously, and there must always be
	 * at least one non-flush write in the journal or recovery will fail:
	 */
	spin_lock(&c->journal.lock);
	set_bit(JOURNAL_need_flush_write, &c->journal.flags);
	set_bit(JOURNAL_running, &c->journal.flags);
	bch2_journal_space_available(&c->journal);
	spin_unlock(&c->journal.lock);

	ret = bch2_fs_mark_dirty(c);
	if (ret)
		goto err;

	ret = bch2_journal_reclaim_start(&c->journal);
	if (ret)
		goto err;
/*
 * Fragment: filesystem teardown (enclosing function not visible in this
 * chunk). Sanity-checks outstanding refcounts/reservations, then tears
 * down the workqueues — presumably part of the fs free path; TODO
 * confirm against the original source.
 */
BUG_ON(atomic_read(&c->journal_keys.ref));
percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) {
u64 v = percpu_u64_get(c->online_reserved);
WARN(v, "online_reserved not 0 at shutdown: %lli", v);
free_percpu(c->online_reserved);
}
/* destroy_workqueue() must not be passed NULL, hence the guards */
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq); if (c->btree_write_submit_wq)
destroy_workqueue(c->btree_write_submit_wq); if (c->btree_read_complete_wq)
destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq)
destroy_workqueue(c->copygc_wq); if (c->btree_write_complete_wq)
destroy_workqueue(c->btree_write_complete_wq); if (c->btree_update_wq)
destroy_workqueue(c->btree_update_wq);
/*
 * Fragment: pieces of filesystem initialization (enclosing function not
 * visible in this chunk): early RW init during recovery, casefolding /
 * UTF-8 encoding setup, and per-device allocation.
 *
 * NOTE(review): the '#ifdef' / '#else' / '#endif' below share lines
 * with code — preprocessor directives must start their own line; this
 * text appears to have lost its original line breaks and will not
 * compile as-is.
 */
if (go_rw_in_recovery(c)) { /* * start workqueues/kworkers early - kthread creation checks for * pending signals, which is _very_ annoying
*/
ret = bch2_fs_init_rw(c); if (ret) goto err;
}
#ifdef CONFIG_UNICODE if (bch2_fs_casefold_enabled(c)) { /* Default encoding until we can potentially have more as an option. */
c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); if (IS_ERR(c->cf_encoding)) {
printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
ret = -EINVAL; goto err;
}
} #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
ret = -EINVAL; goto err;
} #endif
/* allocate a struct bch_dev for every superblock member slot in use */
for (i = 0; i < c->sb.nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue;
ret = bch2_dev_alloc(c, i); if (ret) goto err;
}
/*
 * Fragment: head of bch2_fs_start() — the rest of the function body is
 * not visible in this chunk (the text that follows jumps into an
 * unrelated split-brain report).
 */
int bch2_fs_start(struct bch_fs *c)
{
time64_t now = ktime_get_real_seconds(); int ret = 0;
print_mount_opts(c);
/* report the encoding chosen earlier during fs init, if any */
if (c->cf_encoding)
bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
/* refuse to start unless enough member devices are present */
if (!bch2_fs_may_start(c)) return bch_err_throw(c, insufficient_devices_to_start);
/*
 * Fragment: split-brain report — appears to be the tail of a
 * member-vs-filesystem superblock sequence check ('sb' presumably being
 * the member device's handle and 'fs' the filesystem's newest; the
 * signature and 'buf' declaration are not visible in this chunk).
 */
prt_str(&buf, "Split brain detected between ");
prt_bdevname(&buf, sb->bdev);
prt_str(&buf, " and ");
prt_bdevname(&buf, fs->bdev);
prt_char(&buf, ':');
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
/* with no_splitbrain_check set, this becomes a warning rather than an error */
if (!opts->no_splitbrain_check) {
prt_str(&buf, "Not using ");
prt_bdevname(&buf, sb->bdev);
}
pr_err("%s", buf.buf);
printbuf_exit(&buf);
if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain;
}
return 0;
}
/* Device startup/shutdown: */
/*
 * Drop the sysfs symlinks between the device kobject and its block
 * device, and (for READ) clear the device's bit in the online-devices
 * mask.
 *
 * Fixed here: "staticvoid" had lost its space, and 'b' was used in the
 * condition below without ever being declared.
 *
 * NOTE(review): despite the name, the visible body only handles the
 * online-mask bit and sysfs unlinking — confirm against the original
 * source whether io-ref stopping code belongs here.
 */
static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
{
	struct kobject *b;

	if (rw == READ)
		clear_bit(ca->dev_idx, ca->fs->online_devs.d);

	/*
	 * This is racy w.r.t. the underlying block device being hot-removed,
	 * which removes it from sysfs.
	 *
	 * It'd be lovely if we had a way to handle this race, but the sysfs
	 * code doesn't appear to provide a good method and block/holder.c is
	 * susceptible as well:
	 */
	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev &&
	    (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
		sysfs_remove_link(b, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}
}
/*
 * Fragment: register a member device in sysfs and cross-link it with
 * its block device (the function's success return and closing brace are
 * outside this chunk).
 *
 * Fixed here: "staticint" had lost the space between the two keywords.
 */
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
	int ret;

	/* fs kobject not in sysfs yet; links get created when it is added */
	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
			bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
		if (ret)
			return ret;
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
		if (ret)
			return ret;

		ret = sysfs_create_link(&ca->kobj, block, "block");
		if (ret)
			return ret;
	}

	/*
	 * Stash pointer to the filesystem for blk_holder_ops - note that once
	 * attached to a filesystem, we will always close the block device
	 * before tearing down the filesystem object.
	 */
	ca->disk_sb.holder->c = ca->fs;
/*
 * Fragment: tail of a device-attach path (signature not visible in this
 * chunk) — adopts the newer superblock if the incoming one has a higher
 * seq, attaches the bdev, and marks the device online.
 */
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
ca = bch2_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb); if (ret) return ret;
set_bit(ca->dev_idx, c->online_devs.d);
bch2_dev_sysfs_online(c, ca);
bch2_rebalance_wakeup(c); return 0;
}
/* Device management: */
/* * Note: this function is also used by the error paths - when a particular * device sees an error, we call it to determine whether we can just set the * device RO, or - if this function returns false - we'll set the whole * filesystem RO: * * XXX: maybe we should be more explicit about whether we're changing state * because we got an error or what have you?
*/ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags)
{ struct bch_devs_mask new_online_devs; int nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
switch (new_state) { case BCH_MEMBER_STATE_rw: returntrue; case BCH_MEMBER_STATE_ro: if (ca->mi.state != BCH_MEMBER_STATE_rw) returntrue;
/* do we have enough devices to write to? */
for_each_member_device(c, ca2) if (ca2 != ca)
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_STATE(m, new_state);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
bch2_rebalance_wakeup(c);
return ret;
}
/*
 * Set a member device's state, serializing against other filesystem
 * state changes via c->state_lock; the real work happens in
 * __bch2_dev_set_state().
 */
int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags)
{
	down_write(&c->state_lock);
	int err = __bch2_dev_set_state(c, ca, new_state, flags);
	up_write(&c->state_lock);

	return err;
}
/* Device add/removal: */
/*
 * Fragment: bch2_dev_remove() — remove a member device from a running
 * filesystem. The tail of the function (superblock write-out, unlock,
 * closing brace) is not visible in this chunk.
 */
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{ struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; bool fast_device_removal = !bch2_request_incompat_feature(c,
bcachefs_metadata_version_fast_device_removal); int ret;
down_write(&c->state_lock);
/* * We consume a reference to ca->ref, regardless of whether we succeed * or fail:
*/
bch2_dev_put(ca);
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
ret = bch_err_throw(c, device_state_not_allowed); goto err;
}
__bch2_dev_read_only(c, ca);
/* migrate all data off the device — via backpointers when the feature is available */
ret = fast_device_removal
? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
: (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
bch2_dev_remove_stripes(c, ca->dev_idx, flags)); if (ret) goto err;
/* Check if device still has data before blowing away alloc info */ struct bch_dev_usage usage = bch2_dev_usage_read(ca); for (unsigned i = 0; i < BCH_DATA_NR; i++) if (!data_type_is_empty(i) &&
!data_type_is_hidden(i) &&
usage.buckets[i]) {
bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
__bch2_data_types[i], usage.buckets[i]);
ret = -EBUSY; goto err;
}
ret = bch2_dev_remove_alloc(c, ca);
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err;
/* * We need to flush the entire journal to get rid of keys that reference * the device being removed before removing the superblock entry
*/
bch2_journal_flush_all_pins(&c->journal);
/* * this is really just needed for the bch2_replicas_gc_(start|end) * calls, and could be cleaned up:
*/
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err;
ret = bch2_journal_flush(&c->journal);
bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err;
ret = bch2_replicas_gc2(c);
bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err;
/* final safety check: no data of any type may remain on the device */
data = bch2_dev_has_data(c, ca); if (data) { struct printbuf data_has = PRINTBUF;
prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY; goto err;
}
/* * Free this device's slot in the bch_member array - all pointers to * this device must be gone:
*/
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
if (fast_device_removal)
m->uuid = BCH_SB_MEMBER_DELETED_UUID; else
memset(&m->uuid, 0, sizeof(m->uuid));
/*
 * Fragment: pieces of the device add path — resolving the new member's
 * disk label, registering the fs on the global list, plus the shared
 * error/cleanup labels (enclosing function signature not visible in
 * this chunk).
 *
 * NOTE(review): the if (list_empty(...)) block below is never closed
 * before the err_unlock label — more evidence that lines were lost in
 * extraction; reconcile against the original source.
 */
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); if (label.allocation_failure) {
ret = -ENOMEM; goto err;
}
}
/* first device of a new fs: publish on the global list; the uuid must be unique */
if (list_empty(&c->list)) {
mutex_lock(&bch_fs_list_lock); if (__bch2_uuid_to_fs(c->sb.uuid))
ret = bch_err_throw(c, filesystem_uuid_already_open); else
list_add(&c->list, &bch_fs_list);
mutex_unlock(&bch_fs_list_lock);
err_unlock:
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
err: if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb); goto out;
err_late:
up_write(&c->state_lock);
ca = NULL; goto err;
}
/* Hot add existing device to running filesystem: */ int bch2_dev_online(struct bch_fs *c, constchar *path)
{ struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; struct bch_dev *ca; unsigned dev_idx; int ret;
down_write(&c->state_lock);
ret = bch2_read_super(path, &opts, &sb); if (ret) {
up_write(&c->state_lock); return ret;
}
dev_idx = sb.sb->dev_idx;
ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err;
ret = bch2_dev_attach_bdev(c, &sb); if (ret) goto err;
ca = bch2_dev_locked(c, dev_idx);
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); if (ret) goto err;
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
if (!ca->mi.freespace_initialized) {
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
bch_err_msg(ca, ret, "initializing free space"); if (ret) goto err;
}
if (!ca->journal.nr) {
ret = bch2_dev_journal_alloc(ca, false);
bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err;
}
/*
 * Fragment: device resize path (enclosing function not fully visible —
 * 'nbuckets', 'old_nbuckets', 'm' and the err label are declared
 * outside this chunk). Validates the new size, resizes the bucket
 * arrays, and persists the new count in the superblock.
 */
if (nbuckets < ca->mi.nbuckets) {
bch_err(ca, "Cannot shrink yet");
ret = -EINVAL; goto err;
}
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
bch_err(ca, "New device size too big (%llu greater than max %u)",
nbuckets, BCH_MEMBER_NBUCKETS_MAX);
ret = bch_err_throw(c, device_size_too_big); goto err;
}
/* the member's bucket count may not exceed the bdev's actual capacity */
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
bch_err(ca, "New size larger than device");
ret = bch_err_throw(c, device_size_too_small); goto err;
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
bch_err_msg(ca, ret, "resizing buckets"); if (ret) goto err;
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err;
/* persist the new bucket count in the superblock member entry */
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
m->nbuckets = cpu_to_le64(nbuckets);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); if (ret) goto err;
}
/*
 * Fragment: sync the VFS superblock (enclosing function not visible in
 * this chunk).
 *
 * NOTE(review): the first if-block below is an incomplete duplicate of
 * the second — it takes s_umount but never syncs or releases it. This
 * is almost certainly an artifact of how this text was extracted;
 * reconcile against the original source and drop the duplicate.
 */
struct super_block *sb = c->vfs_sb; if (sb) { /* * Not necessary, c->ro_ref guards against the filesystem being * unmounted - we only take this to avoid a warning in * sync_filesystem:
*/
down_read(&sb->s_umount);
}
struct super_block *sb = c->vfs_sb; if (sb) { /* * Not necessary, c->ro_ref guards against the filesystem being * unmounted - we only take this to avoid a warning in * sync_filesystem:
*/
down_read(&sb->s_umount);
sync_filesystem(sb);
up_read(&sb->s_umount);
}
/*
 * Fragment: tail of a multi-device open path — reads each device's
 * superblock, picks the newest as authoritative, filters out removed /
 * split-brain members, allocates the fs and attaches the devices. The
 * head of the function (declarations of sbs, best, errbuf, c, the
 * devices list, and the surrounding loop) is not visible in this chunk.
 */
ret = bch2_read_super(*i, opts, &sb); if (ret) goto err;
BUG_ON(darray_push(&sbs, sb));
}
/* nochanges without read_only would require journal replay writes */
if (opts->nochanges && !opts->read_only) {
ret = bch_err_throw(c, erofs_nochanges); goto err_print;
}
/* the superblock with the highest seq is the authoritative one */
darray_for_each(sbs, sb) if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
darray_for_each_reverse(sbs, sb) {
ret = bch2_dev_in_fs(best, sb, opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
bch2_free_super(sb);
darray_remove_item(&sbs, sb);
/* keep 'best' pointing at the same element after the removal shifts the array */
best -= best > sb;
ret = 0; continue;
}
if (ret) goto err_print;
}
c = bch2_fs_alloc(best->sb, opts, &sbs);
ret = PTR_ERR_OR_ZERO(c); if (ret) goto err;
down_write(&c->state_lock);
darray_for_each(sbs, sb) {
ret = bch2_dev_attach_bdev(c, sb); if (ret) {
up_write(&c->state_lock); goto err;
}
}
up_write(&c->state_lock);
if (!c->opts.nostart) {
ret = bch2_fs_start(c); if (ret) goto err;
}
out:
/* superblock handles not consumed by the fs are freed here on every path */
darray_for_each(sbs, sb)
bch2_free_super(sb);
darray_exit(&sbs);
printbuf_exit(&errbuf);
module_put(THIS_MODULE); return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
devices->data[0], bch2_err_str(ret));
err: if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
c = ERR_PTR(ret); goto out;
}
/*
 * NOTE(review): the trailing text below is not source code — it is
 * German web-page boilerplate (roughly: "the information on this
 * website was carefully compiled to the best of our knowledge, but
 * neither completeness, correctness nor quality is guaranteed; the
 * colored syntax display and the measurement are still experimental").
 * It was evidently picked up when this file was extracted; remove it
 * before building.
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
 * Vollständigkeit, noch Richtigkeit, noch Qualität der bereit
 * gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell.
 */