// SPDX-License-Identifier: GPL-2.0
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
*/
/*
 * 256 * 1024 divided by PAGE_SIZE (i.e. 256 KiB expressed in pages,
 * assuming PAGE_SIZE is given in bytes).
 */
#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)

/* limit on the number of partitions on a single bcache device */
#define BCACHE_MINORS		128

/* limit on the number of bcache devices on a single system */
#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS) / BCACHE_MINORS)
/* * Since the new uuid entry is bigger than the old, we have to * convert starting at the highest memory address and work down * in order to do it in place
*/
for (i = c->nr_uuids - 1;
i >= 0;
--i) {
memcpy(u1[i].uuid, u0[i].uuid, 16);
memcpy(u1[i].label, u0[i].label, 32);
/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *   8 bit gen
 *  16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets as are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
*/
staticvoid prio_endio(struct bio *bio)
{ struct cache *ca = bio->bi_private;
/* * Pre-check if there are enough free buckets. In the non-blocking * scenario it's better to fail early rather than starting to allocate * buckets and do a cleanup later in case of failure.
*/ if (!wait) {
size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
fifo_used(&ca->free[RESERVE_NONE]); if (prio_buckets(ca) > avail) return -ENOMEM;
}
/* * Don't want the old priorities to get garbage collected until after we * finish writing the new ones, and they're journalled
*/ for (i = 0; i < prio_buckets(ca); i++) { if (ca->prio_last_buckets[i])
__bch_bucket_free(ca,
&ca->buckets[ca->prio_last_buckets[i]]);
n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); if (!n || n > max_stripes) {
pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
n); return -ENOMEM;
}
d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL); if (!d->stripe_sectors_dirty) return -ENOMEM;
n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsignedlong);
d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) goto out_free_stripe_sectors_dirty;
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto out_ida_remove;
if (lim.logical_block_size > PAGE_SIZE && cached_bdev) { /* * This should only happen with BCACHE_SB_VERSION_BDEV. * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
idx, lim.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
lim.logical_block_size = bdev_logical_block_size(cached_bdev);
}
d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(d->disk)) goto out_bioset_exit;
/* * If this delayed worker is stopping outside, directly quit here. * dc->io_disable might be set via sysfs interface, so check it * here too.
*/ while (!kthread_should_stop() && !dc->io_disable) {
q = bdev_get_queue(dc->bdev); if (blk_queue_dying(q))
dc->offline_seconds++; else
dc->offline_seconds = 0;
if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
pr_err("%pg: device offline for %d seconds\n",
dc->bdev,
BACKING_DEV_OFFLINE_TIMEOUT);
pr_err("%s: disable I/O request due to backing device offline\n",
dc->disk.name);
dc->io_disable = true; /* let others know earlier that io_disable is true */
smp_mb();
bcache_device_stop(&dc->disk); break;
}
schedule_timeout_interruptible(HZ);
}
ret = add_disk(d->disk); if (ret) goto out;
bd_link_disk_holder(dc->bdev, dc->disk.disk); /* * won't show up in the uevent file, use udevadm monitor -e instead * only class / kset properties are persistent
*/
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj,
&d->kobj, "bcache")) {
pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
ret = -ENOMEM; goto out;
}
dc->status_update_thread = kthread_run(cached_dev_status_update,
dc, "bcache_status_update"); if (IS_ERR(dc->status_update_thread)) {
pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
}
/* * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed * work dc->writeback_rate_update is running. Wait until the routine * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out * seconds, give up waiting here and continue to cancel it too.
*/ staticvoid cancel_writeback_rate_update_dwork(struct cached_dev *dc)
{ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
do { if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
&dc->disk.flags)) break;
time_out--;
schedule_timeout_interruptible(1);
} while (time_out > 0);
if (time_out == 0)
pr_warn("give up waiting for dc->writeback_write_update to quit\n");
if (dc->sb.block_size < c->cache->sb.block_size) { /* Will die */
pr_err("Couldn't attach %pg: block size less than set's block size\n",
dc->bdev); return -EINVAL;
}
/* Check whether already attached */
list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) { if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
pr_err("Tried to attach %pg but duplicate UUID already attached\n",
dc->bdev);
return -EINVAL;
}
}
u = uuid_find(c, dc->sb.uuid);
if (u &&
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
memcpy(u->uuid, invalid_uuid, 16);
u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
u = NULL;
}
if (!u) { if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
pr_err("Couldn't find uuid for %pg in set\n", dc->bdev); return -ENOENT;
}
u = uuid_find_empty(c); if (!u) {
pr_err("Not caching %pg, no room for UUID\n", dc->bdev); return -EINVAL;
}
}
/* * Deadlocks since we're called via sysfs... * sysfs_remove_file(&dc->kobj, &sysfs_attach);
*/
if (bch_is_zero(u->uuid, 16)) { struct closure cl;
bcache_device_attach(&dc->disk, c, u - c->uuids);
list_move(&dc->list, &c->cached_devs);
calc_cached_dev_sectors(c);
/* * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get()
*/
smp_wmb();
refcount_set(&dc->count, 1);
/* Block writeback thread, but spawn it */
down_write(&dc->writeback_lock); if (bch_cached_dev_writeback_start(dc)) {
up_write(&dc->writeback_lock);
pr_err("Couldn't start writeback facilities for %s\n",
dc->disk.disk->disk_name); return -ENOMEM;
}
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
atomic_set(&dc->has_dirty, 1);
bch_writeback_queue(dc);
}
bch_sectors_dirty_init(&dc->disk);
ret = bch_cached_dev_run(dc); if (ret && (ret != -EBUSY)) {
up_write(&dc->writeback_lock); /* * bch_register_lock is held, bcache_device_stop() is not * able to be directly called. The kthread and kworker * created previously in bch_cached_dev_writeback_start() * have to be stopped manually here.
*/
kthread_stop(dc->writeback_thread);
cancel_writeback_rate_update_dwork(dc);
pr_err("Couldn't run cached device %pg\n", dc->bdev); return ret;
}
if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
pr_err("Please update to the latest bcache-tools to create the cache device\n");
set_disk_ro(dc->disk.disk, 1);
}
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
pr_info("Caching %pg as %s on set %pU\n",
dc->bdev,
dc->disk.disk->disk_name,
dc->disk.c->set_uuid); return 0;
}
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread); if (!IS_ERR_OR_NULL(dc->status_update_thread))
kthread_stop(dc->status_update_thread);
mutex_lock(&bch_register_lock);
if (atomic_read(&dc->running)) {
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
del_gendisk(dc->disk.disk);
}
bcache_device_free(&dc->disk);
list_del(&dc->list);
mutex_unlock(&bch_register_lock);
if (dc->sb_disk)
folio_put(virt_to_folio(dc->sb_disk));
list_add(&dc->list, &uncached_devices); /* attach to a matched cache set if it exists */
list_for_each_entry(c, &bch_cache_sets, list)
bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
err = "failed to run cached device";
ret = bch_cached_dev_run(dc); if (ret) goto err;
}
if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
NULL, &bcache_flash_ops)) goto err;
bcache_device_attach(d, c, u - c->uuids);
bch_sectors_dirty_init(d);
bch_flash_dev_request_init(d);
err = add_disk(d->disk); if (err) goto err;
err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"); if (err) goto err;
bcache_device_link(d, c, "volume");
if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
pr_err("Please update to the latest bcache-tools to create the cache device\n");
set_disk_ro(d->disk, 1);
}
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
/* * Avoid flushing cached nodes if cache set is retiring * due to too many I/O errors detected.
*/ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
list_for_each_entry(b, &c->btree_cache, list) {
mutex_lock(&b->write_lock); if (btree_node_dirty(b))
__bch_btree_node_write(b, NULL);
mutex_unlock(&b->write_lock);
}
	/*
	 * If the register_cache_set() call to bch_cache_set_alloc() failed,
	 * ca has not been assigned a value and an error is returned.
	 * So we need to check that ca is not NULL during
	 * bch_cache_set_unregister().
*/ if (ca && ca->alloc_thread)
kthread_stop(ca->alloc_thread);
if (c->journal.cur) {
cancel_delayed_work_sync(&c->journal.work); /* flush last journal entry if needed */
c->journal.work.work.func(&c->journal.work.work);
}
closure_return(cl);
}
/* * This function is only called when CACHE_SET_IO_DISABLE is set, which means * cache set is unregistering due to too many I/O errors. In this condition, * the bcache device might be stopped, it depends on stop_when_cache_set_failed * value and whether the broken cache has dirty data: * * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device * BCH_CACHED_STOP_AUTO 0 NO * BCH_CACHED_STOP_AUTO 1 YES * BCH_CACHED_DEV_STOP_ALWAYS 0 YES * BCH_CACHED_DEV_STOP_ALWAYS 1 YES * * The expected behavior is, if stop_when_cache_set_failed is configured to * "auto" via sysfs interface, the bcache device will not be stopped if the * backing device is clean on the broken cache device.
*/ staticvoid conditional_stop_bcache_device(struct cache_set *c, struct bcache_device *d, struct cached_dev *dc)
{ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
d->disk->disk_name, c->set_uuid);
bcache_device_stop(d);
} elseif (atomic_read(&dc->has_dirty)) { /* * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO * and dc->has_dirty == 1
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
d->disk->disk_name); /* * There might be a small time gap that cache set is * released but bcache device is not. Inside this time * gap, regular I/O requests will directly go into * backing device as no cache set attached to. This * behavior may also introduce potential inconsistence * data in writeback mode while cache is dirty. * Therefore before calling bcache_device_stop() due * to a broken cache device, dc->io_disable should be * explicitly set to true.
*/
dc->io_disable = true; /* make others know io_disable is true earlier */
smp_mb();
bcache_device_stop(d);
} else { /* * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO * and dc->has_dirty == 0
*/
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
d->disk->disk_name);
}
}
/* * If prio_read() fails it'll call cache_set_error and we'll * tear everything down right away, but if we perhaps checked * sooner we could avoid journal replay.
*/
k = &j->btree_root;
err = "bad btree root"; if (__bch_btree_ptr_invalid(c, k)) goto err;
/* * bcache_journal_next() can't happen sooner, or * btree_gc_finish() will give spurious errors about last_gc > * gc_gen - this is a hack but oh well.
*/
bch_journal_next(&c->journal);
err = "error starting allocator thread"; if (bch_cache_allocator_start(ca)) goto err;
/* * First place it's safe to allocate: btree_check() and * btree_gc_finish() have to run before we have buckets to * allocate, and bch_bucket_alloc_set() might cause a journal * entry to be written so bcache_journal_next() has to be called * first. * * If the uuids were in the old format we have to rewrite them * before the next journal entry is written:
*/ if (j->version < BCACHE_JSET_VERSION_UUID)
__uuid_write(c);
/* * We don't want to write the first journal entry until * everything is set up - fortunately journal entries won't be * written until the SET_CACHE_SYNC() here:
*/
SET_CACHE_SYNC(&c->cache->sb, true);
	/*
	 * When the cache disk is first registered, ca->sb.njournal_buckets
	 * is zero, and it is assigned in run_cache_set().
	 *
	 * When ca->sb.njournal_buckets is not zero, a journal exists,
	 * and in bch_journal_replay() tree nodes may split.
	 * The worst situation is that all journal buckets hold valid journal
	 * entries and all the keys need to be replayed, so the number of
	 * RESERVE_BTREE type buckets should be as large as the number of
	 * journal buckets.
	 *
	 * If there are too few RESERVE_BTREE type buckets, the
	 * bch_allocator_thread() may hang, unable to allocate a
	 * bucket. The situation is roughly as follows:
	 *
	 * 1. In bch_data_insert_keys(), if the operation is not op->replace,
	 *    it will call bch_journal(), which increments the journal_ref
	 *    counter. This counter is only decremented after bch_btree_insert
	 *    completes.
	 *
	 * 2. When calling bch_btree_insert, if the btree needs to split,
	 *    it will call btree_split() and btree_check_reserve() to check
	 *    whether there are enough reserved buckets in the RESERVE_BTREE
	 *    slot. If not enough, bcache_btree_root() will repeatedly retry.
	 *
	 * 3. Normally, the bch_allocator_thread is responsible for filling
	 *    the reservation slots from the free_inc bucket list. When the
	 *    free_inc bucket list is exhausted, the bch_allocator_thread
	 *    will call invalidate_buckets() until free_inc is refilled.
	 *    Then bch_allocator_thread calls bch_prio_write() once, and
	 *    bch_prio_write() will call bch_journal_meta() and wait for
	 *    the journal write to complete.
	 *
	 * 4. During journal_write, journal_write_unlocked() is called.
	 *    If the journal is full, journal_reclaim() and btree_flush_write()
	 *    will be called sequentially, then journal_write is retried.
	 *
	 * 5. When 2 and 4 occur together, IO will hang and cannot recover.
	 *
	 * Therefore, reserve more RESERVE_BTREE type buckets.
*/
btree_buckets = clamp_t(size_t, ca->sb.nbuckets >> 7,
32, SB_JOURNAL_BUCKETS);
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; if (!free) {
ret = -EPERM;
err = "ca->sb.nbuckets is too small"; goto err_free;
}
staticint register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct file *bdev_file, struct cache *ca)
{ constchar *err = NULL; /* must be set for any error case */ int ret = 0;
if (bdev_max_discard_sectors(file_bdev(bdev_file)))
ca->discard = CACHE_DISCARD(&ca->sb);
ret = cache_alloc(ca); if (ret != 0) { if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM"; elseif (ret == -EPERM)
err = "cache_alloc(): cache device is too small"; else
err = "cache_alloc(): unknown error";
pr_notice("error %pg: %s\n", file_bdev(bdev_file), err); /* * If we failed here, it means ca->kobj is not initialized yet, * kobject_put() won't be called and there is no chance to * call fput() to bdev in bch_cache_release(). So * we explicitly call fput() on the block device here.
*/
fput(bdev_file); return ret;
}
if (kobject_add(&ca->kobj, bdev_kobj(file_bdev(bdev_file)), "bcache")) {
pr_notice("error %pg: error calling kobject_add\n",
file_bdev(bdev_file));
ret = -ENOMEM; goto out;
}
ret = -EBUSY;
err = "failed to reference bcache module"; if (!try_module_get(THIS_MODULE)) goto out;
/* For latest state of bcache_is_reboot */
smp_mb();
err = "bcache is in reboot"; if (bcache_is_reboot) goto out_module_put;
ret = -ENOMEM;
err = "cannot allocate memory";
path = kstrndup(buffer, size, GFP_KERNEL); if (!path) goto out_module_put;
sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); if (!sb) goto out_free_path;
ret = -EINVAL;
err = "failed to open device";
bdev_file = bdev_file_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); if (IS_ERR(bdev_file))
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.25 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.