// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * * Uses a block device as cache for other block devices; optimized for SSDs. * All allocation is done in buckets, which should match the erase block size * of the device. * * Buckets containing cached data are kept on a heap sorted by priority; * bucket priority is increased on cache hit, and periodically all the buckets * on the heap have their priority scaled down. This currently is just used as * an LRU but in the future should allow for more intelligent heuristics. * * Buckets have an 8 bit counter; freeing is accomplished by incrementing the * counter. Garbage collection is used to remove stale pointers. * * Indexing is done via a btree; nodes are not necessarily fully sorted, rather * as keys are inserted we only sort the pages that have not yet been written. * When garbage collection is run, we resort the entire node. * * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
*/
/* * Todo: * register_bcache: Return errors out to userspace correctly * * Writeback: don't undirty key until after a cache flush * * Create an iterator for key pointers * * On btree write error, mark bucket such that it won't be freed from the cache * * Journalling: * Check for bad keys in replay * Propagate barriers * Refcount journal entries in journal_replay * * Garbage collection: * Finish incremental gc * Gc should free old UUIDs, data for invalid UUIDs * * Provide a way to list backing device UUIDs we have data cached for, and * probably how long it's been since we've seen them, and a way to invalidate * dirty data for devices that will never be attached again * * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so * that based on that and how much dirty data we have we can keep writeback * from being starved * * Add a tracepoint or somesuch to watch for writeback starvation * * When btree depth > 1 and splitting an interior node, we have to make sure * alloc_bucket() cannot fail. This should be true but is not completely * obvious. * * Plugging? * * If data write is less than hard sector size of ssd, round up offset in open * bucket to the next whole sector * * Superblock needs to be fleshed out for multiple cache devices * * Add a sysfs tunable for the number of writeback IOs in flight * * Add a sysfs tunable for the number of open data buckets * * IO tracking: Can we track when one process is doing io on behalf of another? * IO tracking: Don't use just an average, weigh more recent stuff higher * * Test module load/unload
*/
/*
 * c->fill_iter can allocate an iterator with more memory space
 * than static MAX_BSETS.
 * See the comment around cache_set->fill_iter.
 */
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
iter->used = 0;
/* * If we're appending to a leaf node, we don't technically need FUA - * this write just needs to be persisted before the next journal write, * which will be marked FLUSH|FUA. * * Similarly if we're writing a new btree root - the pointer is going to * be in the next journal entry. * * But if we're writing a new btree node (that isn't a root) or * appending to a non leaf btree node, we need either FUA or a flush * when we write the parent with the new pointer. FUA is cheaper than a * flush, and writes appending to leaf nodes aren't blocking anything so * just make all btree node writes FUA to keep things sane.
*/
continue_at(cl, btree_node_write_done, NULL);
} else { /* * No problem for multipage bvec since the bio is * just allocated
*/
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
/* * do verify if there was more than one set initially (i.e. we did a * sort) and we sorted down to a single set:
*/ if (nsets && !b->keys.nsets)
bch_btree_verify(b);
if (!btree_node_dirty(b))
queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
set_btree_node_dirty(b);
/* * w->journal is always the oldest journal pin of all bkeys * in the leaf node, to make sure the oldest jset seq won't * be increased before this btree node is flushed.
*/ if (journal_ref) { if (w->journal &&
journal_pin_cmp(b->c, w->journal, journal_ref)) {
atomic_dec_bug(w->journal);
w->journal = NULL;
}
if (!w->journal) {
w->journal = journal_ref;
atomic_inc(w->journal);
}
}
/* Force write if set is too big */ if (set_bytes(i) > PAGE_SIZE - 48 &&
!current->bio_list)
bch_btree_node_write(b, NULL);
}
if (b->keys.page_order < min_order) goto out_unlock;
if (!flush) { if (btree_node_dirty(b)) goto out_unlock;
if (down_trylock(&b->io_mutex)) goto out_unlock;
up(&b->io_mutex);
}
retry: /* * BTREE_NODE_dirty might be cleared in btree_flush_btree() by * __bch_btree_node_write(). To avoid an extra flush, acquire * b->write_lock before checking BTREE_NODE_dirty bit.
*/
mutex_lock(&b->write_lock); /* * If this btree node is selected in btree_flush_write() by journal * code, delay and retry until the node is flushed by journal code * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
*/ if (btree_node_journal_flush(b)) {
pr_debug("bnode %p is flushing by journal, retry\n", b);
mutex_unlock(&b->write_lock);
udelay(1); goto retry;
}
if (btree_node_dirty(b))
__bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
closure_sync(&cl);
/* wait for any in flight btree write */
down(&b->io_mutex);
up(&b->io_mutex);
if (c->btree_cache_alloc_lock) return SHRINK_STOP;
/* Return -1 if we can't do anything right now */ if (sc->gfp_mask & __GFP_IO)
mutex_lock(&c->bucket_lock); elseif (!mutex_trylock(&c->bucket_lock)) return -1;
/* * It's _really_ critical that we don't free too many btree nodes - we * have to always leave ourselves a reserve. The reserve is how we * guarantee that allocating memory for a new btree node can always * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress:
*/
nr /= c->btree_pages; if (nr == 0)
nr = 1;
nr = min_t(unsignedlong, nr, mca_can_free(c));
i = 0;
btree_cache_used = c->btree_cache_used;
list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { if (nr <= 0) goto out;
while (!list_empty(&c->btree_cache)) {
b = list_first_entry(&c->btree_cache, struct btree, list);
/* * This function is called by cache_set_free(), no I/O * request on cache now, it is unnecessary to acquire * b->write_lock before clearing BTREE_NODE_dirty anymore.
*/ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags);
}
mca_data_free(b);
}
while (!list_empty(&c->btree_cache_freed)) {
b = list_first_entry(&c->btree_cache_freed, struct btree, list);
list_del(&b->list);
cancel_delayed_work_sync(&b->work);
kfree(b);
}
mutex_unlock(&c->bucket_lock);
}
int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned int i;

	/* Pre-allocate the reserve of btree node buffers. */
	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;
c->verify_ondisk = (void *)
__get_free_pages(GFP_KERNEL|__GFP_COMP,
ilog2(meta_bucket_pages(&c->cache->sb))); if (!c->verify_ondisk) { /* * Don't worry about the mca_rereserve buckets * allocated in previous for-loop, they will be * handled properly in bch_cache_set_unregister().
*/ return -ENOMEM;
}
/*
 * We can only have one thread cannibalizing other cached btree nodes at a
 * time, or we'll deadlock. We use an open coded mutex to ensure that, which
 * a cannibalize_bucket() will take. This means every time we unlock the
 * root of the btree, we need to release this lock if we have it held.
 */
void bch_cannibalize_unlock(struct cache_set *c)
{
	spin_lock(&c->btree_cannibalize_lock);
	if (c->btree_cache_alloc_lock == current) {
		/* Drop ownership and wake any waiters in mca_cannibalize_lock(). */
		c->btree_cache_alloc_lock = NULL;
		wake_up(&c->btree_cache_wait);
	}
	spin_unlock(&c->btree_cannibalize_lock);
}
/* btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b, &c->btree_cache_freeable, list) if (!mca_reap(b, btree_order(k), false)) goto out;
/* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, &c->btree_cache_freed, list) if (!mca_reap(b, 0, false)) {
mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->keys.set[0].data) goto err; else goto out;
}
b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); if (!b) goto err;
BUG_ON(!down_write_trylock(&b->lock)); if (!b->keys.set->data) goto err;
out:
BUG_ON(b->io_mutex.count != 1);
if (!b->level)
bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
&b->c->expensive_debug_checks); else
bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
&b->c->expensive_debug_checks);
return b;
err: if (b)
rw_unlock(true, b);
b = mca_cannibalize(c, op, k); if (!IS_ERR(b)) goto out;
return b;
}
/*
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 *
 * Note: Only error code or btree pointer will be returned, it is unnecessary
 * for callers to check NULL pointer.
*/ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, struct bkey *k, int level, bool write, struct btree *parent)
{ int i = 0; struct btree *b;
BUG_ON(level < 0);
retry:
b = mca_find(c, k);
if (!b) { if (current->bio_list) return ERR_PTR(-EAGAIN);
mutex_lock(&c->bucket_lock);
b = mca_alloc(c, op, k, level);
mutex_unlock(&c->bucket_lock);
retry:
mutex_lock(&b->write_lock); /* * If the btree node is selected and flushing in btree_flush_write(), * delay and retry until the BTREE_NODE_journal_flush bit cleared, * then it is safe to free the btree node here. Otherwise this btree * node will be in race condition.
*/ if (btree_node_journal_flush(b)) {
mutex_unlock(&b->write_lock);
pr_debug("bnode %p journal_flush set, retry\n", b);
udelay(1); goto retry;
}
if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
clear_bit(BTREE_NODE_dirty, &b->flags);
}
/*
 * Only error code or btree pointer will be returned, it is unnecessary for
 * callers to check NULL pointer.
*/ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, bool wait, struct btree *parent)
{
BKEY_PADDED(key) k; struct btree *b;
mutex_lock(&c->bucket_lock);
retry: /* return ERR_PTR(-EAGAIN) when it fails */
b = ERR_PTR(-EAGAIN); if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err;
if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { if (op)
prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
mutex_unlock(&c->bucket_lock); return -EINTR;
}
mutex_unlock(&c->bucket_lock);
return mca_cannibalize_lock(b->c, op);
}
/* Garbage collection */
static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	uint8_t stale = 0;
	unsigned int i;
	struct bucket *g;
/* * ptr_invalid() can't return true for the keys that mark btree nodes as * freed, but since ptr_bad() returns true we'll never actually use them * for anything and thus we don't want mark their pointers here
*/ if (!bkey_cmp(k, &ZERO_KEY)) return stale;
for (i = 0; i < KEY_PTRS(k); i++) { if (!ptr_available(c, k, i)) continue;
g = PTR_BUCKET(c, k, i);
if (gen_after(g->last_gc, PTR_GEN(k, i)))
g->last_gc = PTR_GEN(k, i);
if (ptr_stale(c, k, i)) {
stale = max(stale, ptr_stale(c, k, i)); continue;
}
for (i = 0; i < nodes; i++) {
new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); if (IS_ERR(new_nodes[i])) goto out_nocoalesce;
}
/* * We have to check the reserve here, after we've allocated our new * nodes, to make sure the insert below will succeed - we also check * before as an optimization to potentially avoid a bunch of expensive * allocs/sorts
*/ if (btree_check_reserve(b, NULL)) goto out_nocoalesce;
for (i = 0; i < nodes; i++)
mutex_lock(&new_nodes[i]->write_lock);
for (i = nodes - 1; i > 0; --i) { struct bset *n1 = btree_bset_first(new_nodes[i]); struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bkey *k, *last = NULL;
keys = 0;
if (i > 1) { for (k = n2->start;
k < bset_bkey_last(n2);
k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys +
bkey_u64s(k),
block_bytes(b->c->cache)) > blocks) break;
last = k;
keys += bkey_u64s(k);
}
} else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. Have to try and fit all of * the remaining keys into this node; we can't ensure * they will always fit due to rounding and variable * length keys (shouldn't be possible in practice, * though)
*/ if (__set_blocks(n1, n1->keys + n2->keys,
block_bytes(b->c->cache)) >
btree_blocks(new_nodes[i])) goto out_unlock_nocoalesce;
keys = n2->keys; /* Take the key of the node we're getting rid of */
last = &r->b->key;
}
for (i = 0; i < nodes; i++)
mutex_unlock(&new_nodes[i]->write_lock);
closure_sync(&cl);
/* We emptied out this node */
BUG_ON(btree_bset_first(new_nodes[0])->keys);
btree_node_free(new_nodes[0]);
rw_unlock(true, new_nodes[0]);
new_nodes[0] = NULL;
for (i = 0; i < nodes; i++) { if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) goto out_nocoalesce;
/* * Since incremental GC would stop 100ms when front * side I/O comes, so when there are many btree nodes, * if GC only processes constant (100) nodes each time, * GC would last a long time, and the front side I/Os * would run out of the buckets (since no new bucket * can be allocated during GC), and be blocked again. * So GC should not process constant nodes, but varied * nodes according to the number of btree nodes, which * realized by dividing GC into constant(100) times, * so when there are many btree nodes, GC can process * more nodes each time, otherwise, GC will process less * nodes each time (but no less than MIN_GC_NODES)
*/
min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; if (min_nodes < MIN_GC_NODES)
min_nodes = MIN_GC_NODES;
for (i = r; i < r + ARRAY_SIZE(r); i++)
i->b = ERR_PTR(-EINTR);
while (1) {
k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad); if (k) {
r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); if (IS_ERR(r->b)) {
ret = PTR_ERR(r->b); break;
}
r->keys = btree_gc_count_keys(r->b);
ret = btree_gc_coalesce(b, op, gc, r); if (ret) break;
}
if (!last->b) break;
if (!IS_ERR(last->b)) {
should_rewrite = btree_gc_mark_node(last->b, gc); if (should_rewrite) {
ret = btree_gc_rewrite_node(b, op, last->b); if (ret) break;
}
if (last->b->level) {
ret = btree_gc_recurse(last->b, op, writes, gc); if (ret) break;
}
bkey_copy_key(&b->c->gc_done, &last->b->key);
/* * Must flush leaf nodes before gc ends, since replace * operations aren't journalled
*/
mutex_lock(&last->b->write_lock); if (btree_node_dirty(last->b))
bch_btree_node_write(last->b, writes);
mutex_unlock(&last->b->write_lock);
rw_unlock(true, last->b);
}
if (atomic_read(&b->c->search_inflight) &&
gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
gc->nodes_pre = gc->nodes;
ret = -EAGAIN; break;
}
if (need_resched()) {
ret = -EAGAIN; break;
}
}
for (i = r; i < r + ARRAY_SIZE(r); i++) if (!IS_ERR_OR_NULL(i->b)) {
mutex_lock(&i->b->write_lock); if (btree_node_dirty(i->b))
bch_btree_node_write(i->b, writes);
mutex_unlock(&i->b->write_lock);
rw_unlock(true, i->b);
}
/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do {
ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&writes);
cond_resched();
if (ret == -EAGAIN)
schedule_timeout_interruptible(msecs_to_jiffies
(GC_SLEEP_MS)); elseif (ret)
pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
/* root node keys are checked before thread created */
bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k; while (k) { /* * Fetch a root node key index, skip the keys which * should be fetched by other threads, then check the * sub-tree indexed by the fetched key.
*/
spin_lock(&check_state->idx_lock);
cur_idx = check_state->key_idx;
check_state->key_idx++;
spin_unlock(&check_state->idx_lock);
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad); if (k)
p = k; else { /* * No more keys to check in root node, * current checking threads are enough, * stop creating more.
*/
atomic_set(&check_state->enough, 1); /* Update check_state->enough earlier */
smp_mb__after_atomic(); goto out;
}
skip_nr--;
cond_resched();
}
if (p) { struct btree_op op;
btree_node_prefetch(c->root, p);
c->gc_stats.nodes++;
bch_btree_op_init(&op, 0);
ret = bcache_btree(check_recurse, p, c->root, &op); /* * The op may be added to cache_set's btree_cache_wait * in mca_cannibalize(), must ensure it is removed from * the list and release btree_cache_alloc_lock before * free op memory. * Otherwise, the btree_cache_wait will be damaged.
*/
bch_cannibalize_unlock(c);
finish_wait(&c->btree_cache_wait, &(&op)->wait); if (ret) goto out;
}
p = NULL;
prev_idx = cur_idx;
cond_resched();
}
out:
info->result = ret; /* update check_state->started among all CPUs */
smp_mb__before_atomic(); if (atomic_dec_and_test(&check_state->started))
wake_up(&check_state->wait);
return ret;
}
/*
 * Number of parallel btree-check threads: half the online CPUs,
 * clamped to the range [1, BCH_BTR_CHKTHREAD_MAX].
 */
static int bch_btree_chkthread_nr(void)
{
	int n = num_online_cpus() / 2;

	if (n == 0)
		n = 1;
	else if (n > BCH_BTR_CHKTHREAD_MAX)
		n = BCH_BTR_CHKTHREAD_MAX;

	return n;
}
int bch_btree_check(struct cache_set *c)
{ int ret = 0; int i; struct bkey *k = NULL; struct btree_iter_stack iter; struct btree_check_state check_state;
/* check and mark root node keys */
for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(c, c->root->level, k);
rw_lock(0, c->root, c->root->level); /* * Run multiple threads to check btree nodes in parallel, * if check_state.enough is non-zero, it means current * running check threads are enough, unncessary to create * more.
*/ for (i = 0; i < check_state.total_threads; i++) { /* fetch latest check_state.enough earlier */
smp_mb__before_atomic(); if (atomic_read(&check_state.enough)) break;
check_state.infos[i].thread =
kthread_run(bch_btree_check_thread,
&check_state.infos[i], "bch_btrchk[%d]", i); if (IS_ERR(check_state.infos[i].thread)) {
pr_err("fails to run thread bch_btrchk[%d]\n", i); for (--i; i >= 0; i--)
kthread_stop(check_state.infos[i].thread);
ret = -ENOMEM; goto out;
}
atomic_inc(&check_state.started);
}
/* * Must wait for all threads to stop.
*/
wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
for (i = 0; i < check_state.total_threads; i++) { if (check_state.infos[i].result) {
ret = check_state.infos[i].result; goto out;
}
}
/* * We need to put some unused buckets directly on the prio freelist in * order to get the allocator thread started - it needs freed buckets in * order to rewrite the prios and gens, and it needs to rewrite prios * and gens in order to free buckets. * * This is only safe for buckets that have no live data in them, which * there should always be some of.
*/
for_each_bucket(b, ca) { if (fifo_full(&ca->free[RESERVE_PRIO]) &&
fifo_full(&ca->free[RESERVE_BTREE])) break;
if (bch_can_invalidate_bucket(ca, b) &&
!GC_MARK(b)) {
__bch_invalidate_one_bucket(ca, b); if (!fifo_push(&ca->free[RESERVE_PRIO],
b - ca->buckets))
fifo_push(&ca->free[RESERVE_BTREE],
b - ca->buckets);
}
}
if (n3) { /* Depth increases, make a new root */
mutex_lock(&n3->write_lock);
bkey_copy_key(&n3->key, &MAX_KEY);
bch_btree_insert_keys(n3, op, &parent_keys, NULL);
bch_btree_node_write(n3, &cl);
mutex_unlock(&n3->write_lock);
closure_sync(&cl);
bch_btree_set_root(n3);
rw_unlock(true, n3);
} elseif (!b->parent) { /* Root filled up but didn't need to be split */
closure_sync(&cl);
bch_btree_set_root(n1);
} else { /* Split a non root node */
closure_sync(&cl);
make_btree_freeing_key(b, parent_keys.top);
bch_keylist_push(&parent_keys);
/*
 * NOTE(review): the following is stray German website boilerplate that was
 * accidentally appended to this file and is not source code; it should be
 * removed. English translation, preserved for reference:
 * "The information on this website was carefully compiled to the best of our
 * knowledge. However, neither completeness, nor correctness, nor quality of
 * the provided information is guaranteed.
 * Remark: the colored syntax highlighting and the measurement are still
 * experimental."
 */