/* * This should really be done in slub/vmalloc, but we're using the * kmalloc_large() path, so we're working around a slub bug by doing * this here:
*/ if (b->data)
mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); if (b->aux_data)
mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
if (btree_node_noevict(b)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; return bch_err_throw(c, ENOMEM_btree_node_reclaim);
} if (btree_node_write_blocked(b)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; return bch_err_throw(c, ENOMEM_btree_node_reclaim);
} if (btree_node_will_make_reachable(b)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (btree_node_dirty(b)) { if (!flush) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (locked) { /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted * - unless btree verify mode is enabled, since it runs out of * the post write cleanup:
*/ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
bch2_btree_node_write(c, b, SIX_LOCK_intent,
BTREE_WRITE_cache_reclaim); else
__bch2_btree_node_write(c, b,
BTREE_WRITE_cache_reclaim);
}
}
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) { if (!flush) { if (btree_node_read_in_flight(b))
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; elseif (btree_node_write_in_flight(b))
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (locked) return -EINTR;
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
return 0;
}
/* * this version is for btree nodes that have already been freed (we're not * reaping a real btree node)
*/ staticint __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{ struct btree_cache *bc = &c->btree_cache; int ret = 0;
lockdep_assert_held(&bc->lock);
retry_unlocked:
ret = __btree_node_reclaim_checks(c, b, flush, false); if (ret) return ret;
if (!six_trylock_intent(&b->c.lock)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (!six_trylock_write(&b->c.lock)) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
six_unlock_intent(&b->c.lock); return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
/* recheck under lock */
ret = __btree_node_reclaim_checks(c, b, flush, true); if (ret) {
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock); if (ret == -EINTR) goto retry_unlocked; return ret;
}
/* * It's _really_ critical that we don't free too many btree nodes - we * have to always leave ourselves a reserve. The reserve is how we * guarantee that allocating memory for a new btree node can always * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress:
*/
can_free = btree_cache_can_free(list); if (nr > can_free) {
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
nr = can_free;
}
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) { /* * Leave a few nodes on the freeable list, so that a btree split * won't have to hit the system allocator:
*/ if (++i <= 3) continue;
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
BUG_ON(bc->nr_by_btree[i]);
BUG_ON(bc->live[0].nr);
BUG_ON(bc->live[1].nr);
BUG_ON(bc->nr_freeable);
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
int bch2_fs_btree_cache_init(struct bch_fs *c)
{ struct btree_cache *bc = &c->btree_cache; struct shrinker *shrink; unsigned i; int ret = 0;
ret = rhashtable_init(&bc->table, &bch_btree_cache_params); if (ret) goto err;
bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->nr_reserve; i++) { struct btree *b = __bch2_btree_node_mem_alloc(c); if (!b) goto err;
__bch2_btree_node_to_freelist(bc, b);
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock); for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
bc->live[i].idx = i;
INIT_LIST_HEAD(&bc->live[i].list);
}
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/* * We can only have one thread cannibalizing other cached btree nodes at a time, * or we'll deadlock. We use an open coded mutex to ensure that, which a * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held.
*/ void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{ struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache;
old = NULL; if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) goto success;
if (!cl) {
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
}
closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
old = NULL; if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) { /* We raced */
closure_wake_up(&bc->alloc_wait); goto success;
}
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list) if (!btree_node_reclaim(c, b)) return b;
while (1) { for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list) if (!btree_node_write_and_reclaim(c, b)) return b;
/* * Rare case: all nodes were intent-locked. * Just busy-wait.
*/
WARN_ONCE(1, "btree cache cannibalize failed\n");
cond_resched();
}
}
/* * We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list) if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list); goto got_node;
}
b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); if (b) {
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
} else {
mutex_unlock(&bc->lock);
bch2_trans_unlock(trans);
b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err;
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
mutex_lock(&bc->lock);
}
got_node: /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list) if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
if (unlikely(level >= BTREE_MAX_DEPTH)) { int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
level, BTREE_MAX_DEPTH); return ERR_PTR(ret);
}
int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
printbuf_exit(&buf); return ERR_PTR(ret);
}
/* * Parent node must be locked, else we could read in a btree node that's * been freed:
*/ if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
}
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { if (!path) return b;
EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
b = btree_cache_find(bc, k); if (unlikely(!b)) { /* * We must have the parent locked to call bch2_btree_node_fill(), * else we could read in a btree node from disk that's been * freed:
*/
b = bch2_btree_node_fill(trans, path, k, path->btree_id,
level, lock_type, true);
need_relock = true;
/* We raced and found the btree node in the cache */ if (!b) goto retry;
if (IS_ERR(b)) return b;
} else { if (btree_node_read_locked(path, level + 1))
btree_node_unlock(trans, path, level + 1);
ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret);
ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret);
/* * should_be_locked is not set on this path yet, so we need to * relock it specifically:
*/ if (!six_relock_type(&b->c.lock, lock_type, seq)) goto retry;
}
if (unlikely(need_relock)) {
ret = bch2_trans_relock(trans) ?:
bch2_btree_path_relock_intent(trans, path); if (ret) {
six_unlock_type(&b->c.lock, lock_type); return ERR_PTR(ret);
}
}
/** * bch2_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * * @trans: btree transaction object * @path: btree_path being traversed * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) * @level: level of btree node being looked up (0 == leaf node) * @lock_type: SIX_LOCK_read or SIX_LOCK_intent * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) * * The btree node will have either a read or a write lock held, depending on * the @write parameter. * * Returns: btree node or ERR_PTR()
*/ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, conststruct bkey_i *k, unsigned level, enum six_lock_type lock_type, unsignedlong trace_ip)
{ struct bch_fs *c = trans->c; struct btree *b; int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_node_mem_ptr(k);
/* * Check b->hash_val _before_ calling btree_node_lock() - this might not * be the node we want anymore, and trying to lock the wrong node could * cause an unneccessary transaction restart:
*/ if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
!b ||
b->hash_val != btree_ptr_hash_val(k))) return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(trans, path, level + 1);
ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret);
if (c->opts.btree_node_mem_ptr_optimization) {
b = btree_node_mem_ptr(k); if (b) goto lock_node;
}
retry:
b = btree_cache_find(bc, k); if (unlikely(!b)) { if (nofill) goto out;
b = bch2_btree_node_fill(trans, NULL, k, btree_id,
level, SIX_LOCK_read, true);
/* We raced and found the btree node in the cache */ if (!b) goto retry;
if (IS_ERR(b) &&
!bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry;
if (IS_ERR(b)) goto out;
} else {
lock_node:
ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret);
struct btree *b = btree_cache_find(bc, k); if (b) return 0;
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false); int ret = PTR_ERR_OR_ZERO(b); if (ret) return ret; if (b)
six_unlock_read(&b->c.lock); return 0;
}
BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io: /* not allowed to wait on io with btree locks held: */
/* XXX we're called from btree_gc which will be holding other btree * nodes locked
*/
__bch2_btree_node_wait_on_read(b);
__bch2_btree_node_wait_on_write(b);
for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
prt_printf(out, " %s\t%llu\n",
bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.