/* * Journal replay/recovery: * * This code is all driven from run_cache_set(); we first read the journal * entries, do some other stuff, then we mark all the keys in the journal * entries (same as garbage collection would), then we replay them - reinserting * them into the cache in precisely the same order as they appear in the * journal. * * We only journal keys that go in leaf nodes, which simplifies things quite a * bit.
*/
staticvoid journal_read_endio(struct bio *bio)
{ struct closure *cl = bio->bi_private;
/* This function could be simpler now since we no longer write * journal entries that overlap bucket boundaries; this means * the start of a bucket will always have a valid journal entry * if it has any journal entries at all.
*/
if (j->magic != jset_magic(&ca->sb)) {
pr_debug("%u: bad magic\n", bucket_index); return ret;
}
if (bytes > left << 9 ||
bytes > PAGE_SIZE << JSET_BITS) {
pr_info("%u: too big, %zu bytes, offset %u\n",
bucket_index, bytes, offset); return ret;
}
if (bytes > len << 9) goto reread;
if (j->csum != csum_set(j)) {
pr_info("%u: bad csum, %zu bytes, offset %u\n",
bucket_index, bytes, offset); return ret;
}
blocks = set_blocks(j, block_bytes(ca));
/* * Nodes in 'list' are in linear increasing order of * i->j.seq, the node on head has the smallest (oldest) * journal seq, the node on tail has the biggest * (latest) journal seq.
*/
/* * Check from the oldest jset for last_seq. If * i->j.seq < j->last_seq, it means the oldest jset * in list is expired and useless, remove it from * this list. Otherwise, j is a candidate jset for * further following checks.
*/ while (!list_empty(list)) {
i = list_first_entry(list, struct journal_replay, list); if (i->j.seq >= j->last_seq) break;
list_del(&i->list);
kfree(i);
}
/* iterate list in reverse order (from latest jset) */
list_for_each_entry_reverse(i, list, list) { if (j->seq == i->j.seq) goto next_set;
/* * if j->seq is less than any i->j.last_seq * in list, j is an expired and useless jset.
*/ if (j->seq < i->j.last_seq) goto next_set;
/* * 'where' points to first jset in list which * is elder then j.
*/ if (j->seq > i->j.seq) {
where = &i->list; goto add;
}
}
where = list;
add:
i = kmalloc(offsetof(struct journal_replay, j) +
bytes, GFP_KERNEL); if (!i) return -ENOMEM;
unsafe_memcpy(&i->j, j, bytes, /* "bytes" was calculated by set_bytes() above */); /* Add to the location after 'where' points to */
list_add(&i->list, where);
ret = 1;
/* * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries
*/ for (i = 0; i < ca->sb.njournal_buckets; i++) { /* * We must try the index l with ZERO first for * correctness due to the scenario that the journal * bucket is circular buffer which might have wrapped
*/
l = (i * 2654435769U) % ca->sb.njournal_buckets;
if (test_bit(l, bitmap)) break;
if (read_bucket(l)) goto bsearch;
}
/* * If that fails, check all the buckets we haven't checked * already
*/
pr_debug("falling back to linear search\n");
for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) if (read_bucket(l)) goto bsearch;
/* no journal entries on this device? */ if (l == ca->sb.njournal_buckets) goto out;
bsearch:
BUG_ON(list_empty(list));
/* Binary search */
m = l;
r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
pr_debug("starting binary search, l %u r %u\n", l, r);
if (seq != list_entry(list->prev, struct journal_replay,
list)->j.seq)
l = m; else
r = m;
}
/* * Read buckets in reverse order until we stop finding more * journal entries
*/
pr_debug("finishing up: m %u njournal_buckets %u\n",
m, ca->sb.njournal_buckets);
l = m;
while (1) { if (!l--)
l = ca->sb.njournal_buckets - 1;
if (l == m) break;
if (test_bit(l, bitmap)) continue;
if (!read_bucket(l)) break;
}
seq = 0;
for (i = 0; i < ca->sb.njournal_buckets; i++) if (ja->seq[i] > seq) {
seq = ja->seq[i]; /* * When journal_reclaim() goes to allocate for * the first time, it'll use the bucket after * ja->cur_idx
*/
ja->cur_idx = i;
ja->last_idx = ja->discard_idx = (i + 1) %
ca->sb.njournal_buckets;
}
out: if (!list_empty(list))
c->journal.seq = list_entry(list->prev, struct journal_replay,
list)->j.seq;
/* * journal.pin should never fill up - we never write a journal * entry when it would fill up. But if for some reason it does, we * iterate over the list in reverse order so that we can just skip that * refcount instead of bugging.
*/
/* get the oldest journal entry and check its refcount */
spin_lock(&c->journal.lock);
fifo_front_p = &fifo_front(&c->journal.pin);
ref_nr = atomic_read(fifo_front_p); if (ref_nr <= 0) { /* * do nothing if no btree node references * the oldest journal entry
*/
spin_unlock(&c->journal.lock); goto out;
}
spin_unlock(&c->journal.lock);
mask = c->journal.pin.mask;
nr = 0;
atomic_long_inc(&c->flush_write);
memset(btree_nodes, 0, sizeof(btree_nodes));
mutex_lock(&c->bucket_lock);
list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { /* * It is safe to get now_fifo_front_p without holding * c->journal.lock here, because we don't need to know * the exactly accurate value, just check whether the * front pointer of c->journal.pin is changed.
*/
now_fifo_front_p = &fifo_front(&c->journal.pin); /* * If the oldest journal entry is reclaimed and front * pointer of c->journal.pin changes, it is unnecessary * to scan c->btree_cache anymore, just quit the loop and * flush out what we have already.
*/ if (now_fifo_front_p != fifo_front_p) break; /* * quit this loop if all matching btree nodes are * scanned and record in btree_nodes[] already.
*/
ref_nr = atomic_read(fifo_front_p); if (nr >= ref_nr) break;
if (btree_node_journal_flush(b))
pr_err("BUG: flush_write bit should not be set here!\n");
mutex_lock(&b->write_lock);
if (!btree_node_dirty(b)) {
mutex_unlock(&b->write_lock); continue;
}
if (!btree_current_write(b)->journal) {
mutex_unlock(&b->write_lock); continue;
}
/* * Only select the btree node which exactly references * the oldest journal entry. * * If the journal entry pointed by fifo_front_p is * reclaimed in parallel, don't worry: * - the list_for_each_xxx loop will quit when checking * next now_fifo_front_p. * - If there are matched nodes recorded in btree_nodes[], * they are clean now (this is why and how the oldest * journal entry can be reclaimed). These selected nodes * will be ignored and skipped in the following for-loop.
*/ if (((btree_current_write(b)->journal - fifo_front_p) &
mask) != 0) {
mutex_unlock(&b->write_lock); continue;
}
set_btree_node_journal_flush(b);
mutex_unlock(&b->write_lock);
btree_nodes[nr++] = b; /* * To avoid holding c->bucket_lock too long time, * only scan for BTREE_FLUSH_NR matched btree nodes * at most. If there are more btree nodes reference * the oldest journal entry, try to flush them next * time when btree_flush_write() is called.
*/ if (nr == BTREE_FLUSH_NR) break;
}
mutex_unlock(&c->bucket_lock);
for (i = 0; i < nr; i++) {
b = btree_nodes[i]; if (!b) {
pr_err("BUG: btree_nodes[%d] is NULL\n", i); continue;
}
/* safe to check without holding b->write_lock */ if (!btree_node_journal_flush(b)) {
pr_err("BUG: bnode %p: journal_flush bit cleaned\n", b); continue;
}
mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) {
clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
pr_debug("bnode %p: written by others\n", b); continue;
}
if (!btree_node_dirty(b)) {
clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
pr_debug("bnode %p: dirty bit cleaned by others\n", b); continue;
}
/* In case njournal_buckets is not power of 2 */ if (ja->cur_idx >= ja->discard_idx)
n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; else
n = ja->discard_idx - ja->cur_idx;
if (n > (1 + j->do_reserve)) return n - (1 + j->do_reserve);
/* * The fifo_push() needs to happen at the same time as j->seq is * incremented for last_seq() to be calculated correctly
*/
BUG_ON(!fifo_push(&j->pin, p));
atomic_set(&fifo_back(&j->pin), 1);
if (!journal_full(&c->journal)) { if (wait)
trace_bcache_journal_entry_full(c);
/* * XXX: If we were inserting so many keys that they * won't fit in an _empty_ journal write, we'll * deadlock. For now, handle this in * bch_keylist_realloc() - but something to think about.
*/
BUG_ON(!w->data->keys);
journal_try_write(c); /* unlocks */
} else { if (wait)
trace_bcache_journal_full(c);
/* * Entry point to the journalling code - bio_insert() and btree_invalidate() * pass bch_journal() a list of keys to be journalled, and then * bch_journal() hands those same keys off to btree_insert_async()
*/
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.