// SPDX-License-Identifier: GPL-2.0
/*
 * Main bcache entry point - handle a read or a write request and decide what to
 * do with it; the make_request functions are called by the block layer.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
/* * The journalling code doesn't handle the case where the keys to insert * is bigger than an empty write: If we just return -ENOMEM here, * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty.
*/ if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset)) return -ENOMEM;
/* * Our data write just errored, which means we've got a bunch of keys to * insert that point to data that wasn't successfully written. * * We don't have to insert those keys but we still have to invalidate * that region of the cache - so, if we just strip off all the pointers * from the keys we'll accomplish just that.
*/
if (bio->bi_status) { /* TODO: We could try to recover from this. */ if (op->writeback)
op->status = bio->bi_status; elseif (!op->replace)
set_closure_fn(cl, bch_data_insert_error, op->wq); else
set_closure_fn(cl, NULL, NULL);
}
bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}
if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
wake_up_gc(op->c);
/* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write.
*/
bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
/* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys,
3 + (op->csum ? 1 : 0),
op->c)) {
continue_at(cl, bch_data_insert_keys, op->wq); return;
}
k = op->insert_keys.top;
bkey_init(k);
SET_KEY_INODE(k, op->inode);
SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
op->write_point, op->write_prio,
op->writeback)) goto err;
n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
/* * But if it's not a writeback write we'd rather just bail out if * there aren't any buckets ready to write to - it might take awhile and * we might be starving btree writes for gc or something.
*/
if (!op->replace) { /* * Writethrough write: We can't complete the write until we've * updated the index. But we don't want to delay the write while * we wait for buckets to be freed up, so just invalidate the * rest of the write.
*/
op->bypass = true; return bch_data_invalidate(cl);
} else { /* * From a cache miss, we can just insert the keys for the data * we have written or bail out if we didn't do anything.
*/
op->insert_data_done = true;
bio_put(bio);
if (!bch_keylist_empty(&op->insert_keys))
continue_at(cl, bch_data_insert_keys, op->wq); else
closure_return(cl);
}
}
/** * bch_data_insert - stick some data in the cache * @cl: closure pointer. * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only * volume - it's also used by the moving garbage collector to compact data in * mostly empty buckets. * * It first writes the data to the cache, creating a list of keys to be inserted * (if the data had to be fragmented there will be multiple keys); after the * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * * It inserts the data in op->bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * * If op->bypass is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode.
*/
CLOSURE_CALLBACK(bch_data_insert)
{
closure_type(op, struct data_insert_op, cl);
/* * Congested? Return 0 (not congested) or the limit (in sectors) * beyond which we should bypass the cache due to congestion.
*/ unsignedint bch_get_congested(conststruct cache_set *c)
{ int i;
if (!c->congested_read_threshold_us &&
!c->congested_write_threshold_us) return 0;
i = (local_clock_us() - c->congested_last_us) / 1024; if (i < 0) return 0;
i += atomic_read(&c->congested); if (i >= 0) return 0;
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
(bio_op(bio) == REQ_OP_DISCARD)) goto skip;
if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) { /* * If cached buckets are all clean now, 'true' will be * returned and all requests will bypass the cache device. * Then c->sectors_to_gc has no chance to be negative, and * gc thread won't wake up and caching won't work forever. * Here call force_wake_up_gc() to avoid such aftermath.
*/ if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN &&
c->gc_mark_valid)
force_wake_up_gc(c);
/* * If the bio is for read-ahead or background IO, bypass it or * not depends on the following situations, * - If the IO is for meta data, always cache it and no bypass * - If the IO is not meta data, check dc->cache_reada_policy, * BCH_CACHE_READA_ALL: cache it and not bypass * BCH_CACHE_READA_META_ONLY: not cache it and bypass * That is, read-ahead request for metadata always get cached * (eg, for gfs2 or xfs).
*/ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
(dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) goto skip;
}
/* * If the bucket was reused while our bio was in flight, we might have * read the wrong data. Set s->error but not error so it doesn't get * counted against the cache device, but we'll still reread the data * from the backing device.
*/
bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}
/* * Read from a single key, handling the initial cache miss if the key starts in * the middle of the bio
*/ staticint cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{ struct search *s = container_of(op, struct search, op); struct bio *n, *bio = &s->bio.bio; struct bkey *bio_key; unsignedint ptr;
if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) return MAP_CONTINUE;
/* * The bucket we're reading from might be reused while our bio * is in flight, and we could then end up reading the wrong * data. * * We guard against this by checking (in cache_read_endio()) if * the pointer is stale again; if so, we treat it as an error * and reread from the backing device (but we don't pass that * error up anywhere).
*/
__bch_submit_bbio(n, b->c); return n == bio ? MAP_DONE : MAP_CONTINUE;
}
/*
 * cache_lookup - closure callback that walks the btree for the keys
 * covering the request in s->bio.
 *
 * Maps cache_lookup_fn over the btree keys starting at the bio's first
 * sector. On -EAGAIN (btree contention) it reschedules itself on
 * bcache_wq and returns. On any other error it marks the request
 * non-recoverable when the device is a cached (non flash-only) device
 * with dirty data — recovering from the backing device could return
 * stale data — and sets BLK_STS_IOERR if no status was recorded yet.
 */
static CLOSURE_CALLBACK(cache_lookup)
{
	closure_type(s, struct search, iop.cl);
	struct bio *bio = &s->bio.bio;
	struct cached_dev *dc;
	int ret;

	bch_btree_op_init(&s->op, -1);

	ret = bch_btree_map_keys(&s->op, s->iop.c,
				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
				 cache_lookup_fn, MAP_END_KEY);
	if (ret == -EAGAIN) {
		/* Couldn't get btree locks; retry this lookup later. */
		continue_at(cl, cache_lookup, bcache_wq);
		return;
	}

	/*
	 * We might meet err when searching the btree. If that happens, we will
	 * get negative ret; in this scenario we should not recover data from
	 * the backing device (when the cache device is dirty) because we don't
	 * know whether the bkeys the read request covered are all clean.
	 *
	 * And after that happened, s->iop.status is still its initial value
	 * before we submit s->bio.bio.
	 */
	if (ret < 0) {
		BUG_ON(ret == -EINTR);
		if (s->d && s->d->c &&
		    !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
			dc = container_of(s->d, struct cached_dev, disk);
			if (dc && atomic_read(&dc->has_dirty))
				s->recoverable = false;
		}
		if (!s->iop.status)
			s->iop.status = BLK_STS_IOERR;
	}

	closure_return(cl);
}
/* Common code for the make_request functions */
staticvoid request_endio(struct bio *bio)
{ struct closure *cl = bio->bi_private;
s->iop.status = bio->bi_status; /* Only cache read errors are recoverable */
s->recoverable = false;
}
bio_put(bio);
closure_put(cl);
}
/*
 * backing_request_endio - completion for bios submitted to the backing
 * device.
 *
 * On error, records the status on the insert op (except for the
 * writeback-mode PREFLUSH bio, whose outcome is decided by the
 * bch_data_insert() path instead), marks the request non-recoverable,
 * and counts the I/O error against the backing device. Always drops
 * the bio and the closure reference taken at submission time.
 */
static void backing_request_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	if (bio->bi_status) {
		struct search *s = container_of(cl, struct search, cl);
		struct cached_dev *dc = container_of(s->d,
						     struct cached_dev, disk);
		/*
		 * If a bio has REQ_PREFLUSH for writeback mode, it is
		 * specially assembled in cached_dev_write() for a non-zero
		 * write request which has REQ_PREFLUSH. We don't set
		 * s->iop.status on this failure; the status will be decided
		 * by the result of the bch_data_insert() operation.
		 */
		if (unlikely(s->iop.writeback &&
			     bio->bi_opf & REQ_PREFLUSH)) {
			pr_err("Can't flush %pg: returned bi_status %i\n",
			       dc->bdev, bio->bi_status);
		} else {
			/* set to orig_bio->bi_status in bio_complete() */
			s->iop.status = bio->bi_status;
		}

		s->recoverable = false;
		/* should count I/O error for backing device here */
		bch_count_backing_io_errors(dc, bio);
	}

	bio_put(bio);
	closure_put(cl);
}
/*
 * do_bio_hook - initialize the search's embedded bio as a clone of
 * orig_bio, with end_io_fn as its completion and the search closure
 * as the private cookie.
 */
static void do_bio_hook(struct search *s,
			struct bio *orig_bio,
			bio_end_io_t *end_io_fn)
{
	struct bio *bio = &s->bio.bio;

	bio_init_clone(orig_bio->bi_bdev, bio, orig_bio, GFP_NOIO);
	/*
	 * bi_end_io can be set separately somewhere else, e.g. the
	 * variants in,
	 * - cache_bio->bi_end_io from cached_dev_cache_miss()
	 * - n->bi_end_io from cache_lookup_fn()
	 */
	bio->bi_end_io		= end_io_fn;
	bio->bi_private		= &s->cl;
}
/* * If read request hit dirty data (s->read_dirty_data is true), * then recovery a failed read request from cached device may * get a stale data back. So read failure recovery is only * permitted when read request hit clean data in cache device, * or when cache read race happened.
*/ if (s->recoverable && !s->read_dirty_data) { /* Retry from the backing device: */
trace_bcache_read_retry(s->orig_bio);
/* * We had a cache miss; cache_bio now contains data ready to be inserted * into the cache. * * First, we copy the data we just read from cache_bio's bounce buffers * to the buffers the original bio pointed to:
*/
staticint cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsignedint sectors)
{ int ret = MAP_CONTINUE; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; unsignedint size_limit;
s->cache_missed = 1;
if (s->cache_miss || s->iop.bypass) {
miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit;
}
/* Limitation for valid replace key size and cache_bio bvecs number */
size_limit = min_t(unsignedint, BIO_MAX_VECS * PAGE_SECTORS,
(1 << KEY_SIZE_BITS) - 1);
s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio));
down_read_non_owner(&dc->writeback_lock); if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { /* * We overlap with some dirty data undergoing background * writeback, force this write to writeback
*/
s->iop.bypass = false;
s->iop.writeback = true;
}
/* * Discards aren't _required_ to do anything, so skipping if * check_overlapping returned true is ok * * But check_overlapping drops dirty keys for which io hasn't started, * so we still want to call it.
*/ if (bio_op(bio) == REQ_OP_DISCARD)
s->iop.bypass = true;
/* * no need to call closure_get(&dc->disk.cl), * because upper layer had already opened bcache device, * which would call closure_get(&dc->disk.cl)
*/
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); if (!ddip) {
bio->bi_status = BLK_STS_RESOURCE;
bio->bi_end_io(bio); return;
}
/* * mutex bch_register_lock may compete with other parallel requesters, * or attach/detach operations on other backing device. Waiting to * the mutex lock may increase I/O request latency for seconds or more. * To avoid such situation, if mutext_trylock() failed, only writeback * rate of current cached device is set to 1, and __update_write_back() * will decide writeback rate of other cached devices (remember now * c->idle_counter is 0 already).
*/ if (mutex_trylock(&bch_register_lock)) { for (i = 0; i < c->devices_max_used; i++) { if (!c->devices[i]) continue;
if (UUID_FLASH_ONLY(&c->uuids[i])) continue;
d = c->devices[i];
dc = container_of(d, struct cached_dev, disk); /* * set writeback rate to default minimum value, * then let update_writeback_rate() to decide the * upcoming rate.
*/
atomic_long_set(&dc->writeback_rate.rate, 1);
}
mutex_unlock(&bch_register_lock);
} else
atomic_long_set(&this_dc->writeback_rate.rate, 1);
}
if (likely(d->c)) { if (atomic_read(&d->c->idle_counter))
atomic_set(&d->c->idle_counter, 0); /* * If at_max_writeback_rate of cache set is true and new I/O * comes, quit max writeback rate of all cached devices * attached to this cache set, and set at_max_writeback_rate * to false.
*/ if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
atomic_set(&d->c->at_max_writeback_rate, 0);
quit_max_writeback_rate(d->c, dc);
}
}
/*
 * NOTE(review): the following lines are extraneous German website
 * boilerplate ("the information on this website was carefully compiled
 * to the best of our knowledge; however, no guarantee is given for the
 * completeness, correctness, or quality of the information provided.
 * Note: the syntax colouring and the measurement are still
 * experimental") that leaked into this file during extraction. It is
 * not part of the bcache source and should be removed:
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfaeltig zusammengestellt. Es wird jedoch weder
 * Vollstaendigkeit, noch Richtigkeit,
 * noch Qualitaet der bereit gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */