return &op->write.rbio;
err_remove_list:
async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params));
err:
bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */
kfree_rcu(op, rcu);
err_put:
enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); return ERR_PTR(ret);
}
/*
 * NOTE(review): this block appears corrupted by extraction: the signature
 * below says "staticstruct" (missing space - will not compile), and after
 * the should_promote() call the body abruptly becomes a fragment of a
 * *different* function that operates on an undeclared variable 'rbio';
 * the "nopromote" label jumped to below is also absent.  Code left
 * byte-identical; restore from the upstream bcachefs sources.
 */
noinline staticstruct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, unsigned flags, struct bch_read_bio *orig, bool *bounce, bool *read_full, struct bch_io_failures *failed)
{
/*
 * We're in the retry path, but we don't know what to repair yet, and we
 * don't want to do a promote here:
 */
if (failed && !failed->nr) return NULL;
struct bch_fs *c = trans->c;
/*
 * if failed != NULL we're not actually doing a promote, we're
 * recovering from an io/checksum error
 */
bool promote_full = (have_io_error(failed) ||
*read_full ||
READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
: bvec_iter_sectors(iter); struct bpos pos = promote_full
? bkey_start_pos(k.k)
: POS(k.k->p.inode, iter.bi_sector); int ret;
ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote;
/*
 * NOTE(review): from here down the code belongs to a different function
 * (teardown of a split read bio) - 'rbio' is not declared in this scope.
 */
if (rbio->split) { struct bch_read_bio *parent = rbio->parent;
if (unlikely(rbio->promote)) { if (!rbio->bio.bi_status)
promote_start(rbio); else
promote_free(rbio);
} else {
async_object_list_del(rbio->c, rbio, rbio->list_idx);
if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
bio_put(&rbio->bio);
}
rbio = parent;
}
return rbio;
}
/*
 * Only called on a top level bch_read_bio to complete an entire read
 * request, not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	/* Account total read latency if the caller recorded a start time: */
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
	/* Drop the debug async-object-list entry, if one was allocated: */
	if (rbio->list_idx)
		async_object_list_del(rbio->c, rbio, rbio->list_idx);
#endif
	bio_endio(&rbio->bio);
}
/*
 * NOTE(review): this function is truncated - the section that actually
 * sets the poisoned flag and builds the updated key is missing: 'new' is
 * used below but never declared, 'ret' is re-tested ("u && !ret") right
 * after an unconditional early "return ret", and the "out:" label has no
 * matching goto in the visible text.  Code left byte-identical; restore
 * from the upstream bcachefs sources.
 */
static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, enum btree_id btree, struct bkey_s_c read_k)
{ if (!bch2_poison_extents_on_checksum_error) return 0;
struct bch_fs *c = trans->c;
/* For data-update (move) reads, operate on the key the update carries: */
struct data_update *u = rbio_data_update(rbio); if (u)
read_k = bkey_i_to_s_c(u->k.k);
/* Nothing to do if the extent is already marked poisoned: */
u64 flags = bch2_bkey_extent_flags(read_k); if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) return 0;
struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
BTREE_ITER_intent); int ret = bkey_err(k); if (ret) return ret;
/*
 * Propagate key change back to data update path, in particular so it
 * knows the extent has been poisoned and it's safe to change the
 * checksum
 */
if (u && !ret)
bch2_bkey_buf_copy(&u->k, c, new);
out:
bch2_trans_iter_exit(trans, &iter); return ret;
}
/*
 * NOTE(review): orphan fragment - this looks like the body of a
 * checksum-narrowing transaction helper (cf. bch2_rbio_narrow_crcs), but
 * the enclosing function header, the declarations of 'k', 'iter', 'ret',
 * 'new', 'new_crc' and 'data_offset', and the "out:" label are missing
 * from this chunk.  Code left byte-identical.
 */
k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
BTREE_ITER_slots|BTREE_ITER_intent); if ((ret = bkey_err(k))) goto out;
/* Bail if the key changed since the read was issued: */
if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out;
/* Extent was merged? */
if (bkey_start_offset(k.k) < data_offset ||
k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out;
if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
rbio->pick.crc, NULL, &new_crc,
bkey_start_offset(k.k) - data_offset, k.k->size,
rbio->pick.crc.csum_type)) {
bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
ret = 0; goto out;
}
/*
 * going to be temporarily appending another checksum entry:
 */
new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_extent_crc128)); if ((ret = PTR_ERR_OR_ZERO(new))) goto out;
bkey_reassemble(new, k);
if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out;
/*
 * NOTE(review): orphan fragment - read-completion / decrypt-decompress
 * code with no visible enclosing function; 'csum_good', 'parent', 'crc',
 * 'nonce', 'src', 'dst', 'dst_iter', 'ret' and the labels jumped to are
 * all declared elsewhere.  Code left byte-identical.
 */
/*
 * Checksum error: if the bio wasn't bounced, we may have been
 * reading into buffers owned by userspace (that userspace can
 * scribble over) - retry the read, bouncing it this time:
 */
if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
rbio->flags |= BCH_READ_must_bounce;
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
BLK_STS_IOERR); goto out;
}
/*
 * XXX
 * We need to rework the narrow_crcs path to deliver the read completion
 * first, and then punt to a different workqueue, otherwise we're
 * holding up reads while doing btree updates which is bad for memory
 * reclaim.
 */
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);
/* Adjust crc to point to subset of data we want: */
if (likely(!parent->data_update)) {
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc_is_compressed(crc)) {
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err;
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
!c->opts.no_data_io) goto decompression_err;
} else {
/* don't need to decrypt the entire bio: */
nonce = nonce_add(nonce, crc.offset << 9);
bio_advance(src, crc.offset << 9);
if (rbio->promote) {
/*
 * Re encrypt data we decrypted, so it's consistent with
 * rbio->crc:
 */
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err;
}
/*
 * NOTE(review): orphan fragment - extent-pick / rbio setup code (looks
 * like the interior of __bch2_read_extent); the enclosing function
 * header plus declarations of 'ca', 'iter', 'buf', 'pick', 'flags', 'u',
 * 'rbio', 'orig' and several jump labels are missing from this chunk.
 * Also note "elseif" below (twice) - missing space, will not compile.
 * Code left byte-identical.
 */
int gen = bucket_gen_get(ca, iter.pos.offset); if (gen >= 0) {
prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
printbuf_indent_add(&buf, 2);
/*
 * Stale dirty pointers are treated as IO errors, but @failed isn't
 * allocated unless we're in the retry path - so if we're not in the
 * retry path, don't check here, it'll be caught in bch2_read_endio()
 * and we'll end up in the retry path:
 */
if ((flags & BCH_READ_in_retry) &&
!pick.ptr.cached &&
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick, false);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); goto retry_pick;
}
if (likely(!u)) { if (!(flags & BCH_READ_last_fragment) ||
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_must_clone;
if (crc_is_compressed(pick.crc) ||
(pick.crc.csum_type != BCH_CSUM_none &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_user_mapped)) ||
(flags & BCH_READ_must_bounce)))) {
read_full = true;
bounce = true;
}
/*
 * can happen if we retry, and the extent we were going to read
 * has been merged in the meantime:
 */
} else {
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca)
enumerated_ref_put(&ca->io_ref[READ],
BCH_DEV_READ_REF_io_read);
rbio->ret = bch_err_throw(c, data_read_buffer_too_small); goto out_read_done;
}
/*
 * promote already allocated bounce rbio:
 * promote needs to allocate a bio big enough for uncompressing
 * data in the write path, but we're not going to use it all
 * here:
 */
if (rbio) {
EBUG_ON(rbio->bio.bi_iter.bi_size <
pick.crc.compressed_size << 9);
rbio->bio.bi_iter.bi_size =
pick.crc.compressed_size << 9;
} elseif (bounce) { unsigned sectors = pick.crc.compressed_size;
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
rbio->bounce = true;
/*
 * Have to clone if there were any splits, due to error
 * reporting issues (if a split errored, and retrying didn't
 * work, when it reports the error to its parent (us) we don't
 * know if the error was from our bio, and we should retry, or
 * from the whole bio, in which case we don't want to retry and
 * lose the error)
 */
} elseif (flags & BCH_READ_must_clone) {
rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
&c->bio_read_split),
orig);
rbio->bio.bi_iter = iter;
} else {
rbio = orig;
rbio->bio.bi_iter = iter;
EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
/*
 * NOTE(review): orphan fragment - read submission accounting,
 * unlock-before-IO, hole handling and indirect-extent resolution; the
 * enclosing function bodies and declarations of 'u', 'ca', 'pick',
 * 'flags', 'orig', 'rbio', 'sk', 'prev_read', 'data_btree',
 * 'offset_into_extent' and 'sectors' are missing from this chunk.
 * Code left byte-identical.
 */
if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio);
if (!u)
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); else
this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
/*
 * If it's being moved internally, we don't want to flag it as a cache
 * hit:
 */
if (ca && pick.ptr.cached && !u)
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
bio_inc_remaining(&orig->bio);
trace_and_count(c, io_read_split, &orig->bio);
}
/*
 * Unlock the iterator while the btree node's lock is still in
 * cache, before doing the IO:
 */
if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans); else
bch2_trans_unlock_long(trans);
if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) {
bch2_rbio_error(rbio,
-BCH_ERR_data_read_retry_device_offline,
BLK_STS_IOERR); goto out;
}
hole:
this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
bvec_iter_sectors(iter));
/*
 * won't normally happen in the data update (bch2_move_extent()) path,
 * but if we retry and the extent we wanted to read no longer exists we
 * have to signal that:
 */
if (u)
orig->ret = bch_err_throw(c, data_read_key_overwritten);
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk); if (ret) goto err;
k = bkey_i_to_s_c(sk.k);
if (unlikely(flags & BCH_READ_in_retry)) { if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
failed->nr = 0;
bch2_bkey_buf_copy(prev_read, c, sk.k);
}
/*
 * With indirect extents, the amount of data to read is the min
 * of the original extent and the indirect extent:
 */
sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
/*
 * NOTE(review): non-code residue accidentally embedded in this file (a
 * German website disclaimer).  English translation: "The information on
 * this web page was compiled carefully and to the best of our knowledge.
 * However, no guarantee is given of the completeness, correctness, or
 * quality of the information provided.  Note: the colored syntax display
 * and the measurement are still experimental."
 */