/* Is this a data path I/O that needs storage layer checksum and repair? */ staticinlinebool is_data_bbio(conststruct btrfs_bio *bbio)
{ return bbio->inode && is_data_inode(bbio->inode);
}
/*
 * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	/* Zero everything up to, but not including, the embedded struct bio. */
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	/* Start with a single pending I/O reference for the submission. */
	atomic_set(&bbio->pending_ios, 1);
	WRITE_ONCE(bbio->status, BLK_STS_OK);
}
/* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool.
*/ struct btrfs_bio *btrfs_bio_alloc(unsignedint nr_vecs, blk_opf_t opf, struct btrfs_fs_info *fs_info,
btrfs_bio_end_io_t end_io, void *private)
{ struct btrfs_bio *bbio; struct bio *bio;
/* Free bio that was never submitted to the underlying device. */ if (bbio_has_ordered_extent(bbio))
btrfs_put_ordered_extent(bbio->ordered);
bio_put(&bbio->bio);
bbio = orig_bbio;
}
/* * At this point, bbio always points to the original btrfs_bio. Save * the first error in it.
*/ if (status != BLK_STS_OK)
cmpxchg(&bbio->status, BLK_STS_OK, status);
if (atomic_dec_and_test(&bbio->pending_ios)) { /* Load split bio's error which might be set above. */ if (status == BLK_STS_OK)
bbio->bio.bi_status = READ_ONCE(bbio->status);
if (bbio_has_ordered_extent(bbio)) { struct btrfs_ordered_extent *ordered = bbio->ordered;
/* * Try to kick off a repair read to the next available mirror for a bad sector. * * This primarily tries to recover good data to serve the actual read request, * but also tries to write the good data back to the bad mirror(s) when a * read succeeded to restore the redundancy.
*/ staticstruct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
u32 bio_offset, struct bio_vec *bv, struct btrfs_failed_bio *fbio)
{ struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror;
/* Read-repair requires the inode field to be set by the submitter. */
ASSERT(inode);
/* * Hand off repair bios to the repair code as there is no upper level * submitter for them.
*/ if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
btrfs_end_repair_bio(bbio, dev); return;
}
/* Clear the I/O error. A failed repair will reset it. */
bbio->bio.bi_status = BLK_STS_OK;
while (iter->bi_size) { struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
/* Metadata reads are checked and repaired by the submitter. */ if (is_data_bbio(bbio))
btrfs_check_read_bio(bbio, bbio->bio.bi_private); else
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
if (bio->bi_status) {
atomic_inc(&bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
}
/* * Only send an error to the higher layers if it is beyond the tolerance * threshold.
*/ if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR; else
bio->bi_status = BLK_STS_OK;
if (bio_is_zone_append(bio) && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
/* * For zone append writing, bi_sector must point the beginning of the * zone
*/ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 zone_start = round_down(physical, dev->fs_info->zone_size);
/* * Track reads if tracking is enabled; ignore I/O operations before the * filesystem is fully initialized.
*/ if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
percpu_counter_add(&dev->fs_info->stats_read_blocks,
bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio); else
submit_bio(bio);
}
staticvoid btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{ struct bio *orig_bio = bioc->orig_bio, *bio;
ASSERT(bio_op(orig_bio) != REQ_OP_READ);
/* Reuse the bio embedded into the btrfs_bio for the last mirror */ if (dev_nr == bioc->num_stripes - 1) {
bio = orig_bio;
bio->bi_end_io = btrfs_orig_write_end_io;
} else {
bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
bio_inc_remaining(orig_bio);
bio->bi_end_io = btrfs_clone_write_end_io;
}
/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	/* Work item queued to the checksum worker; embeds us for container_of(). */
	struct btrfs_work work;
};
/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	int ret;

	ret = btrfs_bio_csum(async->bbio);
	/*
	 * Record the failure in the bio status; the done handler checks it
	 * before submitting further down the stack.
	 */
	if (ret)
		async->bbio->bio.bi_status = errno_to_blk_status(ret);
}
/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		/* 'async' already points at the containing object. */
		kfree(async);
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_bio_end_io(async->bbio, bio->bi_status);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
/* Submit synchronously if the checksum implementation is fast. */ if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) returnfalse;
/* * Try to defer the submission to a workqueue to parallelize the * checksum calculation unless the I/O is issued synchronously.
*/ if (op_is_sync(bbio->bio.bi_opf)) returnfalse;
/* Zoned devices require I/O to be submitted in order. */ if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) returnfalse;
returntrue;
}
/* * Submit bio to an async queue. * * Return true if the work has been successfully submitted, else false.
*/ staticbool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) returnfalse;
map_length = min(map_length, bbio->fs_info->max_zone_append_size);
sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
&nr_segs, map_length); if (sector_offset) { /* * bio_split_rw_at() could split at a size smaller than our * sectorsize and thus cause unaligned I/Os. Fix that by * always rounding down to the nearest boundary.
*/ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
} return map_length;
}
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
&bioc, &smap, &mirror_num); if (ret) {
status = errno_to_blk_status(ret);
btrfs_bio_counter_dec(fs_info); goto end_bbio;
}
map_length = min(map_length, length); if (use_append)
map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) { struct btrfs_bio *split;
split = btrfs_split_bio(fs_info, bbio, map_length); if (IS_ERR(split)) {
status = errno_to_blk_status(PTR_ERR(split));
btrfs_bio_counter_dec(fs_info); goto end_bbio;
}
bbio = split;
bio = &bbio->bio;
}
/* * Save the iter for the end_io handler and preload the checksums for * data reads.
*/ if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
status = errno_to_blk_status(ret); if (status) goto fail;
}
if (btrfs_op(bio) == BTRFS_MAP_WRITE) { if (use_append) {
bio->bi_opf &= ~REQ_OP_WRITE;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list * iteration only happens in the completion path, which * can't happen until after the last submission.
*/
btrfs_get_bioc(bioc);
list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
}
/* * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case.
*/ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) &&
btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) goto done;
ret = btrfs_bio_csum(bbio);
status = errno_to_blk_status(ret); if (status) goto fail;
} elseif (use_append ||
(btrfs_is_zoned(fs_info) && inode &&
inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
status = errno_to_blk_status(ret); if (status) goto fail;
}
}
fail:
btrfs_bio_counter_dec(fs_info); /* * We have split the original bbio, now we have to end both the current * @bbio and remaining one, as the remaining one will never be submitted.
*/ if (map_length < length) { struct btrfs_bio *remaining = bbio->private;
btrfs_bio_end_io(remaining, status);
}
end_bbio:
btrfs_bio_end_io(bbio, status); /* Do not submit another chunk */ returntrue;
}
/*
 * Submit a btrfs_bio, splitting and resubmitting chunks until the whole bio
 * has been handed to the lower layers.
 */
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	/* btrfs_submit_chunk() returns true once no further chunk remains. */
	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}
/* * Submit a repair write. * * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * * The I/O is issued synchronously to block the repair read completion from * freeing the bio.
*/ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, phys_addr_t paddr, int mirror_num)
{ struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; struct bio bio; int ret = 0;
if (btrfs_repair_one_zone(fs_info, logical)) return 0;
/* * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); if (ret < 0) goto out_counter_dec;
if (!smap.dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
ret = -EIO; goto out_counter_dec;
}
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr));
ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */
btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); goto out_bio_uninit;
}
/*
 * NOTE(review): the text below is stray web-page boilerplate, not C source.
 * It is a German disclaimer reading, in English: "The information on this
 * web page was compiled carefully to the best of our knowledge.  However,
 * neither completeness, nor correctness, nor quality of the provided
 * information is guaranteed.  Note: the coloured syntax rendering and the
 * measurement are still experimental."  It should be removed from the file.
 */