/*
 * One bucket of the stripe hash table.
 *
 * Used by the raid56 code to lock stripes for read/modify/write.
 */
struct btrfs_stripe_hash {
	/* rbios hashed into this bucket are linked here via rbio->hash_list. */
	struct list_head hash_list;
	/* Taken around walks and updates of @hash_list. */
	spinlock_t lock;
};
/*
 * The stripe hash table itself, together with the stripe cache bookkeeping.
 *
 * Used by the raid56 code to lock stripes for read/modify/write.
 */
struct btrfs_stripe_hash_table {
	/* List of cached rbios; pruning picks its victim from the tail. */
	struct list_head stripe_cache;
	/* Held while the stripe cache is being pruned. */
	spinlock_t cache_lock;
	/* Compared against RBIO_CACHE_SIZE to decide when to prune. */
	int cache_size;
	/* The hash buckets, allocated together with the table. */
	struct btrfs_stripe_hash table[];
};
/*
 * Represents one sector inside a page. The length is always sectorsize,
 * so only the location has to be recorded.
 */
struct sector_ptr {
	/*
	 * Blocks from the bio list can still be highmem, so a physical
	 * address is used to represent both the page and the offset inside it.
	 */
	phys_addr_t paddr;
	/* Set when @paddr refers to a page. */
	bool has_paddr;
	/* Set when the sector holds valid data. */
	bool uptodate;
};
/* * the stripe hash table is used for locking, and to collect * bios in hopes of making a full stripe
*/ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{ struct btrfs_stripe_hash_table *table; struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; unsignedint num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
if (info->stripe_hash_table) return 0;
/* * The table is large, starting with order 4 and can go as high as * order 7 in case lock debugging is turned on. * * Try harder to allocate and fallback to vmalloc to lower the chance * of a failing mount.
*/
table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); if (!table) return -ENOMEM;
/* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * * once the caching is done, we set the cache ready * bit.
*/ staticvoid cache_rbio_pages(struct btrfs_raid_bio *rbio)
{ int i; int ret;
ret = alloc_rbio_pages(rbio); if (ret) return;
for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ if (!rbio->bio_sectors[i].has_paddr) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is * read from disk.
*/ if (i < rbio->nr_data * rbio->stripe_nsectors)
ASSERT(rbio->stripe_sectors[i].uptodate); continue;
}
/*
 * Return the stripe hash bucket index for @rbio.
 *
 * We hash on the first logical address of the full stripe.
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * We shift down quite a bit. We're using byte addressing, and most
	 * of the lower bits are zeros. This tends to upset hash_64, and it
	 * consistently returns just one or two different values.
	 *
	 * Shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
for (i = sectors_per_page * page_nr;
i < sectors_per_page * page_nr + sectors_per_page;
i++) { if (!rbio->stripe_sectors[i].uptodate) returnfalse;
} returntrue;
}
/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] gets modified.
*/ staticvoid index_stripe_sectors(struct btrfs_raid_bio *rbio)
{ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
u32 offset; int i;
for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { int page_index = offset >> PAGE_SHIFT;
ASSERT(page_index < rbio->nr_pages); if (!rbio->stripe_pages[page_index]) continue;
if (dest->stripe_pages[page_nr])
__free_page(dest->stripe_pages[page_nr]);
dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
src->stripe_pages[page_nr] = NULL;
/* Also update the sector->uptodate bits. */ for (i = sectors_per_page * page_nr;
i < sectors_per_page * page_nr + sectors_per_page; i++)
dest->stripe_sectors[i].uptodate = true;
}
/* * We have ensured PAGE_SIZE is aligned with sectorsize, thus * we won't have a page which is half data half parity. * * Thus if the first sector of the page belongs to data stripes, then * the full page belongs to data stripes.
*/ return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}
/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 *
 * Nothing is done unless @src has RBIO_CACHE_READY_BIT set.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}

	/* Page pointers changed on both sides, so re-index both. */
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}
/*
 * Merging means we take the bio_list from the victim and splice it into the
 * destination. The victim should be discarded afterwards.
 *
 * Must be called with dest->bio_list_lock held.
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;

	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
}
/*
 * Used to prune items that are in the cache. The caller must hold the hash
 * table lock.
 *
 * The bucket lock and the rbio's bio_list_lock are taken here and are both
 * released again before returning.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/* Check the bit again under the hash table lock. */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/*
	 * Hold the lock for the bucket because we may be removing it from
	 * the hash table.
	 */
	spin_lock(&h->lock);

	/*
	 * Hold the lock for the bio list because we need to make sure the
	 * bio list is empty.
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/*
		 * If the bio list isn't empty, this rbio is still involved
		 * in an IO. We take it out of the cache list, and drop the
		 * ref that was held for the list.
		 *
		 * If the bio_list was empty, we also remove the rbio from
		 * the hash_table, and drop the corresponding ref.
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	/* Release in the reverse order of acquisition. */
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	/* Drop the cache list's reference outside of the spinlocks. */
	if (freeit)
		free_raid_bio(rbio);
}
/*
 * Drop every cached entry and release the stripe hash table.
 *
 * Used by unmount. Does nothing when no table is present, and leaves
 * info->stripe_hash_table set to NULL afterwards.
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (info->stripe_hash_table != NULL) {
		/* Empty the rbio cache first, then free the table itself. */
		btrfs_clear_rbio_cache(info);
		kvfree(info->stripe_hash_table);
		info->stripe_hash_table = NULL;
	}
}
/* * insert an rbio into the stripe cache. It * must have already been prepared by calling * cache_rbio_pages * * If this rbio was already cached, it gets * moved to the front of the lru. * * If the size of the rbio cache is too big, we * prune an item.
*/ staticvoid cache_rbio(struct btrfs_raid_bio *rbio)
{ struct btrfs_stripe_hash_table *table;
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return;
if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found;
found = list_last_entry(&table->stripe_cache, struct btrfs_raid_bio,
stripe_cache);
if (found != rbio)
__remove_rbio_from_cache(found);
}
spin_unlock(&table->cache_lock);
}
/* * helper function to run the xor_blocks api. It is only * able to do MAX_XOR_BLOCKS at a time, so we need to * loop through.
*/ staticvoid run_xor(void **pages, int src_cnt, ssize_t len)
{ int src_off = 0; int xor_src_cnt = 0; void *dest = pages[src_cnt];
/*
 * Returns 1 if the bio list inside this rbio covers an entire stripe (no
 * rmw required), 0 otherwise.
 *
 * rbio->bio_list_bytes is sampled while holding bio_list_lock so the value
 * being compared is the one the lock protects.
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	size = rbio->bio_list_bytes;
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	/* More bytes than the data stripes can hold is a logic error. */
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}
/* * returns 1 if it is safe to merge two rbios together. * The merging is safe if the two rbios correspond to * the same stripe and if they are both going in the same * direction (read vs write), and if neither one is * locked for final IO * * The caller is responsible for locking such that * rmw_locked is safe to test
*/ staticint rbio_can_merge(struct btrfs_raid_bio *last, struct btrfs_raid_bio *cur)
{ if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) return 0;
/* * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can * steal from cached rbios though, other functions * handle that.
*/ if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0;
if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0;
/* we can't merge with different operations */ if (last->operation != cur->operation) return 0; /* * We've need read the full stripe from the drive. * check and repair the parity and write the new results. * * We're not allowed to add any new bios to the * bio list here, anyone else that wants to * change this stripe needs to do their own rmw.
*/ if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0;
if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0;
/* Return a sector from rbio->stripe_sectors, not from the bio list */ staticstruct sector_ptr *rbio_stripe_sector(conststruct btrfs_raid_bio *rbio, unsignedint stripe_nr, unsignedint sector_nr)
{ return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
sector_nr)];
}
/* Grab a sector inside Q stripe, return NULL if not RAID6 */ staticstruct sector_ptr *rbio_qstripe_sector(conststruct btrfs_raid_bio *rbio, unsignedint sector_nr)
{ if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
/* * The first stripe in the table for a logical address * has the lock. rbios are added in one of three ways: * * 1) Nobody has the stripe locked yet. The rbio is given * the lock and 0 is returned. The caller must start the IO * themselves. * * 2) Someone has the stripe locked, but we're able to merge * with the lock owner. The rbio is freed and the IO will * start automatically along with the existing rbio. 1 is returned. * * 3) Someone has the stripe locked, but we're not able to merge. * The rbio is added to the lock owner's plug list, or merged into * an rbio already on the plug list. When the lock owner unlocks, * the next rbio on the list is run and the IO is started automatically. * 1 is returned * * If we return 0, the caller still owns the rbio and must continue with * IO submission. If we return 1, the caller must assume the rbio has * already been freed.
*/ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{ struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0;
h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock(&h->lock);
list_for_each_entry(cur, &h->hash_list, hash_list) { if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue;
spin_lock(&cur->bio_list_lock);
/* Can we steal this cached rbio's pages? */ if (bio_list_empty(&cur->bio_list) &&
list_empty(&cur->plug_list) &&
test_bit(RBIO_CACHE_BIT, &cur->flags) &&
!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
list_del_init(&cur->hash_list);
refcount_dec(&cur->refs);
/* Can we merge into the lock owner? */ if (rbio_can_merge(cur, rbio)) {
merge_rbio(cur, rbio);
spin_unlock(&cur->bio_list_lock);
freeit = rbio;
ret = 1; goto out;
}
/* * We couldn't merge with the running rbio, see if we can merge * with the pending ones. We don't have to check for rmw_locked * because there is no way they are inside finish_rmw right now
*/
list_for_each_entry(pending, &cur->plug_list, plug_list) { if (rbio_can_merge(pending, rbio)) {
merge_rbio(pending, rbio);
spin_unlock(&cur->bio_list_lock);
freeit = rbio;
ret = 1; goto out;
}
}
/*
 * No merging, put us on the tail of the plug list, our rbio
 * will be started when the currently running rbio unlocks
 */
list_add_tail(&rbio->plug_list, &cur->plug_list);
spin_unlock(&cur->bio_list_lock);
ret = 1; goto out;
}
lockit:
refcount_inc(&rbio->refs);
list_add(&rbio->hash_list, &h->hash_list);
out:
spin_unlock(&h->lock); if (cache_drop)
remove_rbio_from_cache(cache_drop); if (freeit)
free_raid_bio(freeit); return ret;
}
/* * called as rmw or parity rebuild is completed. If the plug list has more * rbios waiting for this stripe, the next one on the list will be started
*/ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{ int bucket; struct btrfs_stripe_hash *h; int keep_cache = 0;
bucket = rbio_bucket(rbio);
h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
if (!list_empty(&rbio->hash_list)) { /* * if we're still cached and there is no other IO * to perform, just leave this rbio here for others * to steal from later
*/ if (list_empty(&rbio->plug_list) &&
test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
keep_cache = 1;
clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
BUG_ON(!bio_list_empty(&rbio->bio_list)); goto done;
}
/* * we use the plug list to hold all the rbios * waiting for the chance to lock this stripe. * hand the lock over to one of them.
*/ if (!list_empty(&rbio->plug_list)) { struct btrfs_raid_bio *next; struct list_head *head = rbio->plug_list.next;
next = list_entry(head, struct btrfs_raid_bio,
plug_list);
done_nolock: if (!keep_cache)
remove_rbio_from_cache(rbio);
}
/*
 * End every bio on the singly linked chain starting at @cur.
 *
 * Each bio is unlinked from the chain, gets @status stored in bi_status and
 * is then completed with bio_endio().
 */
static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
	struct bio *next;

	while (cur) {
		/* Save the successor before the link is cleared. */
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = status;
		bio_endio(cur);
		cur = next;
	}
}
/*
 * This frees the rbio and runs through all the bios in the bio_list and
 * calls end_io on them, passing @status to each.
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio for
	 * this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, status);
	if (extra)
		rbio_endio_bio_list(extra, status);
}
/* * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) * @sector_nr: Sector number inside the stripe, * valid range [0, stripe_nsectors) * @bio_list_only: Whether to use sectors inside the bio list only. * * The read/modify/write code wants to reuse the original bio page as much * as possible, and only use stripe_sectors as fallback.
*/ staticstruct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr, bool bio_list_only)
{ struct sector_ptr *sector; int index;
index = stripe_nr * rbio->stripe_nsectors + sector_nr;
ASSERT(index >= 0 && index < rbio->nr_sectors);
spin_lock(&rbio->bio_list_lock);
sector = &rbio->bio_sectors[index]; if (sector->has_paddr || bio_list_only) { /* Don't return sector without a valid page pointer */ if (!sector->has_paddr)
sector = NULL;
spin_unlock(&rbio->bio_list_lock); return sector;
}
spin_unlock(&rbio->bio_list_lock);
return &rbio->stripe_sectors[index];
}
/*
 * Allocation and initial setup for the btrfs_raid_bio. Note
 * this does not allocate any pages for rbio->pages.
*/ staticstruct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc)
{ constunsignedint real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; constunsignedint stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; constunsignedint num_pages = stripe_npages * real_stripes; constunsignedint stripe_nsectors =
BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; constunsignedint num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio;
/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG.
*/
ASSERT(stripe_nsectors <= BITS_PER_LONG);
/*
 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 255
 * (limited by u8).
 */
ASSERT(real_stripes >= 2);
ASSERT(real_stripes <= U8_MAX);
/*
 * Allocate pages for all the stripes in the bio, including parity.
 *
 * Return 0 on success, or the negative value returned by
 * btrfs_alloc_page_array() on failure.
 */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;

	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}
/*
 * Only allocate pages for p/q stripes.
 *
 * The first nr_data * stripe_npages entries of stripe_pages[] are skipped;
 * only the entries after them are populated.
 *
 * Return 0 on success, or the negative value returned by
 * btrfs_alloc_page_array() on failure.
 */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors. Both are reset to -1 first, so a value of -1 on
 * return means "no such failure". They may both be NULL when the caller
 * only wants the count.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}
/* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. * Return <0 for error.
*/ staticint rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, struct sector_ptr *sector, unsignedint stripe_nr, unsignedint sector_nr, enum req_op op)
{ const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe;
u64 disk_start;
/* * Note: here stripe_nr has taken device replace into consideration, * thus it can be larger than rbio->real_stripe. * So here we check against bioc->num_stripes, not rbio->real_stripes.
*/
ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
rbio, stripe_nr);
ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
rbio, sector_nr);
ASSERT(sector->has_paddr);
/* Check if we have reached tolerance early. */
found_errors = get_rbio_veritical_errors(rbio, sector_nr,
NULL, NULL); if (found_errors > rbio->bioc->max_errors) return -EIO; return 0;
}
/* see if we can add this page onto our existing bio */ if (last) {
u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
last_end += last->bi_iter.bi_size;
/* * we can't merge these if they are from different * devices or if they are not contiguous
*/ if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, phys_to_page(sector->paddr),
sectorsize, offset_in_page(sector->paddr)); if (ret == sectorsize) return 0;
}
}
/* put a new bio on the list */
bio = bio_alloc(stripe->dev->bdev,
max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
op, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
bio->bi_private = rbio;
/* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe * reconstruction. * * This must be called before you trust the answers from page_in_rbio
*/ staticvoid index_rbio_pages(struct btrfs_raid_bio *rbio)
{ struct bio *bio;
/*
 * Pop every bio off @bio_list and call bio_put() on it, leaving the list
 * empty.
 */
static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}
/*
 * Sanity check the stripe counts of @rbio.
 *
 * Does nothing unless CONFIG_BTRFS_ASSERT is enabled.
 */
static void assert_rbio(struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	/*
	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
	 * we won't go beyond 256 disks anyway.
	 */
	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
	ASSERT_RBIO(rbio->nr_data > 0, rbio);

	/*
	 * This is another check to make sure nr data stripes is smaller
	 * than total stripes.
	 */
	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
staticinlinevoid *kmap_local_sector(conststruct sector_ptr *sector)
{ /* The sector pointer must have a page mapped to it. */
ASSERT(sector->has_paddr);
/* Generate PQ for one vertical stripe. */ staticvoid generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{ void **pointers = rbio->finish_pointers; const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct sector_ptr *sector; int stripe; constbool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
/* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) {
sector = sector_in_rbio(rbio, stripe, sectornr, 0);
pointers[stripe] = kmap_local_sector(sector);
}
/* Then add the parity stripe */
sector = rbio_pstripe_sector(rbio, sectornr);
sector->uptodate = 1;
pointers[stripe++] = kmap_local_sector(sector);
if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q
*/
sector = rbio_qstripe_sector(rbio, sectornr);
sector->uptodate = 1;
pointers[stripe++] = kmap_local_sector(sector);
staticint rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list)
{ /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; int stripe; int ret;
ASSERT(bio_list_size(bio_list) == 0);
/* We should have at least one data sector. */
ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
/*
 * Reset errors, as we may have errors inherited from a degraded
 * write.
 */
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else.
*/ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) { struct sector_ptr *sector;
ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_WRITE); if (ret) goto error;
}
if (likely(!rbio->bioc->replace_nr_stripes)) return 0;
/* * Make a copy for the replace target device. * * Thus the source stripe number (in replace_stripe_src) should be valid.
*/
ASSERT(rbio->bioc->replace_stripe_src >= 0);
/* * For RAID56, there is only one device that can be replaced, * and replace_stripe_src[0] indicates the stripe number we * need to copy from.
*/ if (stripe != rbio->bioc->replace_stripe_src) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway.
*/
ASSERT(sectornr == 0);
total_sector_nr += rbio->stripe_nsectors - 1; continue;
}
/* This vertical stripe has no data, skip it. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue;
/* * Special handling for raid56_alloc_missing_rbio() used by * scrub/replace. Unlike call path in raid56_parity_recover(), they * pass an empty bio here. Thus we have to find out the missing device * and mark the stripe error instead.
*/ if (bio->bi_iter.bi_size == 0) { bool found_missing = false; int stripe_nr;
/* * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector.
*/ staticstruct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
phys_addr_t paddr)
{ int i;
for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i];
/* * this sets each page in the bio uptodate. It should only be used on private * rbio pages, nothing that comes in from the higher layers
*/ staticvoid set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{ const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all;
/* * Since we can have multiple bios touching the error_bitmap, we cannot * call bitmap_set() without protection. * * Instead use set_bit() for each bit, as set_bit() itself is atomic.
*/ for (i = total_sector_nr; i < total_sector_nr +
(bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
set_bit(i, rbio->error_bitmap);
}
/* Verify the data sectors at read time. */ staticvoid verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio)
{ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); struct bio_vec *bvec; struct bvec_iter_all iter_all;
/* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) return;
/* P/Q stripes, they have no data csum to verify against. */ if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return;
ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); if (ret < 0) return ret;
index_stripe_sectors(rbio); return 0;
}
/*
 * We use plugging callbacks to collect full stripes.
 *
 * Any time we get a partial stripe write while plugged we collect it into
 * a list. When the unplug comes down, we sort the list by logical block
 * number and merge everything we can into the same rbios.
 */
struct btrfs_plug_cb {
	/* Block layer plug callback this structure is wrapped around. */
	struct blk_plug_cb cb;
	/* Filesystem the plug belongs to; unset until first use. */
	struct btrfs_fs_info *info;
	/* rbios collected while plugged, linked via rbio->plug_list. */
	struct list_head rbio_list;
};
/* * rbios on the plug list are sorted for easier merging.
*/ staticint plug_cmp(void *priv, conststruct list_head *a, conststruct list_head *b)
{ conststruct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
plug_list); conststruct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
plug_list);
u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
if (a_sector < b_sector) return -1; if (a_sector > b_sector) return 1; return 0;
}
while (!list_empty(&plug->rbio_list)) {
cur = list_first_entry(&plug->rbio_list, struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) { /* We have a full stripe, queue it down. */
start_async_work(cur, rmw_rbio_work); continue;
} if (last) { if (rbio_can_merge(last, cur)) {
merge_rbio(last, cur);
free_raid_bio(cur); continue;
}
start_async_work(last, rmw_rbio_work);
}
last = cur;
} if (last)
start_async_work(last, rmw_rbio_work);
kfree(plug);
}
/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ staticvoid rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{ conststruct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; const u64 full_stripe_start = rbio->bioc->full_stripe_logical; const u32 orig_len = orig_bio->bi_iter.bi_size; const u32 sectorsize = fs_info->sectorsize;
u64 cur_logical;
/* Update the dbitmap. */ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
cur_logical += sectorsize) { int bit = ((u32)(cur_logical - full_stripe_start) >>
fs_info->sectorsize_bits) % rbio->stripe_nsectors;
set_bit(bit, &rbio->dbitmap);
}
}
/*
 * Our main entry point for writes from the rest of the FS.
 *
 * @bio:  the write bio handed in by the caller
 * @bioc: I/O context; the filesystem info is taken from bioc->fs_info
 *
 * When the rbio is not full and blk_check_plugged() returns a plug
 * callback, the rbio is appended to that plug's rbio_list and the function
 * returns. In every other case the rmw work is queued immediately.
 *
 * NOTE(review): nothing in the visible body assigns @rbio before it is used
 * and @bio is never referenced - the rbio allocation/setup step appears to
 * be missing here; confirm against the complete function.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{ struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb;
/*
 * Don't plug on full rbios, just get them out the door
 * as quickly as we can
 */
if (!rbio_is_full(rbio)) {
cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) {
plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) {
/* plug->info is still unset: initialise the plug state before first use. */
plug->info = fs_info;
INIT_LIST_HEAD(&plug->rbio_list);
}
/* Park the rbio on the plug's list instead of starting the work now. */
list_add_tail(&rbio->plug_list, &plug->rbio_list); return;
}
}
/*
 * Either we don't have any existing plug, or we're doing a full stripe,
 * queue the rmw work now.
 */
start_async_work(rbio, rmw_rbio_work);
}
staticint verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr)
{ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector;
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *csum_expected; void *kaddr; int ret;
if (!rbio->csum_bitmap || !rbio->csum_buf) return 0;
/* No way to verify P/Q as they are not covered by data csum. */ if (stripe_nr >= rbio->nr_data) return 0; /* * If we're rebuilding a read, we have to use pages from the * bio list if possible.
*/ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
/* * Recover a vertical stripe specified by @sector_nr. * @*pointers are the pre-allocated pointers by the caller, so we don't * need to allocate/free the pointers again and again.
*/ staticint recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, void **pointers, void **unmap_array)
{ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; int found_errors; int faila; int failb; int stripe_nr; int ret = 0;
/* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub.
*/ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
!test_bit(sector_nr, &rbio->dbitmap)) return 0;
found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
&failb); /* * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check.
*/ if (!found_errors) return 0;
if (found_errors > rbio->bioc->max_errors) return -EIO;
/* * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order.
*/ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* * If we're rebuilding a read, we have to use pages from the * bio list if possible.
*/ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
pointers[stripe_nr] = kmap_local_sector(sector);
unmap_array[stripe_nr] = pointers[stripe_nr];
}
/* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) /* * Just the P stripe has failed, without * a bad data or Q stripe. * We have nothing to do, just skip the * recovery for this stripe.
*/ goto cleanup; /* * a single failure in raid6 is rebuilt * in the pstripe code below
*/ goto pstripe;
}
/* * If the q stripe is failed, do a pstripe reconstruction from * the xors. * If both the q stripe and the P stripe are failed, we're * here due to a crc mismatch and we can't give them the * data they want.
*/ if (failb == rbio->real_stripes - 1) { if (faila == rbio->real_stripes - 2) /* * Only P and Q are corrupted. * We only care about data stripes recovery, * can skip this vertical stripe.
*/ goto cleanup; /* * Otherwise we have one bad data stripe and * a good P stripe. raid5!
*/ goto pstripe;
}
/* Rebuild from P stripe here (raid5 or raid6). */
ASSERT(failb == -1);
pstripe: /* Copy parity block into failed block to start with */
memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
/* Rearrange the pointer array */
p = pointers[faila]; for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
stripe_nr++)
pointers[stripe_nr] = pointers[stripe_nr + 1];
pointers[rbio->nr_data - 1] = p;
/* Xor in the rest */
run_xor(pointers, rbio->nr_data - 1, sectorsize);
}
/* * No matter if this is a RMW or recovery, we should have all * failed sectors repaired in the vertical stripe, thus they are now * uptodate. * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate. * * If possible, also check if the repaired sector matches its data * checksum.
*/ if (faila >= 0) {
ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) goto cleanup;
sector = rbio_stripe_sector(rbio, faila, sector_nr);
sector->uptodate = 1;
} if (failb >= 0) {
ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup;
staticint recover_sectors(struct btrfs_raid_bio *rbio)
{ void **pointers = NULL; void **unmap_array = NULL; int sectornr; int ret = 0;
/* * @pointers array stores the pointer for each sector. * * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works.
*/
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers || !unmap_array) {
ret = -ENOMEM; goto out;
}
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
spin_lock(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock(&rbio->bio_list_lock);
}
index_rbio_pages(rbio);
for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
ret = recover_vertical(rbio, sectornr, pointers, unmap_array); if (ret < 0) break;
}
staticvoid recover_rbio(struct btrfs_raid_bio *rbio)
{ struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0;
/* * Either we're doing recover for a read failure or degraded write, * caller should have set error bitmap correctly.
*/
ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
/* For recovery, we need to read all sectors including P/Q. */
ret = alloc_rbio_pages(rbio); if (ret < 0) goto out;
index_rbio_pages(rbio);
/* * Read everything that hasn't failed. However this time we will * not trust any cached sector. * As we may read out some stale data but higher layer is not reading * that stale part. * * So here we always re-read everything in recovery path.
*/ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector;
/* * Skip the range which has error. It can be a range which is * marked error (for csum mismatch), or it can be a missing * device.
*/ if (!rbio->bioc->stripes[stripe].dev->bdev ||
test_bit(total_sector_nr, rbio->error_bitmap)) { /* * Also set the error bit for missing device, which * may not yet have its error bit set.
*/
set_bit(total_sector_nr, rbio->error_bitmap); continue;
}
/*
 * Mark one additional stripe as failed for a RAID6 extra recovery try.
 *
 * @rbio:       raid bio whose error_bitmap is updated in place
 * @mirror_num: retry mirror number, asserted to be larger than 2
 *
 * For every vertical stripe (the same sector index across all stripes) for
 * which get_rbio_veritical_errors() reports an error, a second stripe is
 * selected from @mirror_num and its bit is set in rbio->error_bitmap.
 * Vertical stripes without any error are left untouched.
 */
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
	bool found = false;
	int sector_nr;

	/*
	 * This is for RAID6 extra recovery tries, thus mirror number should
	 * be larger than 2.
	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
	 * RAID5 methods.
	 */
	ASSERT(mirror_num > 2);

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;
		int faila;
		int failb;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		/* This vertical stripe doesn't have errors. */
		if (!found_errors)
			continue;

		/*
		 * If we found errors, there should be only one error marked
		 * by previous set_rbio_range_error().
		 */
		ASSERT(found_errors == 1);
		found = true;

		/*
		 * Now select another stripe to mark as error.
		 *
		 * The candidate index shrinks as @mirror_num grows. If it
		 * lands on or before the stripe that already failed, step one
		 * further back so the same stripe is never picked twice.
		 */
		failb = rbio->real_stripes - (mirror_num - 1);
		if (failb <= faila)
			failb--;

		/* Set the extra bit in error bitmap, if a valid stripe is left. */
		if (failb >= 0)
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
	}

	/* We should have found at least one vertical stripe with error. */
	ASSERT(found);
}
/* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, * so we assume the bio they send down corresponds to a failed part * of the drive.
*/ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num)
{ struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio;
/* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry.
*/ if (mirror_num > 2)
set_rbio_raid6_extra_error(rbio, mirror_num);
/* The rbio should not have its csum buffer initialized. */
ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
/* * Skip the csum search if: * * - The rbio doesn't belong to data block groups * Then we are doing IO for tree blocks, no need to search csums. * * - The rbio belongs to mixed block groups * This is to avoid deadlock, as we're already holding the full * stripe lock, if we trigger a metadata read, and it needs to do * raid56 recovery, we will deadlock.
*/ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) return;
ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) goto no_csum; return;
error: /* * We failed to allocate memory or grab the csum, but it's not fatal, * we can still continue. But better to warn users that RMW is no * longer safe for this particular sub-stripe write.
*/
btrfs_warn_rl(fs_info, "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
rbio->bioc->full_stripe_logical, ret);
no_csum:
kfree(rbio->csum_buf);
bitmap_free(rbio->csum_bitmap);
rbio->csum_buf = NULL;
rbio->csum_bitmap = NULL;
}
staticint rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{ struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0;
/* * Fill the data csums we need for data verification. We need to fill * the csum_bitmap/csum_buf first, as our endio function will try to * verify the data sectors.
*/
fill_data_csums(rbio);
/* * Build a list of bios to read all sectors (including data and P/Q). * * This behavior is to compensate the later csum verification and recovery.
*/ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) { struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors;
/* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all.
*/
submit_read_wait_bio_list(rbio, &bio_list); return recover_sectors(rbio);
}
/*
 * Completion callback for a write bio that belongs to an rbio.
 *
 * @bio: the completed bio; bio->bi_private holds the owning rbio
 *
 * A bio that finished with a non-zero bi_status is passed to
 * rbio_update_error_bitmap(). The bio is then released with bio_put(), and
 * once rbio->stripes_pending drops to zero, waiters on rbio->io_wait are
 * woken up.
 */
static void raid_wait_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		rbio_update_error_bitmap(rbio, bio);

	/* @rbio was fetched above, so @bio is no longer needed past this point. */
	bio_put(bio);

	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}
/*
 * To determine if we need to read any sector from the disk.
 * Should only be utilized in RMW path, to skip cached rbio.
 *
 * Returns true if any of the first nr_data * stripe_nsectors entries of
 * rbio->stripe_sectors has no page address or is not uptodate, false when
 * all of them are present and uptodate.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	int i;

	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		/*
		 * We have a sector which doesn't have page nor uptodate,
		 * thus this rbio can not be cached one, as cached one must
		 * have all its data sectors present and uptodate.
		 */
		if (!sector->has_paddr || !sector->uptodate)
			return true;
	}
	return false;
}
staticvoid rmw_rbio(struct btrfs_raid_bio *rbio)
{ struct bio_list bio_list; int sectornr; int ret = 0;
/* * Allocate the pages for parity first, as P/Q pages will always be * needed for both full-stripe and sub-stripe writes.
*/
ret = alloc_rbio_parity_pages(rbio); if (ret < 0) goto out;
/* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately.
*/ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { /* * Now we're doing sub-stripe write, also need all data stripes * to do the full RMW.
*/
ret = alloc_rbio_data_pages(rbio); if (ret < 0) goto out;
index_rbio_pages(rbio);
ret = rmw_read_wait_recover(rbio); if (ret < 0) goto out;
}
/* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe * needs to do their own rmw.
*/
spin_lock(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock(&rbio->bio_list_lock);
/* * We don't cache full rbios because we're assuming * the higher layers are unlikely to use this area of * the disk again soon. If they do use it again,
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.24 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.