// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */
/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout )
 *    use_far_sets (stored in bit 17 of layout )
 *    use_far_sets_bugfixed (stored in bit 18 of layout )
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.   In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) copies of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and
 * their product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */
/* Forward declarations of file-local (static) helpers. */
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
#include"raid1-10.c"
/* Placeholder command that expands to nothing. */
#define NULL_CMD

/*
 * cmd_before(): release the write side of conf->resync_lock (re-enabling
 * interrupts) and then run @cmd with the lock dropped.
 */
#define cmd_before(conf, cmd) \
	do { \
		write_sequnlock_irq(&(conf)->resync_lock); \
		cmd; \
	} while (0)

/* cmd_after(): re-take the write side of conf->resync_lock. */
#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)
/* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'.
*/ staticinlinestruct r10bio *get_resync_r10bio(struct bio *bio)
{ return get_resync_pages(bio)->raid_bio;
}
/* allocate a r10bio with room for raid_disks entries in the
* bios array */ return kzalloc(size, gfp_flags);
}
/* RESYNC_BLOCK_SIZE expressed in 512-byte units (bytes >> 9) */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)

/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)

/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

/* cluster resync window: 32 times RESYNC_WINDOW, in bytes and in 512-byte units */
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/* * When performing a resync, we need to read and compare, so * we need as many pages are there are copies. * When performing a recovery, we need 2 bios, one for read, * one for write (we recover only one drive per r10buf) *
*/ staticvoid * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{ struct r10conf *conf = data; struct r10bio *r10_bio; struct bio *bio; int j; int nalloc, nalloc_rp; struct resync_pages *rps;
r10_bio = r10bio_pool_alloc(gfp_flags, conf); if (!r10_bio) return NULL;
/* wake up frozen array... */
wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
}
/* * raid_end_bio_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer.
*/ staticvoid raid_end_bio_io(struct r10bio *r10_bio)
{ struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle.
*/
allow_barrier(conf);
free_r10bio(r10_bio);
}
/* * Update disk head position estimator based on IRQ completion info.
*/ staticinlinevoid update_head_pos(int slot, struct r10bio *r10_bio)
{ struct r10conf *conf = r10_bio->mddev->private;
/*
 * find_bio_disk() - map a completed bio back to the device it was sent to.
 * @conf:    array configuration (supplies the number of slots to scan)
 * @r10_bio: request that owns @bio
 * @bio:     the per-device bio to look up
 * @slotp:   if non-NULL, receives the matching slot index
 * @replp:   if non-NULL, receives 1 when @bio matched the slot's repl_bio,
 *           0 otherwise
 *
 * Also updates the head position estimate for the slot.
 *
 * Return: devs[slot].devnum for the matching slot.
 *
 * NOTE(review): if @bio matches no slot the scan ends with the index equal
 * to raid_disks and that index is still used below; callers presumably
 * guarantee a match - confirm.
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int is_repl = 0;
	int idx = 0;

	while (idx < conf->geo.raid_disks) {
		if (r10_bio->devs[idx].bio == bio)
			break;
		if (r10_bio->devs[idx].repl_bio == bio) {
			is_repl = 1;
			break;
		}
		idx++;
	}

	update_head_pos(idx, r10_bio);

	if (slotp)
		*slotp = idx;
	if (replp)
		*replp = is_repl;

	return r10_bio->devs[idx].devnum;
}
staticvoid raid10_end_read_request(struct bio *bio)
{ int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot;
rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler:
*/
update_head_pos(slot, r10_bio);
if (uptodate) { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code to the higher * levels even if IO on some other mirrored buffer fails. * * The 'master' represents the composite IO operation to * user-side. So if something waits for IO, then it will * wait for the 'master' bio.
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
} elseif (!raid1_should_handle_error(bio)) {
uptodate = 1;
} else { /* If all other devices that store this block have * failed, we want to return the error upwards rather * than fail the last device. Here we redefine * "uptodate" to mean "Don't want to retry"
*/ if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
rdev->raid_disk))
uptodate = 1;
} if (uptodate) {
raid_end_bio_io(r10_bio);
rdev_dec_pending(rdev, conf->mddev);
} else { /* * oops, read error - keep the refcount on the rdev
*/
pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n",
mdname(conf->mddev),
rdev->bdev,
(unsignedlonglong)r10_bio->sector);
set_bit(R10BIO_ReadError, &r10_bio->state);
reschedule_retry(r10_bio);
}
}
/*
 * one_write_done() - account for one finished per-device write.
 * @r10_bio: the request the write belonged to
 *
 * Drops one count from r10_bio->remaining.  Only the caller that brings the
 * count to zero goes further: the request is rescheduled if a write error
 * was recorded; otherwise the write is closed and the request is either
 * rescheduled (R10BIO_MadeGood set) or completed.
 */
static void one_write_done(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;

	if (test_bit(R10BIO_WriteError, &r10_bio->state)) {
		reschedule_retry(r10_bio);
		return;
	}

	close_write(r10_bio);
	if (test_bit(R10BIO_MadeGood, &r10_bio->state))
		reschedule_retry(r10_bio);
	else
		raid_end_bio_io(r10_bio);
}
staticvoid raid10_end_write_request(struct bio *bio)
{ struct r10bio *r10_bio = bio->bi_private; int dev; int dec_rdev = 1; struct r10conf *conf = r10_bio->mddev->private; int slot, repl; struct md_rdev *rdev = NULL; struct bio *to_put = NULL; bool ignore_error = !raid1_should_handle_error(bio) ||
(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[dev].replacement; if (!rdev) {
smp_rmb();
repl = 0;
rdev = conf->mirrors[dev].rdev;
} /* * this branch is our 'one mirror IO has finished' event handler:
*/ if (bio->bi_status && !ignore_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it.
*/
md_error(rdev->mddev, rdev); else {
set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
/* * When the device is faulty, it is not necessary to * handle write error.
*/ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state); else { /* Fail the request */
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
}
}
} else { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher * levels even if IO on some other mirrored buffer fails. * * The 'master' represents the composite IO operation to * user-side. So if something waits for IO, then it will * wait for the 'master' bio. * * Do not set R10BIO_Uptodate if the current device is * rebuilding or Faulty. This is because we cannot use * such device for properly reading the data back (we could * potentially use it, if the current write would have felt * before rdev->recovery_offset, but for simplicity we don't * check this here.
*/ if (test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors) &&
!ignore_error) {
bio_put(bio); if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; else
r10_bio->devs[slot].bio = IO_MADE_GOOD;
dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
}
/* * * Let's see if all mirrored write operations have finished * already.
*/
one_write_done(r10_bio); if (dec_rdev)
rdev_dec_pending(rdev, conf->mddev); if (to_put)
bio_put(to_put);
}
/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the one device.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address
 */
/*
 * __raid10_find_phys() - compute every physical location of a virtual sector.
 * @geo:    geometry to map with
 * @r10bio: request; ->sector is the virtual sector on input, and on output
 *          devs[slot].devnum / devs[slot].addr are filled in for each of the
 *          near_copies * far_copies copies, in slot order
 *
 * For each near copy the far copies are generated by stepping the device by
 * near_copies inside the device's far set and the address by geo->stride.
 * When raid_disks is not a multiple of far_set_size the last far set is
 * larger (it absorbs the remainder devices) and is wrapped using
 * last_far_set_start / last_far_set_size.
 */
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n,f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	/* first device of the last far set ... */
	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	/* ... and its size, including any remainder devices */
	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;

		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				/* wrap inside the enlarged last far set */
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				/* wrap inside the device's regular far set */
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			/* ran off the last device: continue in the next chunk row */
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}
/*
 * raid10_find_virt() - map a device sector back to the array's virtual sector.
 * @conf:   array configuration
 * @sector: sector offset on the member device
 * @dev:    index of the member device
 *
 * Reverse of the physical mapping: undoes the far-copy device rotation
 * (within the device's far set) and then converts chunk/device back to a
 * virtual chunk.  When raid_disks is not a multiple of far_set_size, devices
 * in the last far set use that set's enlarged size and start, matching the
 * forward mapping in __raid10_find_phys().
 *
 * Return: the virtual sector address.
 */
static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			/* dev lives in the enlarged last far set */
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;

		chunk = sector >> geo->chunk_shift;
		/* fc: which far copy this chunk row holds */
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		/* step back one far section at a time, undoing the rotation */
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
/* * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry.
*/ staticstruct md_rdev *read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
{ const sector_t this_sector = r10_bio->sector; int disk, slot; int sectors = r10_bio->sectors; int best_good_sectors;
sector_t new_distance, best_dist; struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; int do_balance; int best_dist_slot, best_pending_slot; bool has_nonrot_disk = false; unsignedint min_pending; struct geom *geo = &conf->geo;
if (best_dist_slot >= 0) /* At least 2 disks to choose from so failfast is OK */
set_bit(R10BIO_FailFast, &r10_bio->state); /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later.
*/ if (geo->near_copies > 1 && !pending)
new_distance = 0;
/* for far > 1 always use the lowest address */ elseif (geo->far_copies > 1)
new_distance = r10_bio->devs[slot].addr; else
new_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
staticvoid flush_pending_writes(struct r10conf *conf)
{ /* Any writes that have been queued but are awaiting * bitmap updates get flushed here.
*/
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) { struct blk_plug plug; struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
spin_unlock_irq(&conf->device_lock);
/* * As this is called in a wait_event() loop (see freeze_array), * current->state might be TASK_UNINTERRUPTIBLE which will * cause a warning when we prepare to wait again. As it is * rare that this path is taken, it is perfectly safe to force * us to go around the wait_event() loop again, so the warning * is a false-positive. Silence the warning by resetting * thread state
*/
__set_current_state(TASK_RUNNING);
/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */
staticvoid raise_barrier(struct r10conf *conf, int force)
{
write_seqlock_irq(&conf->resync_lock);
if (WARN_ON_ONCE(force && !conf->barrier))
force = false;
/* Wait until no block IO is waiting (unless 'force') */
wait_event_barrier(conf, force || !conf->nr_waiting);
/* block any new IO from starting */
WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */
wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
conf->barrier < RESYNC_DEPTH);
/* barrier is dropped */ if (!conf->barrier) returntrue;
/* * If there are already pending requests (preventing the barrier from * rising completely), and the pre-process bio queue isn't empty, then * don't wait, as we need to empty that queue to get the nr_pending * count down.
*/ if (atomic_read(&conf->nr_pending) && bio_list &&
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) returntrue;
/* daemon thread must exist while handling io */
thread = rcu_dereference_protected(conf->mddev->thread, true); /* * move on if io is issued from raid10d(), nr_pending is not released * from original io(see handle_read_error()). All raise barrier is * blocked until this io is done.
*/ if (thread->tsk == current) {
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); returntrue;
}
write_seqlock_irq(&conf->resync_lock); if (conf->barrier) { /* Return false when nowait flag is set */ if (nowait) {
ret = false;
} else {
conf->nr_waiting++;
mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
wait_event_barrier(conf, stop_waiting_barrier(conf));
conf->nr_waiting--;
} if (!conf->nr_waiting)
wake_up(&conf->wait_barrier);
} /* Only increment nr_pending when we wait */ if (ret)
atomic_inc(&conf->nr_pending);
write_sequnlock_irq(&conf->resync_lock); return ret;
}
/*
 * freeze_array() - stop sync IO and normal IO and wait for quiescence.
 * @conf:  array configuration
 * @extra: number of pending requests owned by the caller that will not
 *         complete or be queued while we wait
 *
 * Raises the barrier and nr_waiting, then waits until nr_pending equals
 * nr_queued + @extra.  This runs in the context of a normal IO request that
 * has failed, so any pending sync request is held off by nr_pending, and
 * every other pending request must either complete or be queued for retry
 * before the counts match.  flush_pending_writes() is passed as the command
 * of the wait macro.
 * NOTE(review): wait_event_barrier_cmd() is defined outside this block;
 * presumably it runs the command with resync_lock dropped - confirm.
 */
static void freeze_array(struct r10conf *conf, int extra)
{
	write_seqlock_irq(&conf->resync_lock);

	/* array_freeze_pending stays raised for the duration of the wait */
	conf->array_freeze_pending++;
	WRITE_ONCE(conf->barrier, conf->barrier + 1);
	conf->nr_waiting++;

	wait_event_barrier_cmd(conf,
			       atomic_read(&conf->nr_pending) ==
					conf->nr_queued + extra,
			       flush_pending_writes(conf));

	conf->array_freeze_pending--;

	write_sequnlock_irq(&conf->resync_lock);
}
/*
 * unfreeze_array() - undo freeze_array().
 * @conf: array configuration
 *
 * Lowers the barrier and nr_waiting under resync_lock and wakes everyone
 * sleeping on conf->wait_barrier.
 */
static void unfreeze_array(struct r10conf *conf)
{
	write_seqlock_irq(&conf->resync_lock);

	/* reverse the effect of the freeze */
	WRITE_ONCE(conf->barrier, conf->barrier - 1);
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);

	write_sequnlock_irq(&conf->resync_lock);
}
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
raid1_prepare_flush_writes(mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next;
raid1_submit_write(bio);
bio = next;
cond_resched();
}
kfree(plug);
}
/* * 1. Register the new request and wait if the reconstruction thread has put * up a bar for new requests. Continue immediately if no resync is active * currently. * 2. If IO spans the reshape position. Need to wait for reshape to pass.
*/ staticbool regular_request_wait(struct mddev *mddev, struct r10conf *conf, struct bio *bio, sector_t sectors)
{ /* Bail out if REQ_NOWAIT is set for the bio */ if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio); returnfalse;
} while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
allow_barrier(conf); if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); returnfalse;
}
mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector +
sectors);
wait_barrier(conf, false);
} returntrue;
}
staticvoid raid10_read_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio, bool io_accounting)
{ struct r10conf *conf = mddev->private; struct bio *read_bio; int max_sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO; int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, * we must use the one in conf. * If it has already been disconnected (unlikely) * we lose the device name in error messages.
*/ int disk; /* * As we are blocking raid10, it is a little safer to * use __GFP_HIGH.
*/
gfp = GFP_NOIO | __GFP_HIGH;
disk = r10_bio->devs[slot].devnum;
err_rdev = conf->mirrors[disk].rdev; if (err_rdev)
snprintf(b, sizeof(b), "%pg", err_rdev->bdev); else {
strcpy(b, "???"); /* This never gets dereferenced */
err_rdev = r10_bio->devs[slot].rdev;
}
}
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
raid_end_bio_io(r10_bio); return;
}
rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { if (err_rdev) {
pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
mdname(mddev), b,
(unsignedlonglong)r10_bio->sector);
}
raid_end_bio_io(r10_bio); return;
} if (err_rdev)
pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n",
mdname(mddev),
rdev->bdev,
(unsignedlonglong)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split); if (IS_ERR(split)) {
error = PTR_ERR(split); goto err_handle;
}
retry_wait:
blocked_rdev = NULL; for (i = 0; i < conf->copies; i++) { struct md_rdev *rdev, *rrdev;
rdev = conf->mirrors[i].rdev; if (rdev) {
sector_t dev_sector = r10_bio->devs[i].addr;
/* * Discard request doesn't care the write result * so it doesn't need to wait blocked disk here.
*/ if (test_bit(WriteErrorSeen, &rdev->flags) &&
r10_bio->sectors &&
rdev_has_badblock(rdev, dev_sector,
r10_bio->sectors) < 0) /* * Mustn't write here until the bad * block is acknowledged
*/
set_bit(BlockedBadBlocks, &rdev->flags);
if (rdev_blocked(rdev)) {
blocked_rdev = rdev;
atomic_inc(&rdev->nr_pending); break;
}
}
if (unlikely(blocked_rdev)) { /* Have to wait for this device to get unblocked, then retry */
allow_barrier(conf);
mddev_add_trace_msg(conf->mddev, "raid10 %s wait rdev %d blocked",
__func__, blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, false); goto retry_wait;
}
}
staticvoid raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio)
{ struct r10conf *conf = mddev->private; int i, k;
sector_t sectors; int max_sectors; int error;
if ((mddev_is_clustered(mddev) &&
mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))) {
DEFINE_WAIT(w); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); return;
} for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE); if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))) break;
schedule();
}
finish_wait(&conf->wait_barrier, &w);
}
/* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio * If there are known/acknowledged bad blocks on any device * on which we have seen a write error, we want to avoid * writing to those blocks. This potentially requires several * writes to write around the bad blocks. Each set of writes * gets its own r10_bio with a set of bios attached.
*/
if (!rdev && !rrdev) continue; if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
sector_t bad_sectors; int is_bad;
is_bad = is_badblock(rdev, dev_sector, max_sectors,
&first_bad, &bad_sectors); if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */
bad_sectors -= (dev_sector - first_bad); if (bad_sectors < max_sectors) /* Mustn't write more than bad_sectors * to other devices yet
*/
max_sectors = bad_sectors; continue;
} if (is_bad) { int good_sectors;
/* * We cannot atomically write this, so just * error in that case. It could be possible to * atomically write other mirrors, but the * complexity of supporting that is not worth * the benefit.
*/ if (bio->bi_opf & REQ_ATOMIC) {
error = -EIO; goto err_handle;
}
/* * There are some limitations to handle discard bio * 1st, the discard size is bigger than stripe_size*2. * 2st, if the discard bio spans reshape progress, we use the old way to * handle discard bio
*/ staticint raid10_handle_discard(struct mddev *mddev, struct bio *bio)
{ struct r10conf *conf = mddev->private; struct geom *geo = &conf->geo; int far_copies = geo->far_copies; bool first_copy = true; struct r10bio *r10_bio, *first_r10bio; struct bio *split; int disk;
sector_t chunk; unsignedint stripe_size; unsignedint stripe_data_disks;
sector_t split_size;
sector_t bio_start, bio_end;
sector_t first_stripe_index, last_stripe_index;
sector_t start_disk_offset; unsignedint start_disk_index;
sector_t end_disk_offset; unsignedint end_disk_index; unsignedint remainder;
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return -EAGAIN;
if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio); return 0;
}
/* * Check reshape again to avoid reshape happens after checking * MD_RECOVERY_RESHAPE and before wait_barrier
*/ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) goto out;
/* * Maybe one discard bio is smaller than strip size or across one * stripe and discard region is larger than one stripe size. For far * offset layout, if the discard region is not aligned with stripe * size, there is hole when we submit discard bio to member disk. * For simplicity, we only handle discard bio which discard region * is bigger than stripe_size * 2
*/ if (bio_sectors(bio) < stripe_size*2) goto out;
/* * Keep bio aligned with strip size.
*/
div_u64_rem(bio_start, stripe_size, &remainder); if (remainder) {
split_size = stripe_size - remainder;
split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); if (IS_ERR(split)) {
bio->bi_status = errno_to_blk_status(PTR_ERR(split));
bio_endio(bio); return 0;
}
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf); /* Resend the second split part */
submit_bio_noacct(bio);
bio = split;
wait_barrier(conf, false);
}
/* * Raid10 uses chunk as the unit to store data. It's similar like raid0. * One stripe contains the chunks from all member disk (one chunk from * one disk at the same HBA address). For layout detail, see 'man md 4'
*/
chunk = bio_start >> geo->chunk_shift;
chunk *= geo->near_copies;
first_stripe_index = chunk;
start_disk_index = sector_div(first_stripe_index, geo->raid_disks); if (geo->far_offset)
first_stripe_index *= geo->far_copies;
start_disk_offset = (bio_start & geo->chunk_mask) +
(first_stripe_index << geo->chunk_shift);
/* * For far layout it needs more than one r10bio to cover all regions. * Inspired by raid10_sync_request, we can use the first r10bio->master_bio * to record the discard bio. Other r10bio->master_bio record the first * r10bio. The first r10bio only release after all other r10bios finish. * The discard bio returns only first r10bio finishes
*/ if (first_copy) {
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
set_bit(R10BIO_Discard, &r10_bio->state);
first_copy = false;
first_r10bio = r10_bio;
} else
r10_bio->master_bio = (struct bio *)first_r10bio;
/* * first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio
*/ for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev, *rrdev;
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags)))
rrdev = NULL; if (!rdev && !rrdev) continue;
if (rdev) {
r10_bio->devs[disk].bio = bio;
atomic_inc(&rdev->nr_pending);
} if (rrdev) {
r10_bio->devs[disk].repl_bio = bio;
atomic_inc(&rrdev->nr_pending);
}
}
atomic_set(&r10_bio->remaining, 1); for (disk = 0; disk < geo->raid_disks; disk++) {
sector_t dev_start, dev_end; struct bio *mbio, *rbio = NULL;
/* * Now start to calculate the start and end address for each disk. * The space between dev_start and dev_end is the discard region. * * For dev_start, it needs to consider three conditions: * 1st, the disk is before start_disk, you can imagine the disk in * the next stripe. So the dev_start is the start address of next * stripe. * 2st, the disk is after start_disk, it means the disk is at the * same stripe of first disk * 3st, the first disk itself, we can use start_disk_offset directly
*/ if (disk < start_disk_index)
dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; elseif (disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors; else
dev_start = start_disk_offset;
/* check if there are enough drives for * every block to appear on atleast one. * Don't consider the device numbered 'ignore' * as we might be about to remove it.
*/ staticint _enough(struct r10conf *conf, int previous, int ignore)
{ int first = 0; int has_enough = 0; int disks, ncopies; if (previous) {
disks = conf->prev.raid_disks;
ncopies = conf->prev.near_copies;
} else {
disks = conf->geo.raid_disks;
ncopies = conf->geo.near_copies;
}
do { int n = conf->copies; int cnt = 0; intthis = first; while (n--) { struct md_rdev *rdev; if (this != ignore &&
(rdev = conf->mirrors[this].rdev) &&
test_bit(In_sync, &rdev->flags))
cnt++; this = (this+1) % disks;
} if (cnt == 0) goto out;
first = (first + ncopies) % disks;
} while (first != 0);
has_enough = 1;
out: return has_enough;
}
/*
 * enough() - _enough() for both the current and the previous geometry.
 * @conf:   array configuration
 * @ignore: device number to treat as absent
 *
 * When calling this, both 'prev' and 'geo' must be stable.  This is
 * ensured if ->reconfig_mutex or ->device_lock is held.
 *
 * Return: 1 if both geometries still have enough devices, 0 otherwise.
 */
static int enough(struct r10conf *conf, int ignore)
{
	/* current geometry first; the previous one is only checked if it passes */
	if (!_enough(conf, 0, ignore))
		return 0;

	return _enough(conf, 1, ignore) != 0;
}
/** * raid10_error() - RAID10 error handler. * @mddev: affected md device. * @rdev: member device to fail. * * The routine acknowledges &rdev failure and determines new @mddev state. * If it failed, then: * - &MD_BROKEN flag is set in &mddev->flags. * Otherwise, it must be degraded: * - recovery is interrupted. * - &mddev->degraded is bumped. * * @rdev is marked as &Faulty excluding case when array is failed and * &mddev->fail_last_dev is off.
*/ staticvoid raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{ struct r10conf *conf = mddev->private; unsignedlong flags;
spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
set_bit(MD_BROKEN, &mddev->flags);
if (!mddev->fail_last_dev) {
spin_unlock_irqrestore(&conf->device_lock, flags); return;
}
} if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->geo.raid_disks; i++) {
rdev = conf->mirrors[i].rdev; if (rdev)
pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
i, !test_bit(In_sync, &rdev->flags),
!test_bit(Faulty, &rdev->flags),
rdev->bdev);
}
}
/* * Find all non-in_sync disks within the RAID10 configuration * and mark them in_sync
*/ for (i = 0; i < conf->geo.raid_disks; i++) {
tmp = conf->mirrors + i; if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->replacement->flags)
&& !test_and_set_bit(In_sync, &tmp->replacement->flags)) { /* Replacement has just become active */ if (!tmp->rdev
|| !test_and_clear_bit(In_sync, &tmp->rdev->flags))
count++; if (tmp->rdev) { /* Replaced device not technically faulty, * but we need to be sure it gets removed * and never re-added.
*/
set_bit(Faulty, &tmp->rdev->flags);
sysfs_notify_dirent_safe(
tmp->rdev->sysfs_state);
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} elseif (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
spin_unlock_irqrestore(&conf->device_lock, flags);
print_conf(conf); return count;
}
staticint raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ struct r10conf *conf = mddev->private; int err = -EEXIST; int mirror, repl_slot = -1; int first = 0; int last = conf->geo.raid_disks - 1; struct raid10_info *p;
if (mddev->resync_offset < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync
*/ return -EBUSY; if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
if (rdev->saved_raid_disk >= first &&
rdev->saved_raid_disk < conf->geo.raid_disks &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk; else
mirror = first; for ( ; mirror <= last ; mirror++) {
p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; if (p->rdev) { if (test_bit(WantReplacement, &p->rdev->flags) &&
p->replacement == NULL && repl_slot < 0)
repl_slot = mirror; continue;
}
/*
 * __end_sync_read() - shared completion work for a finished sync read.
 * @r10_bio: the r10bio the read belongs to.
 * @bio: the read bio that has completed; only its bi_status is examined.
 * @d: index into conf->mirrors[] of the device the read was issued to.
 *
 * On success R10BIO_Uptodate is set on @r10_bio.  On failure the sector
 * count of @r10_bio is added to that device's corrected_errors counter.
 * Either way the pending reference on the device is dropped and, if this
 * is a recovery request or the last outstanding read, @r10_bio is handed to
 * reschedule_retry().
 */
static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
	struct r10conf *conf = r10_bio->mddev->private;

	if (bio->bi_status) {
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);
	} else {
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	}

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);

	/* The order matters: ->remaining is only decremented when the
	 * request is not a recovery request.
	 */
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}
/*
 * end_sync_read() - completion callback for a sync read bio.
 * @bio: the completed read bio.
 *
 * Looks up the owning r10bio with get_resync_r10bio(), resolves which
 * mirror the bio targeted with find_bio_disk() (the two optional output
 * arguments are not needed and are passed as NULL), and delegates the rest
 * to __end_sync_read().
 */
static void end_sync_read(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct r10conf *conf = r10_bio->mddev->private;
	int disk;

	disk = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
	__end_sync_read(r10_bio, bio, disk);
}
/*
 * end_reshape_read - bio completion handler; the r10bio comes from
 * bio->bi_private rather than from get_resync_r10bio().
 *
 * As written, the body walks the chain of r10bios linked through
 * ->master_bio.  Each step drops one ->remaining reference and the walk
 * stops as soon as a count does not reach zero.  An r10bio whose count hits
 * zero goes to reschedule_retry() when R10BIO_MadeGood or R10BIO_WriteError
 * is set, otherwise to put_buf().  When the r10bio with a NULL ->master_bio
 * is finished, md_done_sync() is called with its sector count and the walk
 * ends.
 *
 * NOTE(review): 'mddev' is used below but is not declared anywhere in this
 * block, and nothing here is specific to reshape reads.  The loop presumably
 * belongs to a different function whose header was lost, while this
 * function's real body is missing - confirm against the upstream source.
 */
staticvoid end_reshape_read(struct bio *bio)
{ /* reshape read bio isn't allocated from r10buf_pool */ struct r10bio *r10_bio = bio->bi_private;
while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */
/* ->sectors is copied before r10_bio is handed off; presumably
 * put_buf() may release it - verify.
 */
sector_t s = r10_bio->sectors; if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
test_bit(R10BIO_WriteError, &r10_bio->state))
reschedule_retry(r10_bio); else
put_buf(r10_bio);
md_done_sync(mddev, s, 1); break;
/* Not the primary: remember the next r10bio in the chain before this
 * one is handed off, then continue with it.
 */
} else { struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
test_bit(R10BIO_WriteError, &r10_bio->state))
reschedule_retry(r10_bio); else
put_buf(r10_bio);
r10_bio = r10_bio2;
}
}
}
staticvoid end_sync_write(struct bio *bio)
{ struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; int slot; int repl; struct md_rdev *rdev = NULL;
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); if (repl)
rdev = conf->mirrors[d].replacement; else
rdev = conf->mirrors[d].rdev;
if (bio->bi_status) { if (repl)
md_error(mddev, rdev); else {
set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
} elseif (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors)) {
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
rdev_dec_pending(rdev, mddev);
end_sync_request(r10_bio);
}
/* * Note: sync and recover are handled very differently for raid10 * This code is for resync. * For resync, we read through virtual addresses and read all blocks. * If there is any error, we schedule a write. The lowest numbered * drive is authoritative.
--> --------------------
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.