// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */
/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) copies of each chunk, and each is on a
 * different drive.  near_copies and far_copies must be at least one, and
 * their product is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */
/*
 * Forward declarations for functions that are referenced before their
 * definitions appear in this file.
 */

/* barrier handling */
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);

/* redundancy checks */
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);

/* reshape */
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
#include"raid1-10.c"
/*
 * Helpers for running a command with conf->resync_lock temporarily dropped:
 *
 *   NULL_CMD         - the empty command.
 *   cmd_before(c, x) - release c->resync_lock, then execute x.
 *   cmd_after(c)     - take c->resync_lock again.
 *
 * Each directive must sit on its own logical line; the multi-statement
 * macro is wrapped in do { } while (0) so it behaves as a single statement.
 */
#define NULL_CMD
#define cmd_before(conf, cmd) \
	do { \
		write_sequnlock_irq(&(conf)->resync_lock); \
		cmd; \
	} while (0)
#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)
/* * for resync bio, r10bio pointer can be retrieved from the per-bio * 'struct resync_pages'.
*/ staticinlinestruct r10bio *get_resync_r10bio(struct bio *bio)
{ return get_resync_pages(bio)->raid_bio;
}
/* allocate a r10bio with room for raid_disks entries in the
* bios array */ return kzalloc(size, gfp_flags);
}
/* size of one resync block, expressed in 512-byte sectors */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
/* resync window used for clustered arrays: 32 normal windows, in bytes... */
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
/* ...and the same window expressed in 512-byte sectors */
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/* * When performing a resync, we need to read and compare, so * we need as many pages are there are copies. * When performing a recovery, we need 2 bios, one for read, * one for write (we recover only one drive per r10buf) *
*/ staticvoid * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{ struct r10conf *conf = data; struct r10bio *r10_bio; struct bio *bio; int j; int nalloc, nalloc_rp; struct resync_pages *rps;
r10_bio = r10bio_pool_alloc(gfp_flags, conf); if (!r10_bio) return NULL;
/* wake up frozen array... */
wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
}
/* * raid_end_bio_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer.
*/ staticvoid raid_end_bio_io(struct r10bio *r10_bio)
{ struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle.
*/
allow_barrier(conf);
free_r10bio(r10_bio);
}
/* * Update disk head position estimator based on IRQ completion info.
*/ staticinlinevoid update_head_pos(int slot, struct r10bio *r10_bio)
{ struct r10conf *conf = r10_bio->mddev->private;
/*
 * Find the disk number which triggered the given bio.
 *
 * Scans the slots of @r10_bio for the one whose ->bio or ->repl_bio is
 * @bio, updates the head position estimate for that slot and returns the
 * slot's devnum.
 *
 * @slotp: if non-NULL, receives the slot index.
 * @replp: if non-NULL, receives 1 when @bio matched ->repl_bio, else 0.
 *
 * NOTE(review): when no slot matches, the index equals geo.raid_disks on
 * loop exit and is still used below; callers presumably only pass bios
 * that are attached to @r10_bio - confirm.
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int is_repl = 0;
	int idx = 0;

	while (idx < conf->geo.raid_disks) {
		if (r10_bio->devs[idx].bio == bio)
			break;
		if (r10_bio->devs[idx].repl_bio == bio) {
			is_repl = 1;
			break;
		}
		idx++;
	}

	update_head_pos(idx, r10_bio);

	if (slotp)
		*slotp = idx;
	if (replp)
		*replp = is_repl;

	return r10_bio->devs[idx].devnum;
}
staticvoid raid10_end_read_request(struct bio *bio)
{ int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot;
rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler:
*/
update_head_pos(slot, r10_bio);
if (uptodate) { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code to the higher * levels even if IO on some other mirrored buffer fails. * * The 'master' represents the composite IO operation to * user-side. So if something waits for IO, then it will * wait for the 'master' bio.
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
} elseif (!raid1_should_handle_error(bio)) {
uptodate = 1;
} else { /* If all other devices that store this block have * failed, we want to return the error upwards rather * than fail the last device. Here we redefine * "uptodate" to mean "Don't want to retry"
*/ if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
rdev->raid_disk))
uptodate = 1;
} if (uptodate) {
raid_end_bio_io(r10_bio);
rdev_dec_pending(rdev, conf->mddev);
} else { /* * oops, read error - keep the refcount on the rdev
*/
pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n",
mdname(conf->mddev),
rdev->bdev,
(unsignedlonglong)r10_bio->sector);
set_bit(R10BIO_ReadError, &r10_bio->state);
reschedule_retry(r10_bio);
}
}
/*
 * Account for one finished per-device write of @r10_bio.
 *
 * Nothing happens until the last outstanding write drops ->remaining to
 * zero.  Then: a recorded write error sends the r10bio to the retry path;
 * otherwise the write is closed and the r10bio is either sent to the retry
 * path (R10BIO_MadeGood set) or completed.
 */
static void one_write_done(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;

	if (test_bit(R10BIO_WriteError, &r10_bio->state)) {
		reschedule_retry(r10_bio);
		return;
	}

	close_write(r10_bio);

	if (test_bit(R10BIO_MadeGood, &r10_bio->state))
		reschedule_retry(r10_bio);
	else
		raid_end_bio_io(r10_bio);
}
staticvoid raid10_end_write_request(struct bio *bio)
{ struct r10bio *r10_bio = bio->bi_private; int dev; int dec_rdev = 1; struct r10conf *conf = r10_bio->mddev->private; int slot, repl; struct md_rdev *rdev = NULL; struct bio *to_put = NULL; bool ignore_error = !raid1_should_handle_error(bio) ||
(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[dev].replacement; if (!rdev) {
smp_rmb();
repl = 0;
rdev = conf->mirrors[dev].rdev;
} /* * this branch is our 'one mirror IO has finished' event handler:
*/ if (bio->bi_status && !ignore_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it.
*/
md_error(rdev->mddev, rdev); else {
set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
/* * When the device is faulty, it is not necessary to * handle write error.
*/ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state); else { /* Fail the request */
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
}
}
} else { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher * levels even if IO on some other mirrored buffer fails. * * The 'master' represents the composite IO operation to * user-side. So if something waits for IO, then it will * wait for the 'master' bio. * * Do not set R10BIO_Uptodate if the current device is * rebuilding or Faulty. This is because we cannot use * such device for properly reading the data back (we could * potentially use it, if the current write would have felt * before rdev->recovery_offset, but for simplicity we don't * check this here.
*/ if (test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors) &&
!ignore_error) {
bio_put(bio); if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; else
r10_bio->devs[slot].bio = IO_MADE_GOOD;
dec_rdev = 0;
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
}
/* * * Let's see if all mirrored write operations have finished * already.
*/
one_write_done(r10_bio); if (dec_rdev)
rdev_dec_pending(rdev, conf->mddev); if (to_put)
bio_put(to_put);
}
/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, any block is never stored twice on the one device.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address
 */
/*
 * Fill r10bio->devs[] with the device number and device sector of every
 * copy of the virtual sector r10bio->sector, for the geometry @geo.
 *
 * Slots are filled in order: for each near copy, first the near copy
 * itself, then its (far_copies - 1) far copies.
 *
 * When raid_disks is not a multiple of far_set_size, the last far set
 * absorbs the leftover devices, so it starts at last_far_set_start and is
 * last_far_set_size devices wide; far copies that land in it wrap within
 * that enlarged set.
 */
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	/* first device of the last (possibly enlarged) far set */
	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	/* the last far set also holds the remainder devices */
	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;

		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				/* wrap inside the enlarged last far set */
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				/* wrap inside the regular far set 'set' */
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			/* ran off the last device: continue in the next chunk row */
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}
/*
 * Reverse mapping: translate sector @sector on device @dev back to the
 * virtual (array) sector it belongs to, using the current geometry.
 *
 * The far-set bounds used to undo the per-copy device shift must match the
 * forward mapping: when raid_disks is not a multiple of far_set_size the
 * last far set is enlarged by the leftover devices, so a device inside it
 * uses that set's start and size.
 */
static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			/* @dev lives in the enlarged last far set */
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;

		chunk = sector >> geo->chunk_shift;
		/* fc: which far copy this chunk is; chunk becomes the stripe */
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		/* peel off one far section at a time, undoing its device shift */
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
/* * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry.
*/ staticstruct md_rdev *read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
{ const sector_t this_sector = r10_bio->sector; int disk, slot; int sectors = r10_bio->sectors; int best_good_sectors;
sector_t new_distance, best_dist; struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; int do_balance; int best_dist_slot, best_pending_slot; bool has_nonrot_disk = false; unsignedint min_pending; struct geom *geo = &conf->geo;
if (best_dist_slot >= 0) /* At least 2 disks to choose from so failfast is OK */
set_bit(R10BIO_FailFast, &r10_bio->state); /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later.
*/ if (geo->near_copies > 1 && !pending)
new_distance = 0;
/* for far > 1 always use the lowest address */ elseif (geo->far_copies > 1)
new_distance = r10_bio->devs[slot].addr; else
new_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
staticvoid flush_pending_writes(struct r10conf *conf)
{ /* Any writes that have been queued but are awaiting * bitmap updates get flushed here.
*/
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) { struct blk_plug plug; struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
spin_unlock_irq(&conf->device_lock);
/* * As this is called in a wait_event() loop (see freeze_array), * current->state might be TASK_UNINTERRUPTIBLE which will * cause a warning when we prepare to wait again. As it is * rare that this path is taken, it is perfectly safe to force * us to go around the wait_event() loop again, so the warning * is a false-positive. Silence the warning by resetting * thread state
*/
__set_current_state(TASK_RUNNING);
/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */
staticvoid raise_barrier(struct r10conf *conf, int force)
{
write_seqlock_irq(&conf->resync_lock);
if (WARN_ON_ONCE(force && !conf->barrier))
force = false;
/* Wait until no block IO is waiting (unless 'force') */
wait_event_barrier(conf, force || !conf->nr_waiting);
/* block any new IO from starting */
WRITE_ONCE(conf->barrier, conf->barrier + 1);
/* Now wait for all pending IO to complete */
wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
conf->barrier < RESYNC_DEPTH);
/* barrier is dropped */ if (!conf->barrier) returntrue;
/* * If there are already pending requests (preventing the barrier from * rising completely), and the pre-process bio queue isn't empty, then * don't wait, as we need to empty that queue to get the nr_pending * count down.
*/ if (atomic_read(&conf->nr_pending) && bio_list &&
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) returntrue;
/* daemon thread must exist while handling io */
thread = rcu_dereference_protected(conf->mddev->thread, true); /* * move on if io is issued from raid10d(), nr_pending is not released * from original io(see handle_read_error()). All raise barrier is * blocked until this io is done.
*/ if (thread->tsk == current) {
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); returntrue;
}
write_seqlock_irq(&conf->resync_lock); if (conf->barrier) { /* Return false when nowait flag is set */ if (nowait) {
ret = false;
} else {
conf->nr_waiting++;
mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
wait_event_barrier(conf, stop_waiting_barrier(conf));
conf->nr_waiting--;
} if (!conf->nr_waiting)
wake_up(&conf->wait_barrier);
} /* Only increment nr_pending when we wait */ if (ret)
atomic_inc(&conf->nr_pending);
write_sequnlock_irq(&conf->resync_lock); return ret;
}
/*
 * Stop sync IO and normal IO and wait for everything to go quiet.
 *
 * The barrier and nr_waiting are raised, then we wait until nr_pending
 * equals nr_queued + @extra.  This is called in the context of one normal
 * IO request that has failed: any pending sync request is blocked by
 * nr_pending, so we only need to wait for pending IO requests to complete
 * or be queued for retry.  Hence the number queued (nr_queued) plus this
 * request (@extra) must match the number of pending IOs (nr_pending)
 * before we continue.
 *
 * Pending writes are flushed each time around the wait.
 */
static void freeze_array(struct r10conf *conf, int extra)
{
	write_seqlock_irq(&conf->resync_lock);

	conf->array_freeze_pending++;
	WRITE_ONCE(conf->barrier, conf->barrier + 1);
	conf->nr_waiting++;

	wait_event_barrier_cmd(conf,
			       atomic_read(&conf->nr_pending) ==
					conf->nr_queued + extra,
			       flush_pending_writes(conf));

	conf->array_freeze_pending--;

	write_sequnlock_irq(&conf->resync_lock);
}
/*
 * Undo freeze_array(): lower the barrier and nr_waiting again under
 * resync_lock and wake everybody sleeping on wait_barrier.
 */
static void unfreeze_array(struct r10conf *conf)
{
	write_seqlock_irq(&conf->resync_lock);

	WRITE_ONCE(conf->barrier, conf->barrier - 1);
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);

	write_sequnlock_irq(&conf->resync_lock);
}
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
raid1_prepare_flush_writes(mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next;
raid1_submit_write(bio);
bio = next;
cond_resched();
}
kfree(plug);
}
/* * 1. Register the new request and wait if the reconstruction thread has put * up a bar for new requests. Continue immediately if no resync is active * currently. * 2. If IO spans the reshape position. Need to wait for reshape to pass.
*/ staticbool regular_request_wait(struct mddev *mddev, struct r10conf *conf, struct bio *bio, sector_t sectors)
{ /* Bail out if REQ_NOWAIT is set for the bio */ if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio); returnfalse;
} while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
allow_barrier(conf); if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); returnfalse;
}
mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector +
sectors);
wait_barrier(conf, false);
} returntrue;
}
staticvoid raid10_read_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio, bool io_accounting)
{ struct r10conf *conf = mddev->private; struct bio *read_bio; int max_sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO; int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, * we must use the one in conf. * If it has already been disconnected (unlikely) * we lose the device name in error messages.
*/ int disk; /* * As we are blocking raid10, it is a little safer to * use __GFP_HIGH.
*/
gfp = GFP_NOIO | __GFP_HIGH;
disk = r10_bio->devs[slot].devnum;
err_rdev = conf->mirrors[disk].rdev; if (err_rdev)
snprintf(b, sizeof(b), "%pg", err_rdev->bdev); else {
strcpy(b, "???"); /* This never gets dereferenced */
err_rdev = r10_bio->devs[slot].rdev;
}
}
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
raid_end_bio_io(r10_bio); return;
}
rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { if (err_rdev) {
pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
mdname(mddev), b,
(unsignedlonglong)r10_bio->sector);
}
raid_end_bio_io(r10_bio); return;
} if (err_rdev)
pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n",
mdname(mddev),
rdev->bdev,
(unsignedlonglong)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split); if (IS_ERR(split)) {
error = PTR_ERR(split); goto err_handle;
}
retry_wait:
blocked_rdev = NULL; for (i = 0; i < conf->copies; i++) { struct md_rdev *rdev, *rrdev;
rdev = conf->mirrors[i].rdev; if (rdev) {
sector_t dev_sector = r10_bio->devs[i].addr;
/* * Discard request doesn't care the write result * so it doesn't need to wait blocked disk here.
*/ if (test_bit(WriteErrorSeen, &rdev->flags) &&
r10_bio->sectors &&
rdev_has_badblock(rdev, dev_sector,
r10_bio->sectors) < 0) /* * Mustn't write here until the bad * block is acknowledged
*/
set_bit(BlockedBadBlocks, &rdev->flags);
if (rdev_blocked(rdev)) {
blocked_rdev = rdev;
atomic_inc(&rdev->nr_pending); break;
}
}
if (unlikely(blocked_rdev)) { /* Have to wait for this device to get unblocked, then retry */
allow_barrier(conf);
mddev_add_trace_msg(conf->mddev, "raid10 %s wait rdev %d blocked",
__func__, blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, false); goto retry_wait;
}
}
staticvoid raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio)
{ struct r10conf *conf = mddev->private; int i, k;
sector_t sectors; int max_sectors; int error;
if ((mddev_is_clustered(mddev) &&
mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))) {
DEFINE_WAIT(w); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); return;
} for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE); if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))) break;
schedule();
}
finish_wait(&conf->wait_barrier, &w);
}
/* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio * If there are known/acknowledged bad blocks on any device * on which we have seen a write error, we want to avoid * writing to those blocks. This potentially requires several * writes to write around the bad blocks. Each set of writes * gets its own r10_bio with a set of bios attached.
*/
if (!rdev && !rrdev) continue; if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
sector_t bad_sectors; int is_bad;
is_bad = is_badblock(rdev, dev_sector, max_sectors,
&first_bad, &bad_sectors); if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */
bad_sectors -= (dev_sector - first_bad); if (bad_sectors < max_sectors) /* Mustn't write more than bad_sectors * to other devices yet
*/
max_sectors = bad_sectors; continue;
} if (is_bad) { int good_sectors;
/* * We cannot atomically write this, so just * error in that case. It could be possible to * atomically write other mirrors, but the * complexity of supporting that is not worth * the benefit.
*/ if (bio->bi_opf & REQ_ATOMIC) {
error = -EIO; goto err_handle;
}
/* * There are some limitations to handle discard bio * 1st, the discard size is bigger than stripe_size*2. * 2st, if the discard bio spans reshape progress, we use the old way to * handle discard bio
*/ staticint raid10_handle_discard(struct mddev *mddev, struct bio *bio)
{ struct r10conf *conf = mddev->private; struct geom *geo = &conf->geo; int far_copies = geo->far_copies; bool first_copy = true; struct r10bio *r10_bio, *first_r10bio; struct bio *split; int disk;
sector_t chunk; unsignedint stripe_size; unsignedint stripe_data_disks;
sector_t split_size;
sector_t bio_start, bio_end;
sector_t first_stripe_index, last_stripe_index;
sector_t start_disk_offset; unsignedint start_disk_index;
sector_t end_disk_offset; unsignedint end_disk_index; unsignedint remainder;
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return -EAGAIN;
if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
bio_wouldblock_error(bio); return 0;
}
/* * Check reshape again to avoid reshape happens after checking * MD_RECOVERY_RESHAPE and before wait_barrier
*/ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) goto out;
/* * Maybe one discard bio is smaller than strip size or across one * stripe and discard region is larger than one stripe size. For far * offset layout, if the discard region is not aligned with stripe * size, there is hole when we submit discard bio to member disk. * For simplicity, we only handle discard bio which discard region * is bigger than stripe_size * 2
*/ if (bio_sectors(bio) < stripe_size*2) goto out;
/* * Keep bio aligned with strip size.
*/
div_u64_rem(bio_start, stripe_size, &remainder); if (remainder) {
split_size = stripe_size - remainder;
split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); if (IS_ERR(split)) {
bio->bi_status = errno_to_blk_status(PTR_ERR(split));
bio_endio(bio); return 0;
}
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
allow_barrier(conf); /* Resend the second split part */
submit_bio_noacct(bio);
bio = split;
wait_barrier(conf, false);
}
/* * Raid10 uses chunk as the unit to store data. It's similar like raid0. * One stripe contains the chunks from all member disk (one chunk from * one disk at the same HBA address). For layout detail, see 'man md 4'
*/
chunk = bio_start >> geo->chunk_shift;
chunk *= geo->near_copies;
first_stripe_index = chunk;
start_disk_index = sector_div(first_stripe_index, geo->raid_disks); if (geo->far_offset)
first_stripe_index *= geo->far_copies;
start_disk_offset = (bio_start & geo->chunk_mask) +
(first_stripe_index << geo->chunk_shift);
/* * For far layout it needs more than one r10bio to cover all regions. * Inspired by raid10_sync_request, we can use the first r10bio->master_bio * to record the discard bio. Other r10bio->master_bio record the first * r10bio. The first r10bio only release after all other r10bios finish. * The discard bio returns only first r10bio finishes
*/ if (first_copy) {
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
set_bit(R10BIO_Discard, &r10_bio->state);
first_copy = false;
first_r10bio = r10_bio;
} else
r10_bio->master_bio = (struct bio *)first_r10bio;
/* * first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio
*/ for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev, *rrdev;
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags)))
rrdev = NULL; if (!rdev && !rrdev) continue;
if (rdev) {
r10_bio->devs[disk].bio = bio;
atomic_inc(&rdev->nr_pending);
} if (rrdev) {
r10_bio->devs[disk].repl_bio = bio;
atomic_inc(&rrdev->nr_pending);
}
}
atomic_set(&r10_bio->remaining, 1); for (disk = 0; disk < geo->raid_disks; disk++) {
sector_t dev_start, dev_end; struct bio *mbio, *rbio = NULL;
/* * Now start to calculate the start and end address for each disk. * The space between dev_start and dev_end is the discard region. * * For dev_start, it needs to consider three conditions: * 1st, the disk is before start_disk, you can imagine the disk in * the next stripe. So the dev_start is the start address of next * stripe. * 2st, the disk is after start_disk, it means the disk is at the * same stripe of first disk * 3st, the first disk itself, we can use start_disk_offset directly
*/ if (disk < start_disk_index)
dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; elseif (disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors; else
dev_start = start_disk_offset;
/* check if there are enough drives for * every block to appear on atleast one. * Don't consider the device numbered 'ignore' * as we might be about to remove it.
*/ staticint _enough(struct r10conf *conf, int previous, int ignore)
{ int first = 0; int has_enough = 0; int disks, ncopies; if (previous) {
disks = conf->prev.raid_disks;
ncopies = conf->prev.near_copies;
} else {
disks = conf->geo.raid_disks;
ncopies = conf->geo.near_copies;
}
do { int n = conf->copies; int cnt = 0; intthis = first; while (n--) { struct md_rdev *rdev; if (this != ignore &&
(rdev = conf->mirrors[this].rdev) &&
test_bit(In_sync, &rdev->flags))
cnt++; this = (this+1) % disks;
} if (cnt == 0) goto out;
first = (first + ncopies) % disks;
} while (first != 0);
has_enough = 1;
out: return has_enough;
}
/*
 * Return non-zero only if both the current ('geo') and the previous
 * ('prev') geometry still have enough devices when device @ignore is
 * left out.
 *
 * When calling 'enough', both 'prev' and 'geo' must be stable.
 * This is ensured if ->reconfig_mutex or ->device_lock is held.
 */
static int enough(struct r10conf *conf, int ignore)
{
	if (!_enough(conf, 0, ignore))
		return 0;

	return _enough(conf, 1, ignore) != 0;
}
/** * raid10_error() - RAID10 error handler. * @mddev: affected md device. * @rdev: member device to fail. * * The routine acknowledges &rdev failure and determines new @mddev state. * If it failed, then: * - &MD_BROKEN flag is set in &mddev->flags. * Otherwise, it must be degraded: * - recovery is interrupted. * - &mddev->degraded is bumped. * * @rdev is marked as &Faulty excluding case when array is failed and * &mddev->fail_last_dev is off.
*/ staticvoid raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{ struct r10conf *conf = mddev->private; unsignedlong flags;
spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
set_bit(MD_BROKEN, &mddev->flags);
if (!mddev->fail_last_dev) {
spin_unlock_irqrestore(&conf->device_lock, flags); return;
}
} if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->geo.raid_disks; i++) {
rdev = conf->mirrors[i].rdev; if (rdev)
pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
i, !test_bit(In_sync, &rdev->flags),
!test_bit(Faulty, &rdev->flags),
rdev->bdev);
}
}
/* * Find all non-in_sync disks within the RAID10 configuration * and mark them in_sync
*/ for (i = 0; i < conf->geo.raid_disks; i++) {
tmp = conf->mirrors + i; if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->replacement->flags)
&& !test_and_set_bit(In_sync, &tmp->replacement->flags)) { /* Replacement has just become active */ if (!tmp->rdev
|| !test_and_clear_bit(In_sync, &tmp->rdev->flags))
count++; if (tmp->rdev) { /* Replaced device not technically faulty, * but we need to be sure it gets removed * and never re-added.
*/
set_bit(Faulty, &tmp->rdev->flags);
sysfs_notify_dirent_safe(
tmp->rdev->sysfs_state);
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} elseif (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
spin_unlock_irqrestore(&conf->device_lock, flags);
print_conf(conf); return count;
}
staticint raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ struct r10conf *conf = mddev->private; int err = -EEXIST; int mirror, repl_slot = -1; int first = 0; int last = conf->geo.raid_disks - 1; struct raid10_info *p;
if (mddev->resync_offset < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync
*/ return -EBUSY; if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
if (rdev->saved_raid_disk >= first &&
rdev->saved_raid_disk < conf->geo.raid_disks &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk; else
mirror = first; for ( ; mirror <= last ; mirror++) {
p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; if (p->rdev) { if (test_bit(WantReplacement, &p->rdev->flags) &&
p->replacement == NULL && repl_slot < 0)
repl_slot = mirror; continue;
}
/*
 * __end_sync_read() - shared completion handling for a sync/recovery read.
 * @r10_bio: the r10bio the completed read belongs to.
 * @bio:     the bio that has just completed.
 * @d:       index into conf->mirrors[] of the device that was read.
 *
 * A successful read sets R10BIO_Uptodate; a failed one adds the request
 * size to the device's corrected_errors counter.  The pending reference on
 * the device is then dropped, and the r10bio is handed to
 * reschedule_retry() either immediately (R10BIO_IsRecover) or once the
 * last outstanding read has finished (->remaining drops to zero).
 */
static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
	struct r10conf *conf = r10_bio->mddev->private;

	if (!bio->bi_status) {
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/* The write handler will notice the lack of
		 * R10BIO_Uptodate and record any errors etc
		 */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);
	}

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
}
/*
 * end_sync_read() - completion callback for a resync/recovery read bio.
 * @bio: the bio that completed.
 *
 * Looks up the owning r10bio with get_resync_r10bio(), resolves which
 * device the bio was issued to with find_bio_disk(), and hands both to
 * __end_sync_read() for the actual bookkeeping.
 */
static void end_sync_read(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct r10conf *conf = r10_bio->mddev->private;
	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

	__end_sync_read(r10_bio, bio, d);
}
/*
 * end_reshape_read() - as visible here: take the r10bio from
 * bio->bi_private and walk the ->master_bio chain, releasing every r10bio
 * whose ->remaining count drops to zero.
 *
 * NOTE(review): the loop below uses 'mddev', which is not declared in this
 * function, and it never inspects @bio's status.  It looks like the body of
 * a generic "sync request finished" helper ended up under this header -
 * TODO confirm against the authoritative version of this file.
 */
staticvoid end_reshape_read(struct bio *bio)
{ /* reshape read bio isn't allocated from r10buf_pool */ struct r10bio *r10_bio = bio->bi_private;
/* Keep going only while the r10bio just decremented has no references left. */
while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */
/* Save the size first: r10_bio may be handed off or released just below. */
sector_t s = r10_bio->sectors; if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
test_bit(R10BIO_WriteError, &r10_bio->state))
reschedule_retry(r10_bio); else
put_buf(r10_bio);
/* End of the chain: report the sectors as done and stop walking. */
md_done_sync(mddev, s, 1); break;
} else { struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
test_bit(R10BIO_WriteError, &r10_bio->state))
reschedule_retry(r10_bio); else
put_buf(r10_bio);
/* Move up to the r10bio this one was linked to via master_bio. */
r10_bio = r10_bio2;
}
}
}
staticvoid end_sync_write(struct bio *bio)
{ struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; int slot; int repl; struct md_rdev *rdev = NULL;
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); if (repl)
rdev = conf->mirrors[d].replacement; else
rdev = conf->mirrors[d].rdev;
if (bio->bi_status) { if (repl)
md_error(mddev, rdev); else {
set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
} elseif (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors)) {
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
rdev_dec_pending(rdev, mddev);
end_sync_request(r10_bio);
}
/*
 * Note: sync and recover are handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
*/ /* * We check if all blocks are in-sync and only write to blocks that * aren't in sync
*/ staticvoid sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ struct r10conf *conf = mddev->private; int i, first; struct bio *tbio, *fbio; int vcnt; struct page **tpages, **fpages;
atomic_set(&r10_bio->remaining, 1);
/* find the first device with a block */ for (i=0; i<conf->copies; i++) if (!r10_bio->devs[i].bio->bi_status) break;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); /* now find blocks with errors */ for (i=0 ; i < conf->copies ; i++) { int j, d; struct md_rdev *rdev; struct resync_pages *rp;
tbio = r10_bio->devs[i].bio;
if (tbio->bi_end_io != end_sync_read) continue; if (i == first) continue;
tpages = get_resync_pages(tbio)->pages;
d = r10_bio->devs[i].devnum;
rdev = conf->mirrors[d].rdev; if (!r10_bio->devs[i].bio->bi_status) { /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE;
*/ int sectors = r10_bio->sectors; for (j = 0; j < vcnt; j++) { int len = PAGE_SIZE; if (sectors < (len / 512))
len = sectors * 512; if (memcmp(page_address(fpages[j]),
page_address(tpages[j]),
len)) break;
sectors -= len/512;
} if (j == vcnt) continue;
atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue;
} elseif (test_bit(FailFast, &rdev->flags)) { /* Just give up on this device */
md_error(rdev->mddev, rdev); continue;
} /* Ok, we need to write this bio, either to correct an * inconsistency or to correct an unreadable block. * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these
*/
rp = get_resync_pages(tbio);
bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE);
/* Now write out to any replacement devices * that are active
*/ for (i = 0; i < conf->copies; i++) {
tbio = r10_bio->devs[i].repl_bio; if (!tbio || !tbio->bi_end_io) continue; if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
&& r10_bio->devs[i].bio != fbio)
bio_copy_data(tbio, fbio);
atomic_inc(&r10_bio->remaining);
submit_bio_noacct(tbio);
}
done: if (atomic_dec_and_test(&r10_bio->remaining)) {
md_done_sync(mddev, r10_bio->sectors, 1);
put_buf(r10_bio);
}
}
/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choose a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use. The first for reading,
 * the second for writing.
 *
*/ staticvoid fix_recovery_read_error(struct r10bio *r10_bio)
{ /* We got a read error during recovery. * We repeat the read in smaller page-sized sections. * If a read succeeds, write it to the new device or record * a bad block if we cannot. * If a read fails, record a bad block on both old and * new devices.
*/ struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; struct bio *bio = r10_bio->devs[0].bio;
sector_t sect = 0; int sectors = r10_bio->sectors; int idx = 0; int dr = r10_bio->devs[0].devnum; int dw = r10_bio->devs[1].devnum; struct page **pages = get_resync_pages(bio)->pages;
while (sectors) { int s = sectors; struct md_rdev *rdev;
sector_t addr; int ok;
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;
rdev = conf->mirrors[dr].rdev;
addr = r10_bio->devs[0].addr + sect;
ok = sync_page_io(rdev,
addr,
s << 9,
pages[idx],
REQ_OP_READ, false); if (ok) {
rdev = conf->mirrors[dw].rdev;
addr = r10_bio->devs[1].addr + sect;
ok = sync_page_io(rdev,
addr,
s << 9,
pages[idx],
REQ_OP_WRITE, false); if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement,
&rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
}
} if (!ok) { /* We don't worry if we cannot set a bad block - * it really is bad so there is no loss in not * recording it yet
*/
rdev_set_badblocks(rdev, addr, s, 0);
if (rdev != conf->mirrors[dw].rdev) { /* need bad block on destination too */ struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
addr = r10_bio->devs[1].addr + sect;
ok = rdev_set_badblocks(rdev2, addr, s, 0); if (!ok) { /* just abort the recovery */
pr_notice("md/raid10:%s: recovery aborted due to read error\n",
mdname(mddev));
/*
 * recovery_request_write() - issue the write half of a recovery r10bio.
 * @mddev:   the md device (its ->private is the r10conf used here).
 * @r10_bio: recovery request; slot 1 carries the write bio and, optionally,
 *           a second write bio for the replacement device.
 *
 * If the preceding read did not leave R10BIO_Uptodate set, the read error
 * is handled by fix_recovery_read_error() and end_sync_request() is called
 * once per write that would otherwise have been submitted.  Otherwise each
 * active write bio is submitted after taking a pending reference on its
 * target device.
 */
static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int d;
	struct bio *wbio = r10_bio->devs[1].bio;
	struct bio *wbio2 = r10_bio->devs[1].repl_bio;

	/* Need to test wbio2->bi_end_io before we call
	 * submit_bio_noacct as if the former is NULL,
	 * the latter is free to free wbio2.
	 */
	if (wbio2 && !wbio2->bi_end_io)
		wbio2 = NULL;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
		fix_recovery_read_error(r10_bio);
		if (wbio->bi_end_io)
			end_sync_request(r10_bio);
		if (wbio2)
			end_sync_request(r10_bio);
		return;
	}

	/*
	 * share the pages with the first bio
	 * and submit the write request
	 */
	d = r10_bio->devs[1].devnum;
	if (wbio->bi_end_io) {
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		submit_bio_noacct(wbio);
	}
	if (wbio2) {
		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
		submit_bio_noacct(wbio2);
	}
}
/*
 * r10_sync_page_io() - synchronous single-page I/O with bad-block handling.
 * @rdev:    device to access.
 * @sector:  start sector, passed unchanged to sync_page_io().
 * @sectors: length in sectors (converted to bytes with << 9 for the I/O).
 * @page:    data page to read into or write from.
 * @op:      REQ_OP_READ or REQ_OP_WRITE.
 *
 * Return:
 *   -1  the range already has a recorded bad block and the I/O was not
 *       attempted (always for reads; for writes only if WriteErrorSeen is
 *       set on @rdev);
 *    1  the I/O succeeded;
 *    0  the I/O failed.  A failed write also sets WriteErrorSeen and
 *       WantReplacement (requesting recovery the first time).  The range
 *       is recorded as bad, and if that cannot be recorded the whole
 *       device is failed via md_error().
 */
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
			    int sectors, struct page *page, enum req_op op)
{
	if (rdev_has_badblock(rdev, sector, sectors) &&
	    (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
		return -1;

	if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
		/* success */
		return 1;

	if (op == REQ_OP_WRITE) {
		set_bit(WriteErrorSeen, &rdev->flags);
		if (!test_and_set_bit(WantReplacement, &rdev->flags))
			set_bit(MD_RECOVERY_NEEDED,
				&rdev->mddev->recovery);
	}

	/* need to record an error - either for the block or the device */
	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
		md_error(rdev->mddev, rdev);
	return 0;
}
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
*/
staticvoid fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{ int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; int d = r10_bio->devs[slot].devnum;
/* still own a reference to this rdev, so it cannot * have been cleared recently.
*/
rdev = conf->mirrors[d].rdev;
if (test_bit(Faulty, &rdev->flags)) /* drive has already been failed, just ignore any
more fix_read_error() attempts */ return;
if (exceed_read_errors(mddev, rdev)) {
r10_bio->devs[slot].bio = IO_BLOCKED; return;
}
while(sectors) { int s = sectors; int sl = slot; int success = 0; int start;
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;
do {
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev; if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
rdev_has_badblock(rdev,
r10_bio->devs[sl].addr + sect,
s) == 0) {
atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s<<9,
conf->tmppage,
REQ_OP_READ, false);
rdev_dec_pending(rdev, mddev); if (success) break;
}
sl++; if (sl == conf->copies)
sl = 0;
} while (sl != slot);
if (!success) { /* Cannot read from anywhere, just mark the block * as bad on the first device to discourage future * reads.
*/ int dn = r10_bio->devs[slot].devnum;
rdev = conf->mirrors[dn].rdev;
start = sl; /* write it back and re-read */ while (sl != slot) { if (sl==0)
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev; if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags)) continue;
atomic_inc(&rdev->nr_pending); if (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s, conf->tmppage, REQ_OP_WRITE)
== 0) { /* Well, this device is dead */
pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n",
mdname(mddev), s,
(unsignedlonglong)(
sect +
choose_data_offset(r10_bio,
rdev)),
rdev->bdev);
pr_notice("md/raid10:%s: %pg: failing drive\n",
mdname(mddev),
rdev->bdev);
}
rdev_dec_pending(rdev, mddev);
}
sl = start; while (sl != slot) { if (sl==0)
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev; if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags)) continue;
atomic_inc(&rdev->nr_pending); switch (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s, conf->tmppage, REQ_OP_READ)) { case 0: /* Well, this device is dead */
pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n",
mdname(mddev), s,
(unsignedlonglong)(
sect +
choose_data_offset(r10_bio, rdev)),
rdev->bdev);
pr_notice("md/raid10:%s: %pg: failing drive\n",
mdname(mddev),
rdev->bdev); break; case 1:
pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n",
mdname(mddev), s,
(unsignedlonglong)(
sect +
choose_data_offset(r10_bio, rdev)),
rdev->bdev);
atomic_add(s, &rdev->corrected_errors);
}
rdev_dec_pending(rdev, mddev);
}
sectors -= s;
sect += s;
}
}
/*
 * narrow_write_error() - header and declarations for narrowing a failed
 * write on slot @i down to individual bad blocks (see the comment after
 * the declarations).
 *
 * NOTE(review): the statements after the declarations do not match that
 * purpose.  They use 'slot', which is not declared here, never touch
 * block_sectors/sector/sectors/sect_to_write/ok, and this bool function
 * has no return statement.  They release a per-slot bio, clear
 * r10_bio->state and re-issue the request via raid10_read_request(), which
 * reads like the tail of a read-error handler spliced in - TODO confirm
 * against the authoritative version of this file.
 */
staticbool narrow_write_error(struct r10bio *r10_bio, int i)
{ struct bio *bio = r10_bio->master_bio; struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; /* bio has the data to be written to slot 'i' where * we just recently had a write error. * We repeatedly clone the bio and trim down to one block, * then try the write. Where the write fails we record * a bad block. * It is conceivable that the bio doesn't exactly align with * blocks. We must handle this. * * We currently own a reference to the rdev.
*/
int block_sectors;
sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; bool ok = true;
/* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. * We freeze all other IO, and try reading the block from * other devices. When we find one, we re-write * and check it that fixes the read error. * This is all done synchronously while the array is * frozen.
*/
/* Drop the bio attached to 'slot' and clear the slot before retrying. */
bio = r10_bio->devs[slot].bio;
bio_put(bio);
r10_bio->devs[slot].bio = NULL;
rdev_dec_pending(rdev, mddev);
/* Reset all state bits so the re-issued request starts clean. */
r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); /* * allow_barrier after re-submit to ensure no sync io * can be issued while regular io pending.
*/
allow_barrier(conf);
}
staticvoid handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
{ /* Some sort of write request has finished and it * succeeded in writing where we thought there was a * bad block. So forget the bad block. * Or possibly if failed and we need to record * a bad block.
*/ int m; struct md_rdev *rdev;
if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
test_bit(R10BIO_IsRecover, &r10_bio->state)) { for (m = 0; m < conf->copies; m++) { int dev = r10_bio->devs[m].devnum;
rdev = conf->mirrors[dev].rdev; if (r10_bio->devs[m].bio == NULL ||
r10_bio->devs[m].bio->bi_end_io == NULL) continue; if (!r10_bio->devs[m].bio->bi_status) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
} else { if (!rdev_set_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0))
md_error(conf->mddev, rdev);
}
rdev = conf->mirrors[dev].replacement; if (r10_bio->devs[m].repl_bio == NULL ||
r10_bio->devs[m].repl_bio->bi_end_io == NULL) continue;
if (!r10_bio->devs[m].repl_bio->bi_status) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
} else { if (!rdev_set_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0))
md_error(conf->mddev, rdev);
}
}
put_buf(r10_bio);
} else { bool fail = false; for (m = 0; m < conf->copies; m++) { int dev = r10_bio->devs[m].devnum; struct bio *bio = r10_bio->devs[m].bio;
rdev = conf->mirrors[dev].rdev; if (bio == IO_MADE_GOOD) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
} elseif (bio != NULL && bio->bi_status) {
fail = true; if (!narrow_write_error(r10_bio, m))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
bio = r10_bio->devs[m].repl_bio;
rdev = conf->mirrors[dev].replacement; if (rdev && bio == IO_MADE_GOOD) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
}
} if (fail) {
spin_lock_irq(&conf->device_lock);
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
conf->nr_queued++;
spin_unlock_irq(&conf->device_lock); /* * In case freeze_array() is waiting for condition * nr_pending == nr_queued + extra to be true.
*/
wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else { if (test_bit(R10BIO_WriteError,
&r10_bio->state))
close_write(r10_bio);
raid_end_bio_io(r10_bio);
}
}
}
for (i = 0; i < nalloc; i++) {
bio = r10bio->devs[i].bio;
rp = bio->bi_private;
bio_reset(bio, NULL, 0);
bio->bi_private = rp;
bio = r10bio->devs[i].repl_bio; if (bio) {
rp = bio->bi_private;
bio_reset(bio, NULL, 0);
bio->bi_private = rp;
}
} return r10bio;
}
/* * Set cluster_sync_high since we need other nodes to add the * range [cluster_sync_low, cluster_sync_high] to suspend list.
*/ staticvoid raid10_set_cluster_sync_high(struct r10conf *conf)
{
sector_t window_size; int extra_chunk, chunks;
/* * First, here we define "stripe" as a unit which across * all member devices one time, so we get chunks by use * raid_disks / near_copies. Otherwise, if near_copies is * close to raid_disks, then resync window could increases * linearly with the increase of raid_disks, which means * we will suspend a really large IO window while it is not * necessary. If raid_disks is not divisible by near_copies, * an extra chunk is needed to ensure the whole "stripe" is * covered.
*/
/* * At least use a 32M window to align with raid1's resync window
*/
window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
/* * perform a "sync" on one "block" * * We need to make sure that no normal I/O request - particularly write * requests - conflict with active sync requests. * * This is achieved by tracking pending requests and a 'barrier' concept * that can be installed to exclude normal IO requests. * * Resync and recovery are handled very differently. * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. * * For resync, we iterate over virtual addresses, read all copies, * and update if there are differences. If only one copy is live, * skip it. * For recovery, we iterate over physical addresses, read a good * value for each non-in_sync drive, and over-write. * * So, for recovery we may have several outstanding complex requests for a * given address, one for each out-of-sync device. We model this by allocating * a number of r10_bio structures, one for each out-of-sync device. * As we setup these structures, we collect all bio's together into a list * which we then process collectively to add pages, and then process again * to pass to submit_bio_noacct. * * The r10_bio structures are linked using a borrowed master_bio pointer. * This link is counted in ->remaining. When the r10_bio that points to NULL * has its remaining count decremented to 0, the whole complex operation * is complete. *
*/
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t max_sector, int *skipped)
{ struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *biolist = NULL, *bio;
sector_t nr_sectors; int i; int max_sync;
sector_t sync_blocks;
sector_t sectors_skipped = 0; int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; int error_disk = -1;
/* * Allow skipping a full rebuild for incremental assembly * of a clean array, like RAID1 does.
*/ if (mddev->bitmap == NULL &&
mddev->resync_offset == MaxSector &&
mddev->reshape_position == MaxSector &&
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
conf->fullsync == 0) {
*skipped = 1; return mddev->dev_sectors - sector_nr;
}
if (!mempool_initialized(&conf->r10buf_pool)) if (init_resync(conf)) return 0;
/* If we aborted, we need to abort the
 * sync on the 'current' bitmap chunks (there can
 * be several when recovering multiple devices),
 * as we may have started syncing it but not finished.
 * We can find the current address in
 * mddev->curr_resync, but for recovery,
 * we need to convert that to several
 * virtual addresses.
*/ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
close_sync(conf); return 0;
}
if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
mddev->bitmap_ops->end_sync(mddev,
mddev->curr_resync,
&sync_blocks); elsefor (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
mddev->bitmap_ops->end_sync(mddev, sect,
&sync_blocks);
}
} else { /* completed sync */ if ((!mddev->bitmap || conf->fullsync)
&& conf->have_replacement
&& test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* Completed a full sync so the replacements * are now fully recovered.
*/ for (i = 0; i < conf->geo.raid_disks; i++) { struct md_rdev *rdev =
conf->mirrors[i].replacement;
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) {
pr_err("md/raid10:%s: %s fails\n", mdname(mddev),
test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); if (error_disk >= 0 &&
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* * recovery fails, set mirrors.recovery_disabled, * device shouldn't be added to there.
*/
conf->mirrors[error_disk].recovery_disabled =
mddev->recovery_disabled; return 0;
} /* * if there has been nothing to do on any drive, * then there is nothing to do at all.
*/
*skipped = 1; return (max_sector - sector_nr) + sectors_skipped;
}
if (max_sector > mddev->resync_max)
max_sector = mddev->resync_max; /* Don't do IO beyond here */
/* make sure whole request will fit in a chunk - if chunks * are meaningful
*/ if (conf->geo.near_copies < conf->geo.raid_disks &&
max_sector > (sector_nr | chunk_mask))
max_sector = (sector_nr | chunk_mask) + 1;
/*
 * If there is non-resync activity waiting for a turn, then let it
 * through before starting on this new sync request.
*/ if (conf->nr_waiting)
schedule_timeout_uninterruptible(1);
/* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, * and bi_private set to the r10bio. * For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. * In this case, the subordinate r10bios link back through a * borrowed master_bio pointer, and the counter in the master * includes a ref from each subordinate.
*/ /* First, we decide what to do and set ->bi_end_io * To end_sync_read if we want to read, and * end_sync_write if we will want to write.
*/
max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery... the complicated one */ int j;
r10_bio = NULL;
for (i = 0 ; i < conf->geo.raid_disks; i++) { bool still_degraded; struct r10bio *rb2;
sector_t sect; bool must_sync; int any_working; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace;
if (mrdev && (test_bit(Faulty, &mrdev->flags) ||
test_bit(In_sync, &mrdev->flags)))
mrdev = NULL; if (mreplace && test_bit(Faulty, &mreplace->flags))
mreplace = NULL;
if (!mrdev && !mreplace) continue;
still_degraded = false; /* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i); if (sect >= mddev->resync_max_sectors) /* last stripe is not complete - don't * try to recover this sector.
*/ continue; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap
*/
must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
&sync_blocks, true); if (sync_blocks < max_sync)
max_sync = sync_blocks; if (!must_sync &&
mreplace == NULL &&
!conf->fullsync) { /* yep, skip the sync_blocks here, but don't assume * that there will never be anything to do here
*/
chunks_skipped = -1; continue;
} if (mrdev)
atomic_inc(&mrdev->nr_pending); if (mreplace)
atomic_inc(&mreplace->nr_pending);
any_working = 0; for (j=0; j<conf->copies;j++) { int k; int d = r10_bio->devs[j].devnum;
sector_t from_addr, to_addr; struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t sector, first_bad;
sector_t bad_sectors; if (!rdev ||
!test_bit(In_sync, &rdev->flags)) continue; /* This is where we read from */
any_working = 1;
sector = r10_bio->devs[j].addr;
if (is_badblock(rdev, sector, max_sync,
&first_bad, &bad_sectors)) { if (first_bad > sector)
max_sync = first_bad - sector; else {
bad_sectors -= (sector
- first_bad); if (max_sync > bad_sectors)
max_sync = bad_sectors; continue;
}
}
bio = r10_bio->devs[0].bio;
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
bio->bi_opf = REQ_OP_READ; if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
from_addr = r10_bio->devs[j].addr;
bio->bi_iter.bi_sector = from_addr +
rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
atomic_inc(&rdev->nr_pending); /* and we write to 'i' (if not in_sync) */
/* and maybe write to replacement */
bio = r10_bio->devs[1].repl_bio; if (bio)
bio->bi_end_io = NULL; /* Note: if replace is not NULL, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it.
*/ if (!mreplace) break;
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = to_addr +
mreplace->data_offset;
bio_set_dev(bio, mreplace->bdev);
atomic_inc(&r10_bio->remaining); break;
} if (j == conf->copies) { /* Cannot recover, so abort the recovery or
* record a bad block */ if (any_working) { /* problem is that there are bad blocks * on other device(s)
*/ int k; for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; if (mrdev && !test_bit(In_sync,
&mrdev->flags)
&& !rdev_set_badblocks(
mrdev,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0; if (mreplace &&
!rdev_set_badblocks(
mreplace,
r10_bio->devs[k].addr,
max_sync, 0))
any_working = 0;
} if (!any_working) { if (!test_and_set_bit(MD_RECOVERY_INTR,
&mddev->recovery))
pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
mdname(mddev));
mirror->recovery_disabled
= mddev->recovery_disabled;
} else {
error_disk = i;
}
put_buf(r10_bio); if (rb2)
atomic_dec(&rb2->remaining);
r10_bio = rb2; if (mrdev)
rdev_dec_pending(mrdev, mddev); if (mreplace)
rdev_dec_pending(mreplace, mddev); break;
} if (mrdev)
rdev_dec_pending(mrdev, mddev); if (mreplace)
rdev_dec_pending(mreplace, mddev); if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { /* Only want this if there is elsewhere to * read from. 'j' is currently the first * readable copy.
*/ int targets = 1; for (; j < conf->copies; j++) { int d = r10_bio->devs[j].devnum; if (conf->mirrors[d].rdev &&
test_bit(In_sync,
&conf->mirrors[d].rdev->flags))
targets++;
} if (targets == 1)
r10_bio->devs[0].bio->bi_opf
&= ~MD_FAILFAST;
}
} if (biolist == NULL) { while (r10_bio) { struct r10bio *rb2 = r10_bio;
r10_bio = (struct r10bio*) rb2->master_bio;
rb2->master_bio = NULL;
put_buf(rb2);
} goto giveup;
}
} else { /* resync. Schedule a read for every block at this virt offset */ int count = 0;
/* * Since curr_resync_completed could probably not update in * time, and we will set cluster_sync_low based on it. * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync.
*/
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
&sync_blocks,
mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) { /* We can skip this block */
*skipped = 1; return sync_blocks + sectors_skipped;
} if (sync_blocks < max_sync)
max_sync = sync_blocks;
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
if (count < 2) { for (i=0; i<conf->copies; i++) { int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio->bi_end_io)
rdev_dec_pending(conf->mirrors[d].rdev,
mddev); if (r10_bio->devs[i].repl_bio &&
r10_bio->devs[i].repl_bio->bi_end_io)
rdev_dec_pending(
conf->mirrors[d].replacement,
mddev);
}
put_buf(r10_bio);
biolist = NULL; goto giveup;
}
}
nr_sectors = 0; if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync; do { struct page *page; int len = PAGE_SIZE; if (sector_nr + (len>>9) > max_sector)
len = (max_sector - sector_nr) << 9; if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { struct resync_pages *rp = get_resync_pages(bio);
page = resync_fetch_page(rp, page_idx); if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio); goto giveup;
}
}
nr_sectors += len>>9;
sector_nr += len>>9;
} while (++page_idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* It is resync not recovery */ if (conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf); /* Send resync message */
mddev->cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
} elseif (mddev_is_clustered(mddev)) { /* This is recovery not resync */
sector_t sect_va1, sect_va2; bool broadcast_msg = false;
for (i = 0; i < conf->geo.raid_disks; i++) { /* * sector_nr is a device address for recovery, so we * need translate it to array address before compare * with cluster_sync_high.
*/
sect_va1 = raid10_find_virt(conf, sector_nr, i);
if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
broadcast_msg = true; /* * curr_resync_completed is similar as * sector_nr, so make the translation too.
*/
sect_va2 = raid10_find_virt(conf,
mddev->curr_resync_completed, i);
if (sectors_skipped) /* pretend they weren't skipped, it makes * no important difference in this case
*/
md_done_sync(mddev, sectors_skipped, 1);
return sectors_skipped + nr_sectors;
giveup: /* There is nowhere to write, so all non-sync * drives must be failed or in resync, all drives * have a bad block, so try the next chunk...
*/ if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync;
staticvoid calc_sectors(struct r10conf *conf, sector_t size)
{ /* Calculate the number of sectors-per-device that will * actually be used, and set conf->dev_sectors and * conf->stride
*/
size = size >> conf->geo.chunk_shift;
sector_div(size, conf->geo.far_copies);
size = size * conf->geo.raid_disks;
sector_div(size, conf->geo.near_copies); /* 'size' is now the number of chunks in the array */ /* calculate "used chunks per device" */
size = size * conf->copies;
/* We need to round up when dividing by raid_disks to * get the stride size.
*/
size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
if (mddev_is_clustered(conf->mddev)) { int fc, fo;
fc = (mddev->layout >> 8) & 255;
fo = mddev->layout & (1<<16); if (fc > 1 || fo > 0) {
pr_err("only near layout is supported by clustered" " raid10\n"); goto out_free_conf;
}
}
rdev_for_each(rdev, mddev) { longlong diff;
disk_idx = rdev->raid_disk; if (disk_idx < 0) continue; if (disk_idx >= conf->geo.raid_disks &&
disk_idx >= conf->prev.raid_disks) continue;
disk = conf->mirrors + disk_idx;
if (test_bit(Replacement, &rdev->flags)) { if (disk->replacement) goto out_free_conf;
disk->replacement = rdev;
} else { if (disk->rdev) goto out_free_conf;
disk->rdev = rdev;
}
diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards)
diff = -diff; if (diff < 0)
diff = 0; if (first || diff < min_offset_diff)
min_offset_diff = diff;
disk->head_position = 0;
first = 0;
}
if (!mddev_is_dm(conf->mddev)) { int err = raid10_set_queue_limits(mddev);
if (err) {
ret = err; goto out_free_conf;
}
}
/* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) {
pr_err("md/raid10:%s: not enough operational mirrors.\n",
mdname(mddev)); goto out_free_conf;
}
if (conf->reshape_progress != MaxSector) { /* must ensure that shape change is supported */ if (conf->geo.far_copies != 1 &&
conf->geo.far_offset == 0) goto out_free_conf; if (conf->prev.far_copies != 1 &&
conf->prev.far_offset == 0) goto out_free_conf;
}
mddev->degraded = 0; for (i = 0;
i < conf->geo.raid_disks
|| i < conf->prev.raid_disks;
i++) {
disk = conf->mirrors + i;
if (!disk->rdev && disk->replacement) { /* The replacement is all we have - use it */
disk->rdev = disk->replacement;
disk->replacement = NULL;
clear_bit(Replacement, &disk->rdev->flags);
}
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++; if (disk->rdev &&
disk->rdev->saved_raid_disk < 0)
conf->fullsync = 1;
}
if (mddev->resync_offset != MaxSector)
pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
pr_info("md/raid10:%s: active with %d out of %d devices\n",
mdname(mddev), conf->geo.raid_disks - mddev->degraded,
conf->geo.raid_disks); /* * Ok, everything is just fine now
*/
mddev->dev_sectors = conf->dev_sectors;
size = raid10_size(mddev, 0, 0);
md_set_array_sectors(mddev, size);
mddev->resync_max_sectors = size;
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
if (md_integrity_register(mddev)) goto out_free_conf;
if (conf->reshape_progress != MaxSector) { unsignedlong before_length, after_length;
if (max(before_length, after_length) > min_offset_diff) { /* This cannot work */
pr_warn("md/raid10: offset difference not enough to continue reshape\n"); goto out_free_conf;
}
conf->offset_diff = min_offset_diff;
if (quiesce)
raise_barrier(conf, 0); else
lower_barrier(conf);
}
staticint raid10_resize(struct mddev *mddev, sector_t sectors)
{ /* Resize of 'far' arrays is not supported. * For 'near' and 'offset' arrays we can set the * number of sectors used to be an appropriate multiple * of the chunk size. * For 'offset', this is far_copies*chunksize. * For 'near' the multiplier is the LCM of * near_copies and raid_disks. * So if far_copies > 1 && !far_offset, fail. * Else find LCM(raid_disks, near_copy)*far_copies and * multiply by chunk_size. Then round to this number. * This is mostly done by raid10_size()
*/ struct r10conf *conf = mddev->private;
sector_t oldsize, size; int ret;
if (mddev->reshape_position != MaxSector) return -EBUSY;
if (conf->geo.far_copies > 1 && !conf->geo.far_offset) return -EINVAL;
/* raid10 can take over: * raid0 - providing it has only two drives
*/ if (mddev->level == 0) { /* for raid0 takeover only one zone is supported */
raid0_conf = mddev->private; if (raid0_conf->nr_strip_zones > 1) {
pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
mdname(mddev)); return ERR_PTR(-EINVAL);
} return raid10_takeover_raid0(mddev,
raid0_conf->strip_zone->zone_end,
raid0_conf->strip_zone->nb_dev);
} return ERR_PTR(-EINVAL);
}
/*
 * raid10_check_reshape - validate a requested change of array geometry.
 *
 * Called when there is a request to change
 *  - layout (to ->new_layout)
 *  - chunk size (to ->new_chunk_sectors)
 *  - raid_disks (by delta_disks)
 * or when trying to restart a reshape that was ongoing.
 *
 * We need to validate the request and possibly allocate
 * space if that might be an issue later.
 *
 * Currently we reject any reshape of a 'far' mode array,
 * allow chunk size to change if new is generally acceptable,
 * allow raid_disks to increase, and allow
 * a switch between 'near' mode and 'offset' mode.
 *
 * Returns 0 when the request is acceptable, -EINVAL when any of the
 * checks below rejects it, and -ENOMEM when the enlarged 'mirrors'
 * list cannot be allocated.
 */
static int raid10_check_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	struct geom geo;

	/* The current geometry must not be 'far' (far copies without offset) */
	if (conf->geo.far_copies != 1 && !conf->geo.far_offset) {
		return -EINVAL;
	}

	/* mustn't change number of copies */
	if (setup_geo(&geo, mddev, geo_start) != conf->copies) {
		return -EINVAL;
	}

	/* Cannot switch to 'far' mode */
	if (geo.far_copies > 1 && !geo.far_offset) {
		return -EINVAL;
	}

	/* not factor of array size */
	if (mddev->array_sectors & geo.chunk_mask) {
		return -EINVAL;
	}

	if (!enough(conf, -1)) {
		return -EINVAL;
	}

	/* Drop any list left over from an earlier request before re-deciding */
	kfree(conf->mirrors_new);
	conf->mirrors_new = NULL;

	if (mddev->delta_disks > 0) {
		/* allocate new 'mirrors' list, sized for the grown array */
		conf->mirrors_new =
			kcalloc(mddev->raid_disks + mddev->delta_disks,
				sizeof(struct raid10_info),
				GFP_KERNEL);
		if (!conf->mirrors_new) {
			return -ENOMEM;
		}
	}

	return 0;
}
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 *
 * Returns the larger of the degraded counts computed for the 'prev'
 * geometry and for the current geometry; when both geometries have the
 * same number of devices only the 'prev' count is computed and returned.
 */
static int calc_degraded(struct r10conf *conf)
{
	int degraded, degraded2;
	int i;

	degraded = 0;
	/* 'prev' section first */
	for (i = 0; i < conf->prev.raid_disks; i++) {
		struct md_rdev *rdev = conf->mirrors[i].rdev;

		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			degraded++;
		} else if (!test_bit(In_sync, &rdev->flags)) {
			/* When we can reduce the number of devices in
			 * an array, this might not contribute to
			 * 'degraded'.  It does now.
			 */
			degraded++;
		}
	}

	/* Same device count before and after: one pass is enough */
	if (conf->geo.raid_disks == conf->prev.raid_disks) {
		return degraded;
	}

	degraded2 = 0;
	for (i = 0; i < conf->geo.raid_disks; i++) {
		struct md_rdev *rdev = conf->mirrors[i].rdev;

		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			degraded2++;
		} else if (!test_bit(In_sync, &rdev->flags)) {
			/* If reshape is increasing the number of devices,
			 * this section has already been recovered, so
			 * it doesn't contribute to degraded.
			 * else it does.
			 */
			if (conf->geo.raid_disks <= conf->prev.raid_disks) {
				degraded2++;
			}
		}
	}

	if (degraded2 > degraded) {
		return degraded2;
	}
	return degraded;
}
staticint raid10_start_reshape(struct mddev *mddev)
{ /* A 'reshape' has been requested. This commits * the various 'new' fields and sets MD_RECOVER_RESHAPE * This also checks if there are enough spares and adds them * to the array. * We currently require enough spares to make the final * array non-degraded. We also require that the difference * between old and new data_offset - on each device - is * enough that we never risk over-writing.
*/
unsignedlong before_length, after_length;
sector_t min_offset_diff = 0; int first = 1; struct geom new; struct r10conf *conf = mddev->private; struct md_rdev *rdev; int spares = 0; int ret;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY;
if (setup_geo(&new, mddev, geo_start) != conf->copies) return -EINVAL;
/* * some node is already performing reshape, and no need to * call bitmap_ops->resize again since it should be called when * receiving BITMAP_RESIZE msg
*/ if ((sb && (le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out;
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); if (ret) goto abort;
ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) {
mddev->bitmap_ops->resize(mddev, oldsize, 0, false); goto abort;
}
}
out: if (mddev->delta_disks > 0) {
rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) { if (raid10_add_disk(mddev, rdev) == 0) { if (rdev->raid_disk >=
conf->prev.raid_disks)
set_bit(In_sync, &rdev->flags); else
rdev->recovery_offset = 0;
/* Failure here is OK */
sysfs_link_rdev(mddev, rdev);
}
} elseif (rdev->raid_disk >= conf->prev.raid_disks
&& !test_bit(Faulty, &rdev->flags)) { /* This is a spare that was manually added */
set_bit(In_sync, &rdev->flags);
}
} /* When a reshape changes the number of devices, * ->degraded is measured against the larger of the * pre and post numbers.
*/
spin_lock_irq(&conf->device_lock);
mddev->degraded = calc_degraded(conf);
spin_unlock_irq(&conf->device_lock);
mddev->raid_disks = conf->geo.raid_disks;
mddev->reshape_position = conf->reshape_progress;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
/*
 * Given array-address 's', report the device-address just past the last
 * one that could hold any block of the chunk containing 's'.
 * The result is chunk-aligned and lies after all data belonging to
 * that chunk.
 */
static sector_t last_dev_address(sector_t s, struct geom *geo)
{
	sector_t chunks;

	/* Round up to the next chunk boundary and convert to a chunk count */
	chunks = ((s | geo->chunk_mask) + 1) >> geo->chunk_shift;

	/* Spread the near copies over the devices, rounding up */
	chunks *= geo->near_copies;
	chunks = DIV_ROUND_UP_SECTOR_T(chunks, geo->raid_disks);

	chunks *= geo->far_copies;

	/* Back from chunks to sectors */
	return chunks << geo->chunk_shift;
}
/*
 * Given array-address 's', report the first device-address that could
 * hold any block of the chunk containing 's'.
 * The result is always the start of a chunk.
 */
static sector_t first_dev_address(sector_t s, struct geom *geo)
{
	/* Index of the chunk containing 's' (rounds down) */
	sector_t chunks = s >> geo->chunk_shift;

	chunks *= geo->near_copies;
	/* Divides 'chunks' in place; the value it returns is not needed here */
	sector_div(chunks, geo->raid_disks);
	chunks *= geo->far_copies;

	/* Back from chunks to sectors */
	return chunks << geo->chunk_shift;
}
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
{ /* We simply copy at most one chunk (smallest of old and new) * at a time, possibly less if that exceeds RESYNC_PAGES, * or we hit a bad block or something. * This might mean we pause for normal IO in the middle of * a chunk, but that is not a problem as mddev->reshape_position * can record any location. * * If we will want to write to a location that isn't * yet recorded as 'safe' (i.e. in metadata on disk) then * we need to flush all reshape requests and update the metadata. * * When reshaping forwards (e.g. to more devices), we interpret * 'safe' as the earliest block which might not have been copied * down yet. We divide this by previous stripe size and multiply * by previous stripe length to get lowest device offset that we * cannot write to yet. * We interpret 'sector_nr' as an address that we want to write to. * From this we use last_device_address() to find where we might * write to, and first_device_address on the 'safe' position. * If this 'next' write position is after the 'safe' position, * we must update the metadata to increase the 'safe' position. * * When reshaping backwards, we round in the opposite direction * and perform the reverse test: next write position must not be * less than current safe position. * * In all this the minimum difference in data offsets * (conf->offset_diff - always positive) allows a bit of slack, * so next can be after 'safe', but not by more than offset_diff * * We need to prepare all the bios here before we start any IO * to ensure the size we choose is acceptable to all devices. * The means one for each copy for write-out and an extra one for * read-in. * We store the read-in bio in ->master_bio and the others in * ->devs[x].bio and ->devs[x].repl_bio.
*/ struct r10conf *conf = mddev->private; struct r10bio *r10_bio;
sector_t next, safe, last; int max_sectors; int nr_sectors; int s; struct md_rdev *rdev; int need_flush = 0; struct bio *blist; struct bio *bio, *read_bio; int sectors_done = 0; struct page **pages;
if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ if (mddev->reshape_backwards &&
conf->reshape_progress < raid10_size(mddev, 0, 0)) {
sector_nr = (raid10_size(mddev, 0, 0)
- conf->reshape_progress);
} elseif (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress; if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1; return sector_nr;
}
}
/* We don't use sector_nr to track where we are up to * as that doesn't work well for ->reshape_backwards. * So just use ->reshape_progress.
*/ if (mddev->reshape_backwards) { /* 'next' is the earliest device address that we might * write to for this chunk in the new layout
*/
next = first_dev_address(conf->reshape_progress - 1,
&conf->geo);
/* 'safe' is the last device address that we might read from * in the old layout after a restart
*/
safe = last_dev_address(conf->reshape_safe - 1,
&conf->prev);
if (next + conf->offset_diff < safe)
need_flush = 1;
last = conf->reshape_progress - 1;
sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
& conf->prev.chunk_mask); if (sector_nr + RESYNC_SECTORS < last)
sector_nr = last + 1 - RESYNC_SECTORS;
} else { /* 'next' is after the last device address that we * might write to for this chunk in the new layout
*/
next = last_dev_address(conf->reshape_progress, &conf->geo);
/* 'safe' is the earliest device address that we might * read from in the old layout after a restart
*/
safe = first_dev_address(conf->reshape_safe, &conf->prev);
/* Need to update metadata if 'next' might be beyond 'safe' * as that would possibly corrupt data
*/ if (next > safe + conf->offset_diff)
need_flush = 1;
sector_nr = conf->reshape_progress;
last = sector_nr | (conf->geo.chunk_mask
& conf->prev.chunk_mask);
if (sector_nr + RESYNC_SECTORS <= last)
last = sector_nr + RESYNC_SECTORS - 1;
}
if (need_flush ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Need to update reshape_position in metadata */
wait_barrier(conf, false);
mddev->reshape_position = conf->reshape_progress; if (mddev->reshape_backwards)
mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
- conf->reshape_progress; else
mddev->curr_resync_completed = conf->reshape_progress;
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
allow_barrier(conf); return sectors_done;
}
conf->reshape_safe = mddev->reshape_position;
allow_barrier(conf);
}
raise_barrier(conf, 0);
read_more: /* Now schedule reads for blocks from sector_nr to last */
r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
raise_barrier(conf, 1);
atomic_set(&r10_bio->remaining, 0);
r10_bio->mddev = mddev;
r10_bio->sector = sector_nr;
set_bit(R10BIO_IsReshape, &r10_bio->state);
r10_bio->sectors = last - sector_nr + 1;
rdev = read_balance(conf, r10_bio, &max_sectors);
BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
if (!rdev) { /* Cannot read from here, so need to record bad blocks * on all the target devices.
*/ // FIXME
mempool_free(r10_bio, &conf->r10buf_pool);
set_bit(MD_RECOVERY_INTR, &mddev->recovery); return sectors_done;
}
/* * Broadcast RESYNC message to other nodes, so all nodes would not * write to the region to avoid conflict.
*/ if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { struct mdp_superblock_1 *sb = NULL; int sb_reshape_pos = 0;
conf->cluster_sync_low = sector_nr;
conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
sb = page_address(rdev->sb_page); if (sb) {
sb_reshape_pos = le64_to_cpu(sb->reshape_position); /* * Set cluster_sync_low again if next address for array * reshape is less than cluster_sync_low. Since we can't * update cluster_sync_low until it has finished reshape.
*/ if (sb_reshape_pos < conf->cluster_sync_low)
conf->cluster_sync_low = sb_reshape_pos;
}
/* Now find the locations in the new layout */
__raid10_find_phys(&conf->geo, r10_bio);
blist = read_bio;
read_bio->bi_next = NULL;
for (s = 0; s < conf->copies*2; s++) { struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev2; if (s&1) {
rdev2 = conf->mirrors[d].replacement;
b = r10_bio->devs[s/2].repl_bio;
} else {
rdev2 = conf->mirrors[d].rdev;
b = r10_bio->devs[s/2].bio;
} if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue;
/* Now add as many pages as possible to all of these bios. */
nr_sectors = 0;
pages = get_resync_pages(r10_bio->devs[0].bio)->pages; for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { struct page *page = pages[s / (PAGE_SIZE >> 9)]; int len = (max_sectors - s) << 9; if (len > PAGE_SIZE)
len = PAGE_SIZE; for (bio = blist; bio ; bio = bio->bi_next) { if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio); return sectors_done;
}
}
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
r10_bio->sectors = nr_sectors;
/* Now submit the read */
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
sectors_done += nr_sectors; if (sector_nr <= last) goto read_more;
lower_barrier(conf);
/* Now that we have done the whole section we can * update reshape_progress
*/ if (mddev->reshape_backwards)
conf->reshape_progress -= sectors_done; else
conf->reshape_progress += sectors_done;
return sectors_done;
}
staticvoid end_reshape_request(struct r10bio *r10_bio); staticint handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10_bio); staticvoid reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{ /* Reshape read completed. Hopefully we have a block * to write out. * If we got a read error then we do sync 1-page reads from * elsewhere until we find the data - or give up.
*/ struct r10conf *conf = mddev->private; int s;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) if (handle_reshape_read_error(mddev, r10_bio) < 0) { /* Reshape has been aborted */
md_done_sync(mddev, r10_bio->sectors, 0); return;
}
/* We definitely have the data in the pages, schedule the * writes.
*/
atomic_set(&r10_bio->remaining, 1); for (s = 0; s < conf->copies*2; s++) { struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev; if (s&1) {
rdev = conf->mirrors[d].replacement;
b = r10_bio->devs[s/2].repl_bio;
} else {
rdev = conf->mirrors[d].rdev;
b = r10_bio->devs[s/2].bio;
} if (!rdev || test_bit(Faulty, &rdev->flags)) continue;
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.77Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-25)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.