// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 */
/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 *    miss any bits.
 */
staticbool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely, "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); staticstruct workqueue_struct *raid5_wq;
staticvoid raid5_quiesce(struct mddev *mddev, int quiesce);
/*
 * Take every stripe hash lock and then the device lock, in a fixed order.
 * Hash lock 0 also disables interrupts; the remaining hash locks nest
 * under it, and device_lock is always the innermost lock.
 */
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
	__acquires(&conf->device_lock)
{
	int idx;

	spin_lock_irq(conf->hash_locks);
	for (idx = 1; idx < NR_STRIPE_HASH_LOCKS; idx++)
		spin_lock_nest_lock(conf->hash_locks + idx, conf->hash_locks);
	spin_lock(&conf->device_lock);
}
/*
 * Release the locks taken by lock_all_device_hash_locks_irq() in exact
 * reverse order: device lock first, then hash locks from highest index
 * down to lock 0 (which re-enables interrupts).
 */
static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
	__releases(&conf->device_lock)
{
	int idx;

	spin_unlock(&conf->device_lock);
	for (idx = NR_STRIPE_HASH_LOCKS - 1; idx; idx--)
		spin_unlock(conf->hash_locks + idx);
	spin_unlock_irq(conf->hash_locks);
}
/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	/* ddf layouts always start from the first device */
	if (sh->ddf_layout)
		return 0;
	/* md layouts start just after the Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	return sh->qd_idx + 1;
}

/* Advance to the next disk index, wrapping back to 0 past the last disk. */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	int nxt = disk + 1;

	return (nxt < raid_disks) ? nxt : 0;
}
/*
 * When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1. This helper does that mapping.
 *
 * @count is the running data-slot counter shared across the walk; it is
 * advanced for data disks (before the parity checks for ddf layouts,
 * after them for md layouts, matching each layout's slot numbering).
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
/*
 * Queue @sh on its CPU's worker group and kick enough workers to handle
 * the backlog. Falls back to the mddev thread when no worker groups are
 * configured. Caller holds conf->device_lock.
 *
 * Fix: the original declared a second, shadowing 'group' inside the
 * list_empty() branch (-Wshadow); the outer variable is reassigned before
 * every later use, so the shadow is removed with no behavior change.
 */
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
	__must_hold(&sh->raid_conf->device_lock)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	/* Migrate the stripe to a live CPU if its recorded CPU went away. */
	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}
/*
 * Drop a stripe's last reference: either re-queue it for further handling
 * (delayed / bitmap / handle / worker lists) or retire it to an inactive
 * or r5c cached list. Caller holds conf->device_lock.
 */
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists. Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested for the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
							&conf->loprio_list);
				else
					list_add_tail(&sh->lru,
							&conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}
/* * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list * * Be careful: Only one task can add/delete stripes from temp_inactive_list at * given time. Adding stripes only takes device lock, while deleting stripes * only takes hash lock.
*/ staticvoid release_inactive_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list, int hash)
{ int size; bool do_wakeup = false; unsignedlong flags;
/* * We don't hold any lock here yet, raid5_get_active_stripe() might * remove stripes from the list
*/ if (!list_empty_careful(list)) {
spin_lock_irqsave(conf->hash_locks + hash, flags); if (list_empty(conf->inactive_list + hash) &&
!list_empty(list))
atomic_dec(&conf->empty_inactive_list_nr);
list_splice_tail_init(list, conf->inactive_list + hash);
do_wakeup = true;
spin_unlock_irqrestore(conf->hash_locks + hash, flags);
}
size--;
hash--;
}
if (do_wakeup) {
wake_up(&conf->wait_for_stripe); if (atomic_read(&conf->active_stripes) == 0)
wake_up(&conf->wait_for_quiescent); if (conf->retry_read_aligned)
md_wakeup_thread(conf->mddev->thread);
}
}
head = llist_del_all(&conf->released_stripes);
head = llist_reverse_order(head);
llist_for_each_entry_safe(sh, t, head, release_list) { int hash;
/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
smp_mb();
clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); /* * Don't worry the bit is set here, because if the bit is set * again, the count is always > 1. This is true for * STRIPE_ON_UNPLUG_LIST bit too.
*/
hash = sh->hash_lock_index;
__release_stripe(conf, sh, &temp_inactive_list[hash]);
count++;
}
/* Avoid release_list until the last reference.
*/ if (atomic_add_unless(&sh->count, -1, 1)) return;
if (unlikely(!conf->mddev->thread) ||
test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path;
wakeup = llist_add(&sh->release_list, &conf->released_stripes); if (wakeup)
md_wakeup_thread(conf->mddev->thread); return;
slow_path: /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
INIT_LIST_HEAD(&list);
hash = sh->hash_lock_index;
do_release_stripe(conf, sh, &list);
spin_unlock_irqrestore(&conf->device_lock, flags);
release_inactive_stripe_list(conf, &list, hash);
}
}
/* find an idle stripe, make sure it is unhashed, and return it. */ staticstruct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{ struct stripe_head *sh = NULL; struct list_head *first;
if (list_empty(conf->inactive_list + hash)) goto out;
first = (conf->inactive_list + hash)->next;
sh = list_entry(first, struct stripe_head, lru);
list_del_init(first);
remove_hash(sh);
atomic_inc(&conf->active_stripes);
BUG_ON(hash != sh->hash_lock_index); if (list_empty(conf->inactive_list + hash))
atomic_inc(&conf->empty_inactive_list_nr);
out: return sh;
}
pr_debug("__find_stripe, sector %llu\n", (unsignedlonglong)sector);
hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) if (sh->sector == sector && sh->generation == generation) return sh;
pr_debug("__stripe %llu not in cache\n", (unsignedlonglong)sector); return NULL;
}
staticstruct stripe_head *find_get_stripe(struct r5conf *conf,
sector_t sector, short generation, int hash)
{ int inc_empty_inactive_list_flag; struct stripe_head *sh;
sh = __find_stripe(conf, sector, generation); if (!sh) return NULL;
if (atomic_inc_not_zero(&sh->count)) return sh;
/* * Slow path. The reference count is zero which means the stripe must * be on a list (sh->lru). Must remove the stripe from the list that * references it with the device_lock held.
*/
spin_lock(&conf->device_lock); if (!atomic_read(&sh->count)) { if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
BUG_ON(list_empty(&sh->lru) &&
!test_bit(STRIPE_EXPANDING, &sh->state));
inc_empty_inactive_list_flag = 0; if (!list_empty(conf->inactive_list + hash))
inc_empty_inactive_list_flag = 1;
list_del_init(&sh->lru); if (list_empty(conf->inactive_list + hash) &&
inc_empty_inactive_list_flag)
atomic_inc(&conf->empty_inactive_list_nr); if (sh->group) {
sh->group->stripes_cnt--;
sh->group = NULL;
}
}
atomic_inc(&sh->count);
spin_unlock(&conf->device_lock);
return sh;
}
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 *
 * Most calls to this function hold &conf->device_lock. Calls
 * in raid5_run() do not require the lock as no other threads
 * have been started yet.
 */
int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	/* First pass: the 'previous' (pre-reshape) geometry. */
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = READ_ONCE(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;

	/* Second pass: the 'new' (post-reshape) geometry. */
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = READ_ONCE(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}

	/* Report the worse of the two sections. */
	return degraded2 > degraded ? degraded2 : degraded;
}
staticbool has_failed(struct r5conf *conf)
{ int degraded = conf->mddev->degraded;
if (test_bit(MD_BROKEN, &conf->mddev->flags)) returntrue;
if (conf->mddev->reshape_position != MaxSector)
degraded = raid5_calc_degraded(conf);
/*
 * Per-request bookkeeping while a bio is being split across stripe_heads.
 */
struct stripe_request_ctx {
	/* a reference to the last stripe_head for batching */
	struct stripe_head *batch_last;

	/* first sector in the request */
	sector_t first_sector;

	/* last sector in the request */
	sector_t last_sector;

	/*
	 * bitmap to track stripe sectors that have been added to stripes
	 * add one to account for unaligned requests
	 */
	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);

	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
	bool do_flush;
};
/* * Block until another thread clears R5_INACTIVE_BLOCKED or * there are fewer than 3/4 the maximum number of active stripes * and there is an inactive stripe available.
*/ staticbool is_inactive_blocked(struct r5conf *conf, int hash)
{ if (list_empty(conf->inactive_list + hash)) returnfalse;
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) returntrue;
for (;;) { if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { /* * Must release the reference to batch_last before * waiting, on quiesce, otherwise the batch_last will * hold a reference to a stripe and raid5_quiesce() * will deadlock waiting for active_stripes to go to * zero.
*/ if (ctx && ctx->batch_last) {
raid5_release_stripe(ctx->batch_last);
ctx->batch_last = NULL;
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */ staticbool stripe_can_batch(struct stripe_head *sh)
{ struct r5conf *conf = sh->raid_conf;
/*
 * Try to link @sh onto the batch list of the stripe one chunk-stripe
 * before it (we only do back search). Batched stripes are full-stripe
 * writes handled together through the head stripe.
 *
 * Fix: the extracted source lost (a) the compatibility checks between
 * lock_two_stripes() and the head->batch_head branch — sh already
 * batched, and the declared-but-unused 'dd_idx' write-op comparison —
 * and (b) the 'unlock_out:'/'out:' tail that the existing gotos target,
 * plus the final reference adjustments. Restored both.
 */
static void stripe_add_to_batch_list(struct r5conf *conf,
		struct stripe_head *sh, struct stripe_head *last_sh)
{
	struct stripe_head *head;
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);

	if (last_sh && head_sector == last_sh->sector) {
		head = last_sh;
		atomic_inc(&head->count);
	} else {
		hash = stripe_hash_locks_hash(conf, head_sector);
		spin_lock_irq(conf->hash_locks + hash);
		head = find_get_stripe(conf, head_sector, conf->generation,
				       hash);
		spin_unlock_irq(conf->hash_locks + hash);
		if (!head)
			return;
		if (!stripe_can_batch(head))
			goto out;
	}

	lock_two_stripes(head, sh);
	/* clear_batch_ready clear the flag */
	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
		goto unlock_out;

	if (sh->batch_head)
		goto unlock_out;

	dd_idx = 0;
	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
		dd_idx++;
	/* Only batch writes with identical flags and operation. */
	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
		goto unlock_out;

	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}
		/*
		 * We must assign batch_head of this stripe within the
		 * batch_lock, otherwise clear_batch_ready of batch head
		 * stripe could clear BATCH_READY bit of this stripe and
		 * this stripe->batch_head doesn't get assigned, which
		 * could confuse clear_batch_ready for this stripe
		 */
		sh->batch_head = head->batch_head;

		/*
		 * at this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to batch list
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);
	} else {
		head->batch_head = head;
		sh->batch_head = head->batch_head;
		spin_lock(&head->batch_lock);
		list_add_tail(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_lock);
	}

	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		if (atomic_dec_return(&conf->preread_active_stripes)
		    < IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);

	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;

		/* propagate the latest bitmap batch number to the head */
		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}

	atomic_inc(&sh->count);
unlock_out:
	unlock_two_stripes(head, sh);
out:
	raid5_release_stripe(head);
}
/*
 * Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head: returns 1 (use new_data_offset) only when a
 * reshape is in progress and this stripe belongs to the new generation.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;

	/*
	 * Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;	/* no reshape in progress */
	if (sh->generation == conf->generation - 1)
		return 0;	/* old-generation stripe */
	/*
	 * We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}
/* Submit every bio queued on @tmp, draining the list. */
static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	for (bio = bio_list_pop(tmp); bio; bio = bio_list_pop(tmp))
		submit_bio_noacct(bio);
}
/* temporarily move the head */ if (conf->next_pending_data)
list_move_tail(&conf->pending_list,
&conf->next_pending_data->sibling);
while (!list_empty(&conf->pending_list)) {
data = list_first_entry(&conf->pending_list, struct r5pending_data, sibling); if (&data->sibling == first)
first = data->sibling.next;
next = data->sibling.next;
for (i = disks; i--; ) { enum req_op op;
blk_opf_t op_flags = 0; int replace_only = 0; struct bio *bi, *rbi; struct md_rdev *rdev, *rrdev = NULL;
sh = head_sh; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
op = REQ_OP_WRITE; if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
op_flags = REQ_FUA; if (test_bit(R5_Discard, &sh->dev[i].flags))
op = REQ_OP_DISCARD;
} elseif (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
op = REQ_OP_READ; elseif (test_and_clear_bit(R5_WantReplace,
&sh->dev[i].flags)) {
op = REQ_OP_WRITE;
replace_only = 1;
} else continue; if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
op_flags |= REQ_SYNC;
again:
dev = &sh->dev[i];
bi = &dev->req;
rbi = &dev->rreq; /* For writing to replacement */
rdev = conf->disks[i].rdev;
rrdev = conf->disks[i].replacement; if (op_is_write(op)) { if (replace_only)
rdev = NULL; if (rdev == rrdev) /* We raced and saw duplicates */
rrdev = NULL;
} else { if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
rdev = rrdev;
rrdev = NULL;
}
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL; if (rdev)
atomic_inc(&rdev->nr_pending); if (rrdev && test_bit(Faulty, &rrdev->flags))
rrdev = NULL; if (rrdev)
atomic_inc(&rrdev->nr_pending);
/* We have already checked bad blocks for reads. Now * need to check for writes. We never accept write errors * on the replacement, so we don't to check rrdev.
*/ while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) { int bad = rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf)); if (!bad) break;
if (bad < 0) {
set_bit(BlockedBadBlocks, &rdev->flags); if (!conf->mddev->external &&
conf->mddev->sb_flags) { /* It is very unlikely, but we might * still need to write out the * bad block log - better give it
* a chance*/
md_check_recovery(conf->mddev);
} /* * Because md_wait_for_blocked_rdev * will dec nr_pending, we must * increment it first.
*/
atomic_inc(&rdev->nr_pending);
md_wait_for_blocked_rdev(rdev, conf->mddev);
} else { /* Acknowledged bad block - skip the write */
rdev_dec_pending(rdev, conf->mddev);
rdev = NULL;
}
}
if (rdev) {
set_bit(STRIPE_IO_STARTED, &sh->state);
pr_debug("%s: for %llu schedule op %d on disc %d\n",
__func__, (unsignedlonglong)sh->sector,
bi->bi_opf, i);
atomic_inc(&sh->count); if (sh != head_sh)
atomic_inc(&head_sh->count); if (use_new_offset(conf, sh))
bi->bi_iter.bi_sector = (sh->sector
+ rdev->new_data_offset); else
bi->bi_iter.bi_sector = (sh->sector
+ rdev->data_offset); if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
bi->bi_opf |= REQ_NOMERGE;
if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
if (!op_is_write(op) &&
test_bit(R5_InJournal, &sh->dev[i].flags)) /* * issuing read for a page in journal, this * must be preparing for prexor in rmw; read * the data into orig_page
*/
sh->dev[i].vec.bv_page = sh->dev[i].orig_page; else
sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload
*/ if (op == REQ_OP_DISCARD)
bi->bi_vcnt = 0; if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector); if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi); else
submit_bio_noacct(bi);
} if (rrdev) {
set_bit(STRIPE_IO_STARTED, &sh->state);
/* clear completed biofills */ for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i];
/* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until * !STRIPE_BIOFILL_RUN
*/ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2;
/* return a pointer to the address conversion region of the scribble buffer */ staticstruct page **to_addr_page(struct raid5_percpu *percpu, int i)
{ return percpu->scribble + i * percpu->scribble_obj_size;
}
/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{
	/* the conversion area follows the disks+2 page pointers */
	return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
}
/* * Return a pointer to record offset address.
*/ staticunsignedint *
to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
{ return (unsignedint *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
}
/* set_syndrome_sources - populate source buffers for gen_syndrome * @srcs - (struct page *) array of size sh->disks * @offs - (unsigned int) array of offset for each page * @sh - stripe_head to parse * * Populates srcs in proper layout order for the stripe and returns the * 'count' of sources to be used in a call to async_gen_syndrome. The P * destination buffer is recorded in srcs[count] and the Q destination * is recorded in srcs[count+1]].
*/ staticint set_syndrome_sources(struct page **srcs, unsignedint *offs, struct stripe_head *sh, int srctype)
{ int disks = sh->disks; int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); int d0_idx = raid6_d0(sh); int count; int i;
for (i = 0; i < disks; i++)
srcs[i] = NULL;
count = 0;
i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); struct r5dev *dev = &sh->dev[i];
if (i == sh->qd_idx || i == sh->pd_idx ||
(srctype == SYNDROME_SRC_ALL) ||
(srctype == SYNDROME_SRC_WANT_DRAIN &&
(test_bit(R5_Wantdrain, &dev->flags) ||
test_bit(R5_InJournal, &dev->flags))) ||
(srctype == SYNDROME_SRC_WRITTEN &&
(dev->written ||
test_bit(R5_InJournal, &dev->flags)))) { if (test_bit(R5_InJournal, &dev->flags))
srcs[slot] = sh->dev[i].orig_page; else
srcs[slot] = sh->dev[i].page; /* * For R5_InJournal, PAGE_SIZE must be 4KB and will * not shared page. In that case, dev[i].offset * is 0.
*/
offs[slot] = sh->dev[i].offset;
}
i = raid6_next_disk(i, disks);
} while (i != d0_idx);
return syndrome_disks;
}
staticstruct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{ int disks = sh->disks; struct page **blocks = to_addr_page(percpu, 0); unsignedint *offs = to_addr_offs(sh, percpu); int target; int qd_idx = sh->qd_idx; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; struct r5dev *tgt; struct page *dest; unsignedint dest_off; int i; int count;
BUG_ON(sh->batch_head); if (sh->ops.target < 0)
target = sh->ops.target2; elseif (sh->ops.target2 < 0)
target = sh->ops.target; else /* we should only have one valid target */
BUG();
BUG_ON(target < 0);
pr_debug("%s: stripe %llu block: %d\n",
__func__, (unsignedlonglong)sh->sector, target);
/* we need to open-code set_syndrome_sources to handle the * slot number conversion for 'faila' and 'failb'
*/ for (i = 0; i < disks ; i++) {
offs[i] = 0;
blocks[i] = NULL;
}
count = 0;
i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
if (r5c_is_writeback(sh->raid_conf->log)) /* * raid5-cache write back uses orig_page during prexor. * After prexor, it is time to free orig_page
*/
r5c_release_extra_page(sh);
}
for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ if (test_bit(R5_InJournal, &dev->flags)) { /* * For this case, PAGE_SIZE must be equal to 4KB and * page offset is zero.
*/
off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->orig_page;
} elseif (test_bit(R5_Wantdrain, &dev->flags)) {
off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->page;
}
}
for (i = disks; i--; ) { struct r5dev *dev; struct bio *chosen;
sh = head_sh; if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { struct bio *wbi;
again:
dev = &sh->dev[i]; /* * clear R5_InJournal, so when rewriting a page in * journal, it is not skipped by r5l_log_stripe()
*/
clear_bit(R5_InJournal, &dev->flags);
spin_lock_irq(&sh->stripe_lock);
chosen = dev->towrite;
dev->towrite = NULL;
sh->overwrite_disks = 0;
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
WARN_ON(dev->page != dev->orig_page);
for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
discard |= test_bit(R5_Discard, &sh->dev[i].flags);
}
for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) { if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
set_bit(R5_UPTODATE, &dev->flags); if (test_bit(STRIPE_EXPAND_READY, &sh->state))
set_bit(R5_Expanded, &dev->flags);
} if (fua)
set_bit(R5_WantFUA, &dev->flags); if (sync)
set_bit(R5_SyncIO, &dev->flags);
}
}
for (i = 0; i < sh->disks; i++) { if (pd_idx == i) continue; if (!test_bit(R5_Discard, &sh->dev[i].flags)) break;
} if (i >= sh->disks) {
atomic_inc(&sh->count);
set_bit(R5_Discard, &sh->dev[pd_idx].flags);
ops_complete_reconstruct(sh); return;
}
again:
count = 0;
xor_srcs = to_addr_page(percpu, j);
off_srcs = to_addr_offs(sh, percpu); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written)
*/ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
prexor = 1;
off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (head_sh->dev[i].written ||
test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->page;
}
}
} else {
xor_dest = sh->dev[pd_idx].page;
off_dest = sh->dev[pd_idx].offset; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i != pd_idx) {
off_srcs[count] = dev->offset;
xor_srcs[count++] = dev->page;
}
}
}
/* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case
*/
last_stripe = !head_sh->batch_head ||
list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; if (last_stripe) {
flags = ASYNC_TX_ACK |
(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
for (i = 0; i < sh->disks; i++) { if (sh->pd_idx == i || sh->qd_idx == i) continue; if (!test_bit(R5_Discard, &sh->dev[i].flags)) break;
} if (i >= sh->disks) {
atomic_inc(&sh->count);
set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
ops_complete_reconstruct(sh); return;
}
if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { if (level < 6)
tx = ops_run_compute5(sh, percpu); else { if (sh->ops.target2 < 0 || sh->ops.target < 0)
tx = ops_run_compute6_1(sh, percpu); else
tx = ops_run_compute6_2(sh, percpu);
} /* terminate the chain if reconstruct is not set to be run */ if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
async_tx_ack(tx);
}
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { if (level < 6)
tx = ops_run_prexor5(sh, percpu, tx); else
tx = ops_run_prexor6(sh, percpu, tx);
}
if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
tx = ops_run_partial_parity(sh, percpu, tx);
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
}
if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { if (level < 6)
ops_run_reconstruct5(sh, percpu, tx); else
ops_run_reconstruct6(sh, percpu, tx);
}
/*
 * NOTE(review): the following German disclaimer text is extraction residue
 * from a web page, not kernel source, and should be removed. Translation:
 * "The information on this website was compiled carefully to the best of
 * our knowledge. However, neither completeness, correctness, nor quality
 * of the provided information is guaranteed. Note: the colored syntax
 * display and the measurement are still experimental."
 */