// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
while ((bio = bio_list_pop(bio_list)))
queue_bio(ms, bio, WRITE);
}
/*
 * Per-bio private data: remembers which mirror a bio was issued to and
 * enough bio details to retry a failed read on an alternative mirror
 * at end_io time.
 */
struct dm_raid1_bio_record {
	struct mirror *m;		/* mirror the bio was mapped to */
	/* if details->bi_bdev == NULL, details were not saved */
	struct dm_bio_details details;
	region_t write_region;		/* region a write targets (writes only) */
};
/*
 * Every mirror should look like this one.
 */
#define DEFAULT_MIRROR 0
/* * This is yucky. We squirrel the mirror struct away inside * bi_next for read/write buffers. This is safe since the bh * doesn't get submitted to the lower levels of block layer.
*/ staticstruct mirror *bio_get_m(struct bio *bio)
{ return (struct mirror *) bio->bi_next;
}
/*
 * Stash the mirror pointer in bio->bi_next so bio_get_m() can retrieve
 * it from the completion callback (see the comment above bio_get_m()).
 */
static void bio_set_m(struct bio *bio, struct mirror *m)
{
	bio->bi_next = (struct bio *) m;
}
for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) if (!atomic_read(&m->error_count)) return m;
return NULL;
}
/* fail_mirror * @m: mirror device to fail * @error_type: one of the enum's, DM_RAID1_*_ERROR * * If errors are being handled, record the type of * error encountered for this device. If this type * of error has already been recorded, we can return; * otherwise, we must signal userspace by triggering * an event. Additionally, if the device is the * primary device, we must choose a new primary, but * only if the mirror is in-sync. * * This function must not block.
*/ staticvoid fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
{ struct mirror_set *ms = m->ms; struct mirror *new;
ms->leg_failure = 1;
/* * error_count is used for nothing more than a * simple way to tell if a device has encountered * errors.
*/
atomic_inc(&m->error_count);
if (test_and_set_bit(error_type, &m->error_type)) return;
if (!errors_handled(ms)) return;
if (m != get_default_mirror(ms)) goto out;
if (!ms->in_sync && !keep_log(ms)) { /* * Better to issue requests to same failing device * than to risk returning corrupt data.
*/
DMERR("Primary mirror (%s) failed while out-of-sync: Reads may fail.",
m->dev->name); goto out;
}
new = get_valid_mirror(ms); if (new)
set_default_mirror(new); else
DMWARN("All sides of mirror have failed.");
for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
io[i].bdev = m->dev->bdev;
io[i].sector = 0;
io[i].count = 0;
}
error_bits = -1;
dm_io(&io_req, ms->nr_mirrors, io, &error_bits, IOPRIO_DEFAULT); if (unlikely(error_bits != 0)) { for (i = 0; i < ms->nr_mirrors; i++) if (test_bit(i, &error_bits))
fail_mirror(ms->mirror + i,
DM_RAID1_FLUSH_ERROR); return -EIO;
}
return 0;
}
/* *--------------------------------------------------------------- * Recovery. * * When a mirror is first activated we may find that some regions * are in the no-sync state. We have to recover these by * recopying from the default mirror to all the others. *---------------------------------------------------------------
*/ staticvoid recovery_complete(int read_err, unsignedlong write_err, void *context)
{ struct dm_region *reg = context; struct mirror_set *ms = dm_rh_region_context(reg); int m, bit = 0;
if (read_err) { /* Read error means the failure of default mirror. */
DMERR_LIMIT("Unable to read primary mirror during recovery");
fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
}
if (write_err) {
DMERR_LIMIT("Write error during recovery (error = 0x%lx)",
write_err); /* * Bits correspond to devices (excluding default mirror). * The default mirror cannot change during recovery.
*/ for (m = 0; m < ms->nr_mirrors; m++) { if (&ms->mirror[m] == get_default_mirror(ms)) continue; if (test_bit(bit, &write_err))
fail_mirror(ms->mirror + m,
DM_RAID1_SYNC_ERROR);
bit++;
}
}
/* fill in the source */
m = get_default_mirror(ms);
from.bdev = m->dev->bdev;
from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); if (key == (ms->nr_regions - 1)) { /* * The final region may be smaller than * region_size.
*/
from.count = ms->ti->len & (region_size - 1); if (!from.count)
from.count = region_size;
} else
from.count = region_size;
/* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { if (&ms->mirror[i] == get_default_mirror(ms)) continue;
/*
 * Park @bio on ms->holds until the suspend completes; if the mirror set
 * is already suspended, complete the bio immediately instead (requeue
 * for a 'noflush' suspend, otherwise error it).
 */
static void hold_bio(struct mirror_set *ms, struct bio *bio)
{
	/*
	 * Lock is required to avoid race condition during suspend
	 * process.
	 */
	spin_lock_irq(&ms->lock);

	if (atomic_read(&ms->suspend)) {
		spin_unlock_irq(&ms->lock);

		/*
		 * If device is suspended, complete the bio.
		 */
		if (dm_noflush_suspending(ms->ti))
			bio->bi_status = BLK_STS_DM_REQUEUE;
		else
			bio->bi_status = BLK_STS_IOERR;

		bio_endio(bio);
		return;
	}

	/*
	 * Hold bio until the suspend is complete.
	 */
	bio_list_add(&ms->holds, bio);
	spin_unlock_irq(&ms->lock);
}
/*
 * A region may be read-balanced only when it is fully synchronised
 * across all mirrors, i.e. its region-hash state is CLEAN or DIRTY.
 *
 * @may_block is forwarded unchanged to dm_rh_get_state().
 */
static inline int region_in_sync(struct mirror_set *ms, region_t region,
				 int may_block)
{
	int state = dm_rh_get_state(ms->rh, region, may_block);

	return state == DM_RH_CLEAN || state == DM_RH_DIRTY;
}
while ((bio = bio_list_pop(reads))) {
region = dm_rh_bio_to_region(ms->rh, bio);
m = get_default_mirror(ms);
/* * We can only read balance if the region is in sync.
*/ if (likely(region_in_sync(ms, region, 1)))
m = choose_mirror(ms, bio->bi_iter.bi_sector); elseif (m && atomic_read(&m->error_count))
m = NULL;
if (likely(m))
read_async_bio(m, bio); else
bio_io_error(bio);
}
}
/* *--------------------------------------------------------------------- * Writes. * * We do different things with the write io depending on the * state of the region that it's in: * * SYNC: increment pending, use kcopyd to write to *all* mirrors * RECOVERING: delay the io until recovery completes * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------------
*/ staticvoid write_callback(unsignedlong error, void *context)
{ unsignedint i; struct bio *bio = context; struct mirror_set *ms; int should_wake = 0; unsignedlong flags;
ms = bio_get_m(bio)->ms;
bio_set_m(bio, NULL);
/* * NOTE: We don't decrement the pending count here, * instead it is done by the targets endio function. * This way we handle both writes to SYNC and NOSYNC * regions with the same code.
*/ if (likely(!error)) {
bio_endio(bio); return;
}
/* * If the bio is discard, return an error, but do not * degrade the array.
*/ if (bio_op(bio) == REQ_OP_DISCARD) {
bio->bi_status = BLK_STS_NOTSUPP;
bio_endio(bio); return;
}
for (i = 0; i < ms->nr_mirrors; i++) if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
/* * Need to raise event. Since raising * events can block, we need to do it in * the main thread.
*/
spin_lock_irqsave(&ms->lock, flags); if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio); if (should_wake)
wakeup_mirrord(ms);
spin_unlock_irqrestore(&ms->lock, flags);
}
for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
map_region(dest++, m, bio);
/* * Use default mirror because we only need it to retrieve the reference * to the mirror set in write_callback().
*/
bio_set_m(bio, get_default_mirror(ms));
/* * Classify each write.
*/
bio_list_init(&sync);
bio_list_init(&nosync);
bio_list_init(&recover);
bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) { if ((bio->bi_opf & REQ_PREFLUSH) ||
(bio_op(bio) == REQ_OP_DISCARD)) {
bio_list_add(&sync, bio); continue;
}
region = dm_rh_bio_to_region(ms->rh, bio);
if (log->type->is_remote_recovering &&
log->type->is_remote_recovering(log, region)) {
bio_list_add(&requeue, bio); continue;
}
state = dm_rh_get_state(ms->rh, region, 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY:
this_list = &sync; break;
case DM_RH_NOSYNC:
this_list = &nosync; break;
case DM_RH_RECOVERING:
this_list = &recover; break;
}
bio_list_add(this_list, bio);
}
/* * Add bios that are delayed due to remote recovery * back on to the write queue
*/ if (unlikely(requeue.head)) {
spin_lock_irq(&ms->lock);
bio_list_merge(&ms->writes, &requeue);
spin_unlock_irq(&ms->lock);
delayed_wake(ms);
}
/* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to * be delayed).
*/
dm_rh_inc_pending(ms->rh, &sync);
dm_rh_inc_pending(ms->rh, &nosync);
/* * If the flush fails on a previous call and succeeds here, * we must not reset the log_failure variable. We need * userspace interaction to do that.
*/
ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure;
/* * If the log has failed, unattempted writes are being * put on the holds list. We can't issue those writes * until a log has been marked, so we must store them. * * If a 'noflush' suspend is in progress, we can requeue * the I/O's to the core. This give userspace a chance * to reconfigure the mirror, at which point the core * will reissue the writes. If the 'noflush' flag is * not set, we have no choice but to return errors. * * Some writes on the failures list may have been * submitted before the log failure and represent a * failure to write to one of the devices. It is ok * for us to treat them the same and requeue them * as well.
*/ while ((bio = bio_list_pop(failures))) { if (!ms->log_failure) {
ms->in_sync = 0;
dm_rh_mark_nosync(ms->rh, bio);
}
/* * If all the legs are dead, fail the I/O. * If the device has failed and keep_log is enabled, * fail the I/O. * * If we have been told to handle errors, and keep_log * isn't enabled, hold the bio and wait for userspace to * deal with the problem. * * Otherwise pretend that the I/O succeeded. (This would * be wrong if the failed leg returned after reboot and * got replicated back to the good legs.)
*/ if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
bio_io_error(bio); elseif (errors_handled(ms) && !keep_log(ms))
hold_bio(ms, bio); else
bio_endio(bio);
}
}
r = parse_features(ms, argc, argv, &args_used); if (r) goto err_destroy_wq;
argv += args_used;
argc -= args_used;
/* * Any read-balancing addition depends on the * DM_RAID1_HANDLE_ERRORS flag being present. * This is because the decision to balance depends * on the sync state of a region. If the above * flag is not present, we ignore errors; and * the sync state may be inaccurate.
*/
if (argc) {
ti->error = "Too many mirror arguments";
r = -EINVAL; goto err_destroy_wq;
}
ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(ms->kcopyd_client)) {
r = PTR_ERR(ms->kcopyd_client); goto err_destroy_wq;
}
/* * The region is in-sync and we can perform reads directly. * Store enough information so we can retry if it fails.
*/
m = choose_mirror(ms, bio->bi_iter.bi_sector); if (unlikely(!m)) return DM_MAPIO_KILL;
dm_bio_record(&bio_record->details, bio);
bio_record->m = m;
/* * We need to dec pending if this was a write.
*/ if (rw == WRITE) { if (!(bio->bi_opf & REQ_PREFLUSH) &&
bio_op(bio) != REQ_OP_DISCARD)
dm_rh_dec(ms->rh, bio_record->write_region); return DM_ENDIO_DONE;
}
if (*error == BLK_STS_NOTSUPP) goto out;
if (bio->bi_opf & REQ_RAHEAD) goto out;
if (unlikely(*error)) { if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other * mirror in-sync.
*/
DMERR_LIMIT("Mirror read failed."); return DM_ENDIO_DONE;
}
m = bio_record->m;
DMERR("Mirror read failed from %s. Trying alternative device.",
m->dev->name);
fail_mirror(m, DM_RAID1_READ_ERROR);
/* * A failed read is requeued for another attempt using an intact * mirror.
*/ if (default_ok(m) || mirror_available(ms, bio)) {
bd = &bio_record->details;
/* * Process bios in the hold list to start recovery waiting * for bios in the hold list. After the process, no bio has * a chance to be added in the hold list because ms->suspend * is set.
*/
spin_lock_irq(&ms->lock);
holds = ms->holds;
bio_list_init(&ms->holds);
spin_unlock_irq(&ms->lock);
while ((bio = bio_list_pop(&holds)))
hold_bio(ms, bio);
/* * We must finish up all the work that we've * generated (i.e. recovery work).
*/
dm_rh_stop_recovery(ms->rh);
if (log->type->presuspend && log->type->presuspend(log)) /* FIXME: need better error handling */
DMWARN("log presuspend failed");
/* * Now that recovery is complete/stopped and the * delayed bios are queued, we need to wait for * the worker thread to complete. This way, * we know that all of our I/O has been pushed.
*/
flush_workqueue(ms->kmirrord_wq);
}
atomic_set(&ms->suspend, 0); if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */
DMWARN("log resume failed");
dm_rh_start_recovery(ms->rh);
}
/* * device_status_char * @m: mirror device/leg we want the status of * * We return one character representing the most severe error * we have encountered. * A => Alive - No failures * D => Dead - A write failure occurred leaving mirror out-of-sync * S => Sync - A sychronization failure occurred, mirror out-of-sync * R => Read - A read failure occurred, mirror data unaffected * * Returns: <char>
*/ staticchar device_status_char(struct mirror *m)
{ if (!atomic_read(&(m->error_count))) return'A';
case STATUSTYPE_TABLE:
sz = log->type->status(log, type, result, maxlen);
DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++)
DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsignedlonglong)ms->mirror[m].offset);
num_feature_args += !!errors_handled(ms);
num_feature_args += !!keep_log(ms); if (num_feature_args) {
DMEMIT(" %d", num_feature_args); if (errors_handled(ms))
DMEMIT(" handle_errors"); if (keep_log(ms))
DMEMIT(" keep_log");
}
break;
case STATUSTYPE_IMA:
DMEMIT_TARGET_NAME_VERSION(ti->type);
DMEMIT(",nr_mirrors=%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) {
DMEMIT(",mirror_device_%d=%s", m, ms->mirror[m].dev->name);
DMEMIT(",mirror_device_%d_status=%c",
m, device_status_char(&(ms->mirror[m])));
}
/*
 * (Extraction residue from the hosting website; translated from German:)
 * The information on this web page has been compiled carefully and to the
 * best of our knowledge.  However, neither the completeness, correctness,
 * nor quality of the information provided is guaranteed.
 * Note: the coloured syntax highlighting and the measurement are still
 * experimental.
 */