// SPDX-License-Identifier: GPL-2.0-or-later /* md.c : Multiple Devices driver for Linux Copyright (C) 1998, 1999, 2000 Ingo Molnar
completely rewritten, based on the MD driver code from Marc Zyngier
Changes:
- RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
- lots of fixes and improvements to the RAID1/RAID5 and generic RAID code (such as request based resynchronization):
Neil Brown <neilb@cse.unsw.edu.au>.
- persistent bitmap code Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
Errors, Warnings, etc. Please use: pr_crit() for error conditions that risk data loss pr_err() for error conditions that are unexpected, like an IO error or internal inconsistency pr_warn() for error conditions that could have been predicated, like adding a device to an array when it has incompatible metadata pr_info() for every interesting, very rare events, like an array starting or stopping, or resync starting or stopping pr_debug() for everything else.
/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Noted that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
 */
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20

/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)

/*
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *      to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * Following configurations can be changed via /proc/sys/dev/raid/ for system
 * or /sys/block/mdX/md/ for one array.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static int sysctl_sync_io_depth = 32;
/*
 * Initialise the serialization resources for every rdev in the array.
 *
 * Stops at the first rdev that fails to initialise; in that case, all
 * partially-initialised per-rdev resources are torn down again — but only
 * if the shared serial_info_pool has not been created yet.
 *
 * Returns 0 on success or the error from rdev_init_serial().
 */
static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if pool is not existed */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}
/*
 * rdev needs to enable serial stuffs if it meets the conditions:
 * 1. it is multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}
/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which return true from rdev_enable_serial.
 * 2. rdev is NULL, means we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	/* Nothing to do for an rdev that neither needs nor uses serialization. */
	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}
/* * Free resource from rdev(s), and destroy serial_info_pool under conditions: * 1. rdev is the last device flaged with CollisionCheck. * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{ if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return;
if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */
if (num)
pr_info("The mempool could be used by other devices\n"); else {
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
}
}
/* * The original mechanism for creating an md device is to create * a device node in /dev and to open it. This causes races with device-close. * The preferred method is to write to the "new_array" module parameter. * This can avoid races. * Setting create_on_open to false disables the original mechanism * so all the races disappear.
*/ staticbool create_on_open = true; staticbool legacy_async_del_gendisk = true;
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/* Bump the global event count and wake any /proc/mdstat pollers. */
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
staticbool is_md_suspended(struct mddev *mddev)
{ return percpu_ref_is_dying(&mddev->active_io);
} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. * We hold a refcount over the call to ->make_request. By the time that * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more.
*/ staticbool is_suspended(struct mddev *mddev, struct bio *bio)
{ if (is_md_suspended(mddev)) returntrue; if (bio_data_dir(bio) != WRITE) returnfalse; if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) returnfalse; if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) returnfalse; if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) returnfalse; returntrue;
}
bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended: if (is_suspended(mddev, bio)) {
DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); returntrue;
} for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break;
schedule();
}
finish_wait(&mddev->sb_wait, &__wait);
} if (!percpu_ref_tryget_live(&mddev->active_io)) goto check_suspended;
if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io); if (!mddev->gendisk && mddev->pers->prepare_suspend) returnfalse; goto check_suspended;
}
/*
 * NOTE(review): orphaned fragment — this looks like the tail of
 * md_submit_bio(): the function head and the declaration of 'rw' tested
 * below were lost in extraction.  Left byte-identical; recover the missing
 * opening lines from the original source before building.
 */
if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio); return;
}
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
md_handle_request(mddev, bio);
}
/* * Make sure no new requests are submitted to the device, and any requests that * have been submitted are completely handled.
*/ int mddev_suspend(struct mddev *mddev, bool interruptible)
{ int err = 0;
/* * hold reconfig_mutex to wait for normal io will deadlock, because * other context can't update super_block, and normal io can rely on * updating super_block.
*/
lockdep_assert_not_held(&mddev->reconfig_mutex);
if (interruptible)
err = mutex_lock_interruptible(&mddev->suspend_mutex); else
mutex_lock(&mddev->suspend_mutex); if (err) return err;
/* Block new I/O, then wait for all in-flight I/O to drain. */
percpu_ref_kill(&mddev->active_io); if (interruptible)
err = wait_event_interruptible(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io)); else
wait_event(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io)); if (err) {
/* Interrupted: re-enable I/O and back out. */
percpu_ref_resurrect(&mddev->active_io);
mutex_unlock(&mddev->suspend_mutex); return err;
}
/* * For raid456, io might be waiting for reshape to make progress, * allow new reshape to start while waiting for io to be done to * prevent deadlock.
*/
WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
/*
 * NOTE(review): garbled span — the tail of mddev_suspend()
 * (mutex_unlock + return 0) is missing, and the lines below
 * ('recovery_needed' is undeclared here) appear to belong to a
 * resume path fused in by extraction.  Recover from the original.
 */
if (recovery_needed)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
/* sync bdev before setting device to readonly or stopping raid*/
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
	/*
	 * Refuse while the array is open by more than 'opener_num' openers,
	 * or while another closer already marked it MD_CLOSING.
	 */
	mutex_lock(&mddev->open_mutex);
	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	mutex_unlock(&mddev->open_mutex);

	sync_blockdev(mddev->gendisk->part0);
	return 0;
}
/*
 * The only difference from bio_chain_endio() is that the current
 * bi_status of bio does not affect the bi_status of parent.
 */
static void md_end_flush(struct bio *bio)
{
	struct bio *parent = bio->bi_private;

	/*
	 * If any flush io error before the power failure,
	 * disk data may be lost.
	 */
	if (bio->bi_status)
		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
		       blk_status_to_errno(bio->bi_status));

	bio_put(bio);
	bio_endio(parent);
}
/*
 * NOTE(review): garbled span — the head of md_flush_request() is fused
 * with what looks like the body of a put/teardown helper (MD_DELETED +
 * del_work queuing), and this bool function has no return statement.
 * Left byte-identical; recover the real bodies from the original source.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{ struct md_rdev *rdev; struct bio *new;
/* * md_flush_reqeust() should be called under md_handle_request() and * 'active_io' is already grabbed. Hence it's safe to get rdev directly * without rcu protection.
*/
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
/* * If array is freed by stopping array, MD_DELETED is set by * do_md_stop(), MD_DELETED is still set here in case mddev is freed * directly by closing a mddev that is created by create_on_open.
*/
set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done.
*/
queue_work(md_misc_wq, &mddev->del_work);
}
/* Drop a reference; caller must already hold all_mddevs_lock. */
static void mddev_put_locked(struct mddev *mddev)
{
	if (atomic_dec_and_test(&mddev->active))
		__mddev_put(mddev);
}
/*
 * NOTE(review): garbled span — the head of mddev_put() is fused with what
 * is clearly the body of mddev_unlock() (see the EXPORT_SYMBOL_GPL at the
 * end): the to_remove/sysfs teardown and reconfig_mutex unlocks belong to
 * mddev_unlock, while mddev_put's own body (and the 'delete' list used
 * below) is missing.  Left byte-identical; recover both functions from the
 * original source.
 */
void mddev_put(struct mddev *mddev)
{ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return;
if (!list_empty(&mddev->deleting))
list_splice_init(&mddev->deleting, &delete);
if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. * So hold set sysfs_active while the remove in happeing, * and anything else which might set ->to_remove or my * otherwise change the sysfs namespace will fail with * -EBUSY if sysfs_active is still set. * We set sysfs_active under reconfig_mutex and elsewhere * test it under the same mutex to ensure its correct value * is seen.
*/ conststruct attribute_group *to_remove = mddev->to_remove;
mddev->to_remove = NULL;
mddev->sysfs_active = 1;
mutex_unlock(&mddev->reconfig_mutex);
if (mddev->kobj.sd) { if (to_remove != &md_redundancy_group)
sysfs_remove_group(&mddev->kobj, to_remove); if (mddev->pers == NULL ||
mddev->pers->sync_request == NULL) {
sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action)
sysfs_put(mddev->sysfs_action); if (mddev->sysfs_completed)
sysfs_put(mddev->sysfs_completed); if (mddev->sysfs_degraded)
sysfs_put(mddev->sysfs_degraded);
mddev->sysfs_action = NULL;
mddev->sysfs_completed = NULL;
mddev->sysfs_degraded = NULL;
}
}
mddev->sysfs_active = 0;
} else
mutex_unlock(&mddev->reconfig_mutex);
if (!legacy_async_del_gendisk) { /* * Call del_gendisk after release reconfig_mutex to avoid * deadlock (e.g. call del_gendisk under the lock and an * access to sysfs files waits the lock) * And MD_DELETED is only used for md raid which is set in * do_md_stop. dm raid only uses md_stop to stop. So dm raid * doesn't need to check MD_DELETED when getting reconfig lock
*/ if (test_bit(MD_DELETED, &mddev->flags))
del_gendisk(mddev->gendisk);
}
}
EXPORT_SYMBOL_GPL(mddev_unlock);
/*
 * NOTE(review): headless fragment — this is the interior of the
 * personality-lookup helper (presumably get_pers()/find_pers()); its
 * signature and the declarations of 'i', 'head', 'ret', 'level' and
 * 'clevel' were lost in extraction.  Left byte-identical.
 */
xa_lock(&md_submodule);
xa_for_each(&md_submodule, i, head) { if (head->type != MD_PERSONALITY) continue; if ((level != LEVEL_NONE && head->id == level) ||
!strcmp(head->name, clevel)) { if (try_module_get(head->owner))
ret = (void *)head; break;
}
}
xa_unlock(&md_submodule);
if (!ret) { if (level != LEVEL_NONE)
pr_warn("md: personality for level %d is not loaded!\n",
level); else
pr_warn("md: personality for level %s is not loaded!\n",
clevel);
}
/* return the offset of the super block in 512byte sectors */ staticinline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{ return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}
/*
 * NOTE(review): two fragments — the first three lines are the tail of the
 * superblock-write completion handler (presumably super_written()); the
 * md_super_write() below is missing its entire body after the 'bio'
 * declaration.  Left byte-identical; recover from the original source.
 */
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
}
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
sector_t sector, int size, struct page *page)
{ /* write first size bytes of page to sector of rdev * Increment mddev->pending_writes before returning * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error
*/ struct bio *bio;
/*
 * Wait until every scheduled superblock write has completed.
 * Returns -EAGAIN when a write failed and needs to be retried
 * (MD_SB_NEED_REWRITE was set by the completion path), else 0.
 */
int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}
/*
 * NOTE(review): two fragments — sync_page_io() is cut off after its local
 * declarations, and the loop below belongs to the 0.90 checksum helper
 * (presumably calc_sb_csum(); 'i', 'newcsum', 'csum', 'sb32' and
 * 'disk_csum' are all undeclared here).  Left byte-identical.
 */
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op)
{ struct bio bio; struct bio_vec bvec;
for (i = 0; i < MD_SB_BYTES/4 ; i++)
newcsum += sb32[i];
csum = (newcsum & 0xffffffff) + (newcsum>>32);
#ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on * different architectures. It isn't critical that we get exactly * the same return value as before (we always csum_fold before * testing, and that removes any differences). However as we * know that csum_partial always returned a 16bit value on * alphas, do a fold to maximise conformity to previous behaviour.
*/
sb->sb_csum = md_csum_fold(disk_csum); #else
sb->sb_csum = disk_csum; #endif return csum;
}
/* * Handle superblock details. * We want to be able to handle multiple superblock formats * so we have a common interface to them all, and an array of * different handlers. * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: * 0 - dev has a superblock that is compatible with refdev * 1 - dev has a superblock that is compatible and newer than refdev * so dev should be used as the refdev in future * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. *
*/
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->head.name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
/* * load_super for 0.90.0
*/ staticint super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
mdp_super_t *sb; int ret; bool spare_disk = true;
/* * Calculate the position of the superblock (512byte sectors), * it's at the end of the disk. * * It also happens to be a multiple of 4Kb.
*/
rdev->sb_start = calc_dev_sboffset(rdev);
ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret;
ret = -EINVAL;
sb = page_address(rdev->sb_page);
if (sb->md_magic != MD_SB_MAGIC) {
pr_warn("md: invalid raid superblock magic on %pg\n",
rdev->bdev); goto abort;
}
if (sb->major_version != 0 ||
sb->minor_version < 90 ||
sb->minor_version > 91) {
pr_warn("Bad version number %d.%d on %pg\n",
sb->major_version, sb->minor_version, rdev->bdev); goto abort;
}
if (sb->raid_disks <= 0) goto abort;
if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort;
}
/* not spare disk */ if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
spare_disk = false;
if (!refdev) { if (!spare_disk)
ret = 1; else
ret = 0;
} else {
__u64 ev1, ev2;
mdp_super_t *refsb = page_address(refdev->sb_page); if (!md_uuid_equal(refsb, sb)) {
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev, refdev->bdev); goto abort;
} if (!md_sb_equal(refsb, sb)) {
pr_warn("md: %pg has same UUID but different superblock to %pg\n",
rdev->bdev, refdev->bdev); goto abort;
}
ev1 = md_event(sb);
ev2 = md_event(refsb);
if (!spare_disk && ev1 > ev2)
ret = 1; else
ret = 0;
}
rdev->sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * (not needed for Linear and RAID0 as metadata doesn't * record this size)
*/ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
rdev->sectors = (sector_t)(2ULL << 32) - 2;
if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */
ret = -EINVAL;
/*
 * NOTE(review): headless fragment — this is the interior of the 0.90
 * validate routine (presumably super_90_validate(); 'mddev', 'ev1' and
 * 'desc' are undeclared here), and its first branch is missing.  The two
 * trailing lines belong to the start of the 0.90 sync routine.  Left
 * byte-identical; recover from the original source.
 */
} elseif (mddev->pers == NULL) { /* Insist on good event counter while assembling, except
* for spares (which don't need an event count) */
++ev1; if (sb->disks[rdev->desc_nr].state & (
(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) if (ev1 < mddev->events) return -EINVAL;
} elseif (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old.
*/ if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
} else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0;
}
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
set_bit(Faulty, &rdev->flags); elseif (desc->state & (1<<MD_DISK_SYNC)) {
set_bit(In_sync, &rdev->flags);
rdev->raid_disk = desc->raid_disk;
rdev->saved_raid_disk = desc->raid_disk;
} elseif (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where * that is, so set to zero for now
*/ if (mddev->minor_version >= 91) {
rdev->recovery_offset = 0;
rdev->raid_disk = desc->raid_disk;
}
} if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags); if (desc->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags); return 0;
}
/* make rdev->sb match mddev data.. * * 1/ zero out disks * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); * 3/ any empty disks < next_spare become removed * * disks[0] gets initialised to REMOVED because * we cannot be sure from other fields if it has * been initialised or not.
*/ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0;
/*
 * NOTE(review): garbled span — super_1_load() is truncated and fused with
 * interior lines of the 1.x validate routine ('mddev' is not in scope in
 * super_1_load, and 'sb' is dereferenced before being assigned from
 * page_address()).  The magic/version checks and the refdev comparison are
 * missing.  Left byte-identical; recover both functions from the original
 * source.
 */
staticint super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{ struct mdp_superblock_1 *sb; int ret;
sector_t sb_start;
sector_t sectors; int bmask; bool spare_disk = true;
/* * Calculate the position of the superblock in 512byte sectors. * It is always aligned to a 4K boundary and * depeding on minor_version, it can be: * 0: At least 8K, but less than 12K, from end of device * 1: At start of device * 2: 4K from start of device.
*/ switch(minor_version) { case 0:
sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
sb_start &= ~(sector_t)(4*2-1); break; case 1:
sb_start = 0; break; case 2:
sb_start = 8; break; default: return -EINVAL;
}
rdev->sb_start = sb_start;
/* superblock is rarely larger than 1K, but it can be larger, * and it is safe to read 4k, so we do that
*/
ret = read_disk_sb(rdev, 4096); if (ret) return ret;
if (calc_sb_1_csum(sb) != sb->sb_csum) {
pr_warn("md: invalid superblock checksum on %pg\n",
rdev->bdev); return -EINVAL;
} if (le64_to_cpu(sb->data_size) < 10) {
pr_warn("md: data_size too small on %pg\n",
rdev->bdev); return -EINVAL;
} if (sb->pad0 ||
sb->pad3[0] ||
memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) /* Some padding is non-zero, might be a new feature */ return -EINVAL;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
mddev->bitmap_info.file == NULL) {
mddev->bitmap_info.offset =
(__s32)le32_to_cpu(sb->bitmap_offset); /* Metadata doesn't record how much space is available. * For 1.0, we assume we can use up to the superblock * if before, else to 4K beyond superblock. * For others, assume no change is possible.
*/ if (mddev->minor_version > 0)
mddev->bitmap_info.space = 0; elseif (mddev->bitmap_info.offset > 0)
mddev->bitmap_info.space =
8 - mddev->bitmap_info.offset; else
mddev->bitmap_info.space =
-mddev->bitmap_info.offset;
}
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
if (le32_to_cpu(sb->feature_map) &
(MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) &
(MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
(le32_to_cpu(sb->feature_map) &
MD_FEATURE_MULTIPLE_PPLS)) return -EINVAL;
set_bit(MD_HAS_PPL, &mddev->flags);
}
/*
 * NOTE(review): headless fragment — interior of the 1.x validate routine
 * (presumably super_1_validate(); 'mddev', 'ev1', 'role' and 'freshest'
 * are undeclared here).  Only the tail through 'return 0;' survives.
 * Left byte-identical; recover the missing head from the original source.
 */
} elseif (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count). * Similar to mdadm, we allow event counter difference of 1 * from the freshest device.
*/ if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 + 1 < mddev->events) return -EINVAL;
} elseif (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old.
*/ if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
} else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0;
}
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} elseif (mddev->pers == NULL && freshest && ev1 < mddev->events) { /* * If we are assembling, and our event counter is smaller than the * highest event counter, we cannot trust our superblock about the role. * It could happen that our rdev was marked as Faulty, and all other * superblocks were updated with +1 event counter. * Then, before the next superblock update, which typically happens when * remove_and_add_spares() removes the device from the array, there was * a crash or reboot. * If we allow current rdev without consulting the freshest superblock, * we could cause data corruption. * Note that in this case our event counter is smaller by 1 than the * highest, otherwise, this rdev would not be allowed into array; * both kernel and mdadm allow event counter difference of 1.
*/ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
if (rdev->desc_nr >= freshest_max_dev) { /* this is unexpected, better not proceed */
pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
mdname(mddev), rdev->bdev, rdev->desc_nr,
freshest->bdev, freshest_max_dev); return -EUCLEAN;
}
role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
mdname(mddev), rdev->bdev, role, role, freshest->bdev);
} else {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
} switch (role) { case MD_DISK_ROLE_SPARE: /* spare */ break; case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags); break; case MD_DISK_ROLE_JOURNAL: /* journal device */ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { /* journal device without journal feature */
pr_warn("md: journal device provided without journal feature, ignoring the device\n"); return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
rdev->raid_disk = 0; break; default:
rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_OFFSET)) {
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
} else { /* * If the array is FROZEN, then the device can't * be in_sync with rest of array.
*/ if (!test_bit(MD_RECOVERY_FROZEN,
&mddev->recovery))
set_bit(In_sync, &rdev->flags);
}
rdev->raid_disk = role; break;
} if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags); if (sb->devflags & FailFast1)
set_bit(FailFast, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
return 0;
}
/*
 * NOTE(review): three fragments — super_1_sync() is cut off after its
 * declarations; the if/else chain below is the headless interior of the
 * bitmap-space helper (presumably super_1_choose_bm_space(); 'dev_size'
 * and 'bm_space' are undeclared at that point); and
 * super_1_rdev_size_change() is truncated after computing max_sectors.
 * Left byte-identical; recover from the original source.
 */
staticvoid super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{ struct mdp_superblock_1 *sb; struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */
/* if the device is bigger than 8Gig, save 64k for bitmap * usage, if bigger than 200Gig, save 128k
*/ if (dev_size < 64*2)
bm_space = 0; elseif (dev_size - 64*2 >= 200*1024*1024*2)
bm_space = 128*2; elseif (dev_size - 4*2 > 8*1024*1024*2)
bm_space = 64*2; else
bm_space = 4*2; return bm_space;
}
staticunsignedlonglong
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{ struct mdp_superblock_1 *sb;
sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->data_offset != rdev->new_data_offset) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */
max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
} elseif (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0;
} else { /* minor version 0; superblock after data */
sector_t sb_start, bm_space;
sector_t dev_size = bdev_nr_sectors(rdev->bdev);
/* 8K is for superblock */
sb_start = dev_size - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
bm_space = super_1_choose_bm_space(dev_size);
/* Space that can be used to store date needs to decrease * superblock bitmap space and bad block space(4K)
*/
max_sectors = sb_start - bm_space - 4*2;
/*
 * NOTE(review): super_1_allow_new_offset() is truncated — the bitmap-stats
 * block at the end is unfinished and the closing return/brace are missing.
 * Also note the fused tokens ('staticint', 'unsignedlonglong') throughout.
 * Left byte-identical; recover the tail from the original source.
 */
staticint
super_1_allow_new_offset(struct md_rdev *rdev, unsignedlonglong new_offset)
{ /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1;
/* with 1.0 metadata, there is no metadata to tread on
* so we can always move back */ if (rdev->mddev->minor_version == 0) return 1;
/* otherwise we must be sure not to step on * any metadata, so stay: * 36K beyond start of superblock * beyond end of badblocks * beyond write-intent bitmap
*/ if (rdev->sb_start + (32+4)*2 > new_offset) return 0;
if (!rdev->mddev->bitmap_info.file) { struct mddev *mddev = rdev->mddev; struct md_bitmap_stats stats; int err;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.