/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 *          0 | --- | --- | --- | +----+
 *          8 | -W- | -W- | --- | | pp |   data_sector = 8
 *         16 | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *         24 | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of the modified data and pp_size is the size of partial parity
 * for this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0), they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, and the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from the others. They
 * are grouped in the child_logs array in struct ppl_conf, which is assigned
 * to r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write, header_page contains the
 * ppl_header. PPL entries for logged stripes are added in ppl_log_stripe().
 * A stripe_head can be appended to the last entry if it meets the conditions
 * for a valid entry described above, otherwise a new entry is added. Checksums
 * of entries are calculated incrementally as stripes containing partial parity
 * are being added. ppl_submit_iounit() calculates the checksum of the header
 * and submits a bio containing the header page and partial parity pages
 * (sh->ppl_page) for all stripes of the io_unit. When the PPL write completes,
 * the stripes associated with the io_unit are released and raid5d starts
 * writing their data and parity. When all stripes are written, the io_unit is
 * freed and the next one can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or the size of the PPL is reached).
 * Another io_unit can't be submitted until the previous one has completed
 * (PPL and stripe data+parity are written). The log->io_list tracks all
 * io_units of a log (for a single member disk). New io_units are added to the
 * end of the list and the first io_unit is submitted, if it is not submitted
 * already. The current io_unit accepting new stripes is always at the end of
 * the list.
 *
 * If write-back cache is enabled for any of the disks in the array, its data
 * must be flushed before the next io_unit is submitted.
 */
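
/*
 * A minimal userspace sketch of the entry arithmetic in the diagram above
 * (compile it separately, it is not kernel code). All names are local
 * stand-ins; it only restates the math: one 4k partial parity page per
 * stripe_head, and data_size covering every modified chunk of every
 * stripe_head in the entry.
 */
#include <assert.h>
#include <stdio.h>

#define PP_PAGE_SIZE	4096	/* partial parity per stripe_head */

int main(void)
{
	unsigned int stripe_heads = 3;		/* sh->sector 8, 16 and 24 */
	unsigned int modified_disks = 2;	/* dd0 and dd1 are written */

	unsigned long long data_sector = 8;	/* first modified raid sector */
	unsigned long long data_size =
		(unsigned long long)stripe_heads * modified_disks * PP_PAGE_SIZE;
	unsigned long long pp_size =
		(unsigned long long)stripe_heads * PP_PAGE_SIZE;

	assert(data_size == 3 * 2 * 4096);	/* matches the diagram */
	assert(pp_size == 3 * 4096);
	printf("data_sector=%llu data_size=%llu pp_size=%llu\n",
	       data_sector, data_size, pp_size);
	return 0;
}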
#define PPL_SPACE_SIZE (128 * 1024)
struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;

	/* stripes to retry if failed to allocate io_unit */
	struct list_head no_mem_stripes;
	spinlock_t no_mem_stripes_lock;

	unsigned short write_hint;
};
struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */

	sector_t next_io_sector;
	unsigned int entry_space;
	bool use_multippl;
	bool wb_cache_on;
	unsigned long disk_flush_bitmap;
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* current total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */
	atomic_t pending_flushes;	/* how many disk flushes are in progress */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};
/*
 * Partial parity is the XOR of stripe data chunks that are not changed
 * during the write request. Depending on available data
 * (read-modify-write vs. reconstruct-write case) we calculate it
 * differently.
 */
if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
	/*
	 * rmw: xor old data and parity from updated disks
	 * This is calculated earlier by ops_run_prexor5() so just copy
	 * the parity dev page.
	 */
	srcs[count++] = sh->dev[pd_idx].page;
} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
	/* rcw: xor data from all not updated disks */
	for (i = disks; i--;) {
		struct r5dev *dev = &sh->dev[i];

		if (test_bit(R5_UPTODATE, &dev->flags))
			srcs[count++] = dev->page;
	}
} else {
	return tx;
}
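
/*
 * A standalone sketch of the identity the calculation above relies on
 * (userspace, compile separately): with pp defined as the XOR of the chunks
 * that are not being written, the rmw form (old data XOR old parity of the
 * updated disks) produces the same pp, and the new parity is pp XOR the new
 * data. This is what makes pp sufficient to recompute parity after an
 * unclean shutdown.
 */
#include <assert.h>

int main(void)
{
	unsigned char d0 = 0x11, d1 = 0x22, d2 = 0x33, d3 = 0x44;
	unsigned char parity = d0 ^ d1 ^ d2 ^ d3;

	/* the write request updates d0 and d1 */
	unsigned char d0_new = 0xaa, d1_new = 0xbb;

	/* rcw: pp is the XOR of the not updated chunks */
	unsigned char pp = d2 ^ d3;

	/* rmw: old data and old parity of the updated disks give the same pp */
	assert(pp == (unsigned char)(d0 ^ d1 ^ parity));

	/* the new parity is pp XOR the new data */
	assert((unsigned char)(pp ^ d0_new ^ d1_new) ==
	       (unsigned char)(d0_new ^ d1_new ^ d2 ^ d3));
	return 0;
}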
/* check if current io_unit is full */
if (io && (io->pp_size == log->entry_space ||
	   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
	pr_debug("%s: add io_unit blocked by seq: %llu\n",
		 __func__, io->seq);
	io = NULL;
}

/* add a new unit if there is none or the current is full */
if (!io) {
	io = ppl_new_iounit(log, sh);
	if (!io)
		return -ENOMEM;
	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->io_list);
	spin_unlock_irq(&log->io_list_lock);

	log->current_io = io;
}
for (i = 0; i < sh->disks; i++) {
	struct r5dev *dev = &sh->dev[i];

	if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
		if (!data_disks || dev->sector < data_sector)
			data_sector = dev->sector;
		data_disks++;
	}
}
BUG_ON(!data_disks);
/*
 * Check if we can append the stripe to the last entry. It must
 * be just after the last logged stripe and write to the same
 * disks. Use bit shift and logarithm to avoid 64-bit division.
 */
if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) &&
    (data_sector >> ilog2(conf->chunk_sectors) ==
     data_sector_last >> ilog2(conf->chunk_sectors)) &&
    ((data_sector - data_sector_last) * data_disks ==
     data_size_last >> 9))
	e = last;
}
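
/*
 * A userspace sketch of the shift trick in the condition above (compile
 * separately): for a power-of-two chunk_sectors, x >> ilog2(chunk_sectors)
 * equals x / chunk_sectors, so two sectors compare equal exactly when they
 * fall into the same chunk, without a 64-bit division. ilog2_u64() is a
 * local stand-in for the kernel's ilog2().
 */
#include <assert.h>
#include <stdint.h>

static unsigned int ilog2_u64(uint64_t x)
{
	unsigned int log = 0;

	while (x >>= 1)
		log++;
	return log;
}

int main(void)
{
	uint64_t chunk_sectors = 32;	/* 16k chunk, a power of two */
	uint64_t a = 40, b = 56;	/* both inside the second chunk */

	assert((a >> ilog2_u64(chunk_sectors)) == a / chunk_sectors);
	assert((a >> ilog2_u64(chunk_sectors)) ==
	       (b >> ilog2_u64(chunk_sectors)));	/* same chunk */
	assert((a >> ilog2_u64(chunk_sectors)) !=
	       (a + chunk_sectors) >> ilog2_u64(chunk_sectors));
	return 0;
}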
if (!e) {
	e = &pplhdr->entries[io->entries_count++];
	e->data_sector = cpu_to_le64(data_sector);
	e->parity_disk = cpu_to_le32(sh->pd_idx);
	e->checksum = cpu_to_le32(~0);
}
/* Rewind the buffer if current PPL is larger than remaining space */
if (log->use_multippl &&
    log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector <
    (PPL_HEADER_SIZE + io->pp_size) >> 9)
	log->next_io_sector = log->rdev->ppl.sector;
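
/*
 * A sketch of the wraparound arithmetic above (userspace, compile
 * separately): the PPL area spans [ppl.sector, ppl.sector + ppl.size); if
 * the next io_unit (header plus partial parity, converted to sectors with
 * >> 9) does not fit in the tail, writing restarts at the beginning of the
 * area. All variables are local stand-ins for the rdev->ppl fields.
 */
#include <assert.h>
#include <stdint.h>

#define PPL_HDR_BYTES	4096	/* assumed 4k header, like PPL_HEADER_SIZE */

int main(void)
{
	uint64_t ppl_start = 2048, ppl_size = 256;	/* sectors */
	uint64_t next_io_sector = ppl_start + 240;	/* near the end */
	uint64_t pp_size = 16 * 1024;			/* bytes */
	uint64_t io_sectors = (PPL_HDR_BYTES + pp_size) >> 9;	/* 40 */

	if (ppl_start + ppl_size - next_io_sector < io_sectors)
		next_io_sector = ppl_start;		/* rewind */

	assert(next_io_sector == ppl_start);	/* 16 sectors left < 40 */
	return 0;
}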
/* if start and end are 4k aligned, use a 4k block */
if (block_size == 512 &&
    (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 &&
    (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0)
	block_size = RAID5_STRIPE_SIZE(conf);
/* iterate through blocks in strip */
for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
	bool update_parity = false;
	sector_t parity_sector;
	struct md_rdev *parity_rdev;
	struct stripe_head sh;
	int disk;
	int indent = 0;

	/* iterate through data member disks */
	for (disk = 0; disk < data_disks; disk++) {
		int dd_idx;
		struct md_rdev *rdev;
		sector_t sector;

		sector_t r_sector = r_sector_first + i +
			(disk * conf->chunk_sectors);

		pr_debug("%s:%*s data member disk %d start\n",
			 __func__, indent, "", disk);
		indent += 2;
/* read partial parity for this entry and calculate its checksum */
while (pp_size) {
	int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

	if (!sync_page_io(rdev, sector - rdev->data_offset,
			  s, page, REQ_OP_READ, false)) {
		md_error(mddev, rdev);
		ret = -EIO;
		goto out;
	}

	crc = crc32c(crc, page_address(page), s);

	pp_size -= s;
	sector += s >> 9;
}

crc = ~crc;
if (crc != crc_stored) {
	/*
	 * Don't recover this entry if the checksum does not
	 * match, but keep going and try to recover other
	 * entries.
	 */
	pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
		 __func__, crc_stored, crc);
	ppl_conf->mismatch_count++;
} else {
	ret = ppl_recover_entry(log, e, ppl_sector);
	if (ret)
		goto out;
	ppl_conf->recovered_entries++;
}

ppl_sector += ppl_entry_sectors;
}
/* flush the disk cache after recovery if necessary */
ret = blkdev_issue_flush(rdev->bdev);
out:
	__free_page(page);
	return ret;
}
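
/*
 * A self-contained sketch of the incremental checksum property the recovery
 * loop above relies on (userspace, compile separately): feeding the data in
 * page-sized chunks while carrying the crc yields the same result as one
 * pass over the whole buffer, with the final value inverted, matching the
 * crc = ~crc step. crc32c_update() is a plain bitwise stand-in for the
 * kernel's crc32c().
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78u : 0);
	}
	return crc;
}

int main(void)
{
	unsigned char buf[64];
	uint32_t whole, chunked;

	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)i;

	whole = ~crc32c_update(~0u, buf, sizeof(buf));

	chunked = crc32c_update(~0u, buf, 40);		/* first "page" */
	chunked = crc32c_update(chunked, buf + 40, 24);	/* second "page" */
	chunked = ~chunked;				/* crc = ~crc */

	assert(whole == chunked);	/* chunking does not change the result */
	return 0;
}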
static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;
/* calculate next potential ppl offset */
for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++)
	pplhdr_offset +=
		le32_to_cpu(pplhdr->entries[i].pp_size) >> 9;
pplhdr_offset += PPL_HEADER_SIZE >> 9;
}
/* no valid ppl found */
if (!pplhdr)
	ppl_conf->mismatch_count++;
else
	pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n",
		 __func__, (unsigned long long)pplhdr_offset,
		 le64_to_cpu(pplhdr->generation));

/* attempt to recover from log if we are starting a dirty array */
if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector)
	ret = ppl_recover(log, pplhdr, pplhdr_offset);

/* write empty header if we are starting the array */
if (!ret && !mddev->pers)
	ret = ppl_write_empty_header(log);
static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the same
		 * on all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}
if (ppl_conf) {
	__ppl_exit_log(ppl_conf);
	conf->log_private = NULL;
}
}
static int ppl_validate_rdev(struct md_rdev *rdev)
{
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors,
			RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private));

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %pg\n",
			mdname(rdev->mddev), rdev->bdev);
		return -ENOSPC;
	}
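
/*
 * A sketch of the size check above (userspace, compile separately):
 * subtract the header, round the remainder down to a whole number of
 * stripes, and reject a PPL that leaves no data space. rounddown() is
 * reimplemented locally with the same semantics as the kernel macro for
 * positive values.
 */
#include <assert.h>

#define PPL_HDR_SECTORS	8	/* assumed 4k header, in 512B sectors */
#define rounddown(x, y)	((x) - ((x) % (y)))

int main(void)
{
	int stripe_sectors = 8;		/* 4k stripe */
	int ppl_size = 260;		/* configured PPL size, sectors */
	int data = ppl_size - PPL_HDR_SECTORS;	/* 252 */

	if (data > 0)
		data = rounddown(data, stripe_sectors);
	assert(data == 248);	/* usable, stripe-aligned parity space */

	/* an 8-sector PPL has no room left after the 8-sector header */
	assert(8 - PPL_HDR_SECTORS <= 0);
	return 0;
}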
if (mddev->level != 5) {
	pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
		mdname(mddev), mddev->level);
	return -EINVAL;
}

if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
	pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
		mdname(mddev));
	return -EINVAL;
}

if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
	pr_warn("md/raid:%s PPL is not compatible with journal\n",
		mdname(mddev));
	return -EINVAL;
}

max_disks = sizeof_field(struct ppl_log, disk_flush_bitmap) *
	BITS_PER_BYTE;
if (conf->raid_disks > max_disks) {
	pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
		mdname(mddev), max_disks);
	return -EINVAL;
}
ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
if (!ppl_conf)
	return -ENOMEM;

ppl_conf->mddev = mddev;

ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
if (!ppl_conf->io_kc) {
	ret = -ENOMEM;
	goto err;
}

ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc,
		   ppl_io_pool_free, ppl_conf->io_kc);
if (ret)
	goto err;

ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS);
if (ret)
	goto err;

ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0);
if (ret)
	goto err;

ppl_conf->count = conf->raid_disks;
ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
			       GFP_KERNEL);
if (!ppl_conf->child_logs) {
	ret = -ENOMEM;
	goto err;
}

if (rdev) {
	ret = ppl_validate_rdev(rdev);
	if (ret)
		goto err;

	ppl_init_child_log(log, rdev);
}
}
/* load and possibly recover the logs from the member disks */
ret = ppl_load(ppl_conf);
if (ret) {
	goto err;
} else if (!mddev->pers && mddev->resync_offset == 0 &&
	   ppl_conf->recovered_entries > 0 &&
	   ppl_conf->mismatch_count == 0) {
	/*
	 * If we are starting a dirty array and the recovery succeeds
	 * without any issues, set the array as clean.
	 */
	mddev->resync_offset = MaxSector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
	/* no mismatch allowed when enabling PPL for a running array */
	ret = -EINVAL;
	goto err;
}