/*
 * This target will sequentially log all writes to the target device onto the
 * log device. This is helpful for replaying writes to check for fs consistency
 * at all times. This target provides a mechanism to mark specific events to
 * check data at a later time. So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed, this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache. So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * Would result in the log looking like this:
 *
 * c,a,b,flush,fuad,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH. FUA bypasses cache so once it
 * completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass cache so that they are logged in
 * order of completion along with the normal writes. If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */

/* Per-entry flag bits stored in log_write_entry::flags. */
#define LOG_FLUSH_FLAG		(1 << 0)
#define LOG_FUA_FLAG		(1 << 1)
#define LOG_DISCARD_FLAG	(1 << 2)
#define LOG_MARK_FLAG		(1 << 3)
#define LOG_METADATA_FLAG	(1 << 4)
/* * The disk format for this is braindead simple. * * At byte 0 we have our super, followed by the following sequence for * nr_entries: * * [ 1 sector ][ entry->nr_sectors ] * [log_write_entry][ data written ] * * The log_write_entry takes up a full sector so we can have arbitrary length * marks and it leaves us room for extra content in the future.
*/
/*
 * Basic info about the log for userspace.
 *
 * On-disk superblock, stored at byte 0 of the log device (see the disk
 * format description above).  Fields are fixed-width little-endian so
 * the format is host-independent.
 */
struct log_write_super {
	__le64 magic;		/* magic number identifying the log format */
	__le64 version;		/* on-disk format version */
	__le64 nr_entries;	/* number of log_write_entry records that follow */
	__le32 sectorsize;	/* logical sector size of the log device */
};
/*
 * On-disk header preceding each logged chunk of data.  Takes up a full
 * sector on disk (see the disk format description above) so marks can be
 * arbitrary length and there is room for future extra content.
 *
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry (LOG_*_FLAG bits).
 * data_len - the size of the data in this log entry, this is for private log
 * entry stuff, the MARK data provided by userspace for example.
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};
/*
 * End-io handler for the superblock write bio.
 *
 * Signals lc->super_done so that the next waiter (which serializes super
 * writes with wait_for_completion_io()) knows this super has completed,
 * then hands off to the common log_end_io() completion path.
 *
 * Fix: original read "staticvoid" (missing space), which does not compile.
 */
static void log_end_super(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	complete(&lc->super_done);
	log_end_io(bio);
}
/*
 * Meant to be called if there is an error, it will free all the pages
 * associated with the block.
 *
 * Fix: original read "staticvoid" (missing space), which does not compile.
 */
static void free_pending_block(struct log_writes_c *lc,
			       struct pending_block *block)
{
	int i;

	/* Release every data page that was copied for this block's bvecs. */
	for (i = 0; i < block->vec_cnt; i++) {
		if (block->vecs[i].bv_page)
			__free_page(block->vecs[i].bv_page);
	}
	kfree(block->data);
	kfree(block);
	/* Drop this block's pending reference on the target context. */
	put_pending_block(lc);
}
if (block->datalen && metadatalen == 0) { if (write_inline_data(lc, &entry, sizeof(entry), block->data,
block->datalen, sector)) {
free_pending_block(lc, block); return -1;
} /* we don't support both inline data & bio data */ goto out;
}
for (i = 0; i < block->vec_cnt; i++) { /* * The page offset is always 0 because we allocate a new page * for every bvec in the original bio for simplicity sake.
*/
ret = bio_add_page(bio, block->vecs[i].bv_page,
block->vecs[i].bv_len, 0); if (ret != block->vecs[i].bv_len) {
atomic_inc(&lc->io_blocks);
submit_bio(bio);
bio = bio_alloc(lc->logdev->bdev,
bio_max_segs(block->vec_cnt - i),
REQ_OP_WRITE, GFP_KERNEL);
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
/* * Super sector should be written in-order, otherwise the * nr_entries could be rewritten incorrectly by an old bio.
*/
wait_for_completion_io(&lc->super_done);
/* * Apparently the size of the device may not be known * right away, so handle this properly.
*/ if (!lc->end_sector)
lc->end_sector = logdev_last_sector(lc); if (lc->end_sector &&
lc->next_sector >= lc->end_sector) {
DMERR("Ran out of space on the logdev");
lc->logging_enabled = false; goto next;
}
lc->logged_entries++;
atomic_inc(&lc->io_blocks);
super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG)); if (super)
atomic_inc(&lc->io_blocks);
}
next:
logging_enabled = lc->logging_enabled;
spin_unlock_irq(&lc->blocks_lock); if (block) { if (logging_enabled) {
ret = log_one_block(lc, block, sector); if (!ret && super)
ret = log_super(lc); if (ret) {
spin_lock_irq(&lc->blocks_lock);
lc->logging_enabled = false;
spin_unlock_irq(&lc->blocks_lock);
}
} else
free_pending_block(lc, block); continue;
}
if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() &&
list_empty(&lc->logging_blocks))
schedule();
__set_current_state(TASK_RUNNING);
}
} return 0;
}
/* * next_sector is in 512b sectors to correspond to what bi_sector expects. * The super starts at sector 0, and the next_sector is the next logical * one based on the sectorsize of the device.
*/
lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
lc->logging_enabled = true;
lc->end_sector = logdev_last_sector(lc);
lc->device_supports_discard = true;
/* * This is just nice to have since it'll update the super to include the * unflushed blocks, if it fails we don't really care.
*/
log_mark(lc, "dm-log-writes-end");
wake_up_process(lc->log_kthread);
wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
!atomic_read(&lc->pending_blocks));
kthread_stop(lc->log_kthread);
/* Don't bother doing anything if logging has been disabled */ if (!lc->logging_enabled) goto map_bio;
/* * Map reads as normal.
*/ if (bio_data_dir(bio) == READ) goto map_bio;
/* No sectors and not a flush? Don't care */ if (!bio_sectors(bio) && !flush_bio) goto map_bio;
/* * Discards will have bi_size set but there's no actual data, so just * allocate the size of the pending block.
*/ if (discard_bio)
alloc_size = sizeof(struct pending_block); else
alloc_size = struct_size(block, vecs, bio_segments(bio));
if (flush_bio)
block->flags |= LOG_FLUSH_FLAG; if (fua_bio)
block->flags |= LOG_FUA_FLAG; if (discard_bio)
block->flags |= LOG_DISCARD_FLAG; if (meta_bio)
block->flags |= LOG_METADATA_FLAG;
/* We don't need the data, just submit */ if (discard_bio) {
WARN_ON(flush_bio || fua_bio); if (lc->device_supports_discard) goto map_bio;
bio_endio(bio); return DM_MAPIO_SUBMITTED;
}
/* Flush bio, splice the unflushed blocks onto this list and submit */ if (flush_bio && !bio_sectors(bio)) {
spin_lock_irq(&lc->blocks_lock);
list_splice_init(&lc->unflushed_blocks, &block->list);
spin_unlock_irq(&lc->blocks_lock); goto map_bio;
}
/* * We will write this bio somewhere else way later so we need to copy * the actual contents into new pages so we know the data will always be * there. * * We do this because this could be a bio from O_DIRECT in which case we * can't just hold onto the page until some later point, we have to * manually copy the contents.
*/
bio_for_each_segment(bv, bio, iter) { struct page *page; void *dst;
/* Had a flush with data in it, weird */ if (flush_bio) {
spin_lock_irq(&lc->blocks_lock);
list_splice_init(&lc->unflushed_blocks, &block->list);
spin_unlock_irq(&lc->blocks_lock);
}
map_bio:
normal_map_bio(ti, bio); return DM_MAPIO_REMAPPED;
}
*bdev = dev->bdev; /* * Only pass ioctls through if the device sizes match exactly.
*/ if (ti->len != bdev_nr_sectors(dev->bdev)) return 1; return 0;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.