/** * get_lock() - Get the lock object for a slab journal block by sequence number. * @journal: vdo_slab journal to retrieve from. * @sequence_number: Sequence number of the block. * * Return: The lock object for the given sequence number.
*/ staticinlinestruct journal_lock * __must_check get_lock(struct slab_journal *journal,
sequence_number_t sequence_number)
{ return &journal->locks[sequence_number % journal->size];
}
/** * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush. * @journal: The journal to check. * * Return: true if there are no entry waiters, or if the slab is unrecovered.
*/ staticinlinebool __must_check must_make_entries_to_flush(struct slab_journal *journal)
{ return ((journal->slab->status != VDO_SLAB_REBUILDING) &&
vdo_waitq_has_waiters(&journal->entry_waiters));
}
/** * is_reaping() - Check whether a reap is currently in progress. * @journal: The journal which may be reaping. * * Return: true if the journal is reaping.
*/ staticinlinebool __must_check is_reaping(struct slab_journal *journal)
{ return (journal->head != journal->unreapable);
}
/** * initialize_tail_block() - Initialize tail block as a new block. * @journal: The journal whose tail block is being initialized.
*/ staticvoid initialize_tail_block(struct slab_journal *journal)
{ struct slab_journal_block_header *header = &journal->tail_header;
/**
 * initialize_journal_state() - Set all journal fields appropriately to start journaling.
 * @journal: The journal to be reset, based on its tail sequence number.
 *
 * The unreapable point (and its lock) are taken from the current head, while the commit and
 * summary points are taken from the current tail. The tail block is then initialized.
 */
static void initialize_journal_state(struct slab_journal *journal)
{
	journal->unreapable = journal->head;
	journal->reap_lock = get_lock(journal, journal->unreapable);
	journal->next_commit = journal->tail;
	journal->last_summarized = journal->tail;
	journal->summarized = journal->tail;
	initialize_tail_block(journal);
}
/** * block_is_full() - Check whether a journal block is full. * @journal: The slab journal for the block. * * Return: true if the tail block is full.
*/ staticbool __must_check block_is_full(struct slab_journal *journal)
{
journal_entry_count_t count = journal->tail_header.entry_count;
/** * is_slab_journal_blank() - Check whether a slab's journal is blank. * * A slab journal is blank if it has never had any entries recorded in it. * * Return: true if the slab's journal has never been modified.
*/ staticbool is_slab_journal_blank(conststruct vdo_slab *slab)
{ return ((slab->journal.tail == 1) &&
(slab->journal.tail_header.entry_count == 0));
}
/** * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct * order. * @journal: The journal to be marked dirty. * @lock: The recovery journal lock held by the slab journal.
*/ staticvoid mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
{ struct slab_journal *dirty_journal; struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
VDO_ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
/* When not suspending or recovering, the slab must be clean. */
code = vdo_get_admin_state_code(&slab->state);
read_only = vdo_is_read_only(slab->allocator->depot->vdo); if (!read_only &&
vdo_waitq_has_waiters(&slab->dirty_blocks) &&
(code != VDO_ADMIN_STATE_SUSPENDING) &&
(code != VDO_ADMIN_STATE_RECOVERING)) return;
/**
 * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be
 *                           stored in a slab_summary_entry's 7 bits that are dedicated to its free
 *                           count.
 * @depot: The depot whose summary is being updated.
 * @free_blocks: The number of free blocks.
 *
 * Note: the number of free blocks must be strictly less than 2^23 blocks, even though
 * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least
 * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might
 * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f
 * is 0, which would make it impossible to distinguish completely full from completely empty.
 *
 * Return: A fullness hint, which can be stored in 7 bits.
*/ static u8 __must_check compute_fullness_hint(struct slab_depot *depot,
block_count_t free_blocks)
{
block_count_t hint;
VDO_ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");
/** * notify_summary_waiters() - Wake all the waiters in a given queue. * @allocator: The block allocator summary which owns the queue. * @queue: The queue to notify.
*/ staticvoid notify_summary_waiters(struct block_allocator *allocator, struct vdo_wait_queue *queue)
{ int result = (vdo_is_read_only(allocator->depot->vdo) ?
VDO_READ_ONLY : VDO_SUCCESS);
/**
 * launch_write() - Write a slab summary block unless it is currently out for writing.
 * @block: The block that needs to be committed.
 *
 * The target physical block number is the depot's summary origin, plus the per-zone offset for
 * the block's allocator, plus the block's index.
 *
 * NOTE(review): no "already writing" check is visible in this body; confirm where that guard
 * is enforced.
 */
static void launch_write(struct slab_summary_block *block)
{
	struct block_allocator *allocator = block->allocator;
	struct slab_depot *depot = allocator->depot;
	physical_block_number_t pbn;

	/*
	 * Flush before writing to ensure that the slab journal tail blocks and reference updates
	 * covered by this summary update are stable. Otherwise, a subsequent recovery could
	 * encounter a slab summary update that refers to a slab journal tail block that has not
	 * actually been written. In such cases, the slab journal referenced will be treated as
	 * empty, causing any data within the slab which predates the existing recovery journal
	 * entries to be lost.
	 */
	pbn = (depot->summary_origin +
	       (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) +
	       block->index);
	vdo_submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio,
				handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH);
}
/** * update_slab_summary_entry() - Update the entry for a slab. * @slab: The slab whose entry is to be updated * @waiter: The waiter that is updating the summary. * @tail_block_offset: The offset of the slab journal's tail block. * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load. * @is_clean: Whether the slab is clean. * @free_blocks: The number of free blocks.
*/ staticvoid update_slab_summary_entry(struct vdo_slab *slab, struct vdo_waiter *waiter,
tail_block_offset_t tail_block_offset, bool load_ref_counts, bool is_clean,
block_count_t free_blocks)
{
u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK; struct block_allocator *allocator = slab->allocator; struct slab_summary_block *block = &allocator->summary_blocks[index]; int result; struct slab_summary_entry *entry;
if (vdo_is_read_only(block->vio.completion.vdo)) {
result = VDO_READ_ONLY;
waiter->callback(waiter, &result); return;
}
if (vdo_is_state_draining(&allocator->summary_state) ||
vdo_is_state_quiescent(&allocator->summary_state)) {
result = VDO_INVALID_ADMIN_STATE;
waiter->callback(waiter, &result); return;
}
/**
 * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
 *                    complete.
 * @journal: The journal to be reaped.
 *
 * Once the head has caught up with the unreapable point, try to add any pending entries and
 * then check whether the slab has finished draining.
 */
static void finish_reaping(struct slab_journal *journal)
{
	journal->head = journal->unreapable;
	add_entries(journal);
	check_if_slab_drained(journal->slab);
}
/** * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try * reaping again in case we deferred reaping due to an outstanding vio. * @completion: The flush vio.
*/ staticvoid complete_reaping(struct vdo_completion *completion)
{ struct slab_journal *journal = completion->parent;
/**
 * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
 * @journal: The slab journal.
 *
 * Does nothing if a reap is already in progress, if the slab is not in the VDO_SLAB_REBUILT
 * status, if the slab's admin state is not normal, or if the vdo is read-only. Otherwise the
 * unreapable point is advanced past every unlocked block and, if it moved, a vio is requested
 * from the allocator's pool so that a flush can be issued before the head is advanced.
 */
static void reap_slab_journal(struct slab_journal *journal)
{
	bool reaped = false;

	if (is_reaping(journal)) {
		/* We already have a reap in progress so wait for it to finish. */
		return;
	}

	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
	    !vdo_is_state_normal(&journal->slab->state) ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
		/*
		 * We must not reap in the first two cases, and there's no point in read-only mode.
		 */
		return;
	}

	/*
	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
	 * block is referenced or reap reaches the most recently written block, referenced by the
	 * slab summary, which has the sequence number just before the tail.
	 */
	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
		reaped = true;
		journal->unreapable++;
		journal->reap_lock++;
		/* The locks form a ring; wrap back to the first one past the end. */
		if (journal->reap_lock == &journal->locks[journal->size])
			journal->reap_lock = &journal->locks[0];
	}

	if (!reaped)
		return;

	/*
	 * It is never safe to reap a slab journal block without first issuing a flush, regardless
	 * of whether a user flush has been received or not. In the absence of the flush, the
	 * reference block write which released the locks allowing the slab journal to reap may not
	 * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
	 * journal block writes can be issued while previous slab summary updates have not yet been
	 * made. Even though those slab journal block writes will be ignored if the slab summary
	 * update is not persisted, they may still overwrite the to-be-reaped slab journal block
	 * resulting in a loss of reference count updates.
	 */
	journal->flush_waiter.callback = flush_for_reaping;
	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
			      &journal->flush_waiter);
}
/**
 * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
 * @journal: The slab journal.
 * @sequence_number: The journal sequence number of the referenced block.
 * @adjustment: Amount to adjust the reference counter; must be non-zero.
 *
 * A sequence number of 0 is ignored, as is any call made while the slab's status is
 * VDO_SLAB_REPLAYING. Whenever the adjusted lock count reaches zero, a reap of the slab journal
 * is attempted.
 */
static void adjust_slab_journal_block_reference(struct slab_journal *journal,
						sequence_number_t sequence_number,
						int adjustment)
{
	struct journal_lock *lock;

	if (sequence_number == 0)
		return;

	if (journal->slab->status == VDO_SLAB_REPLAYING) {
		/* Locks should not be used during offline replay. */
		return;
	}

	VDO_ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
	lock = get_lock(journal, sequence_number);
	if (adjustment < 0) {
		VDO_ASSERT_LOG_ONLY((-adjustment <= lock->count),
				    "adjustment %d of lock count %u for slab journal block %llu must not underflow",
				    adjustment, lock->count,
				    (unsigned long long) sequence_number);
	}

	lock->count += adjustment;
	if (lock->count == 0)
		reap_slab_journal(journal);
}
/** * release_journal_locks() - Callback invoked after a slab summary update completes. * @waiter: The slab summary waiter that has just been notified. * @context: The result code of the update. * * Registered in the constructor on behalf of update_tail_block_location(). * * Implements waiter_callback_fn.
*/ staticvoid release_journal_locks(struct vdo_waiter *waiter, void *context)
{
sequence_number_t first, i; struct slab_journal *journal =
container_of(waiter, struct slab_journal, slab_summary_waiter); int result = *((int *) context);
if (result != VDO_SUCCESS) { if (result != VDO_READ_ONLY) { /* * Don't bother logging what might be lots of errors if we are already in * read-only mode.
*/
vdo_log_error_strerror(result, "failed slab summary update %llu",
(unsignedlonglong) journal->summarized);
}
first = journal->last_summarized;
journal->last_summarized = journal->summarized; for (i = journal->summarized - 1; i >= first; i--) { /* * Release the lock the summarized block held on the recovery journal. (During * replay, recovery_start will always be 0.)
*/ if (journal->recovery_journal != NULL) {
zone_count_t zone_number = journal->slab->allocator->zone_number; struct journal_lock *lock = get_lock(journal, i);
/* * Release our own lock against reaping for blocks that are committed. (This * function will not change locks during replay.)
*/
adjust_slab_journal_block_reference(journal, i, -1);
}
journal->updating_slab_summary = false;
reap_slab_journal(journal);
/* Check if the slab summary needs to be updated again. */
update_tail_block_location(journal);
}
/** * update_tail_block_location() - Update the tail block location in the slab summary, if necessary. * @journal: The slab journal that is updating its tail block location.
*/ staticvoid update_tail_block_location(struct slab_journal *journal)
{
block_count_t free_block_count; struct vdo_slab *slab = journal->slab;
/* * Update slab summary as dirty. * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this * slab have been written to the layer. Therefore, indicate that the ref counts must be * loaded when the journal head has reaped past sequence number 1.
*/
update_slab_summary_entry(slab, &journal->slab_summary_waiter,
journal->summarized % journal->size,
(journal->head > 1), false, free_block_count);
}
/** * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
*/ staticvoid reopen_slab_journal(struct vdo_slab *slab)
{ struct slab_journal *journal = &slab->journal;
sequence_number_t block;
VDO_ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0, "vdo_slab journal's active block empty before reopening");
journal->head = journal->tail;
initialize_journal_state(journal);
/* Ensure no locks are spuriously held on an empty journal. */ for (block = 1; block <= journal->size; block++) {
VDO_ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0), "Scrubbed journal's block %llu is not locked",
(unsignedlonglong) block);
}
if (list_empty(&journal->uncommitted_blocks)) { /* If no blocks are outstanding, then the commit point is at the tail. */
journal->next_commit = journal->tail;
} else { /* The commit point is always the beginning of the oldest incomplete block. */
pooled = container_of(journal->uncommitted_blocks.next, struct pooled_vio, list_entry);
journal->next_commit = get_committing_sequence_number(pooled);
}
/* Copy the tail block into the vio. */
memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);
VDO_ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull"); if (unused_entries > 0) { /* * Release the per-entry locks for any unused entries in the block we are about to * write.
*/
adjust_slab_journal_block_reference(journal, header->sequence_number,
-unused_entries);
journal->partial_write_in_progress = !block_is_full(journal);
}
/* * This block won't be read in recovery until the slab summary is updated to refer to it. * The slab summary update does a flush which is sufficient to protect us from corruption * due to out of order slab journal, reference block, or block map writes.
*/
vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio,
complete_write, REQ_OP_WRITE);
/* Since the write is submitted, the tail block structure can be reused. */
journal->tail++;
initialize_tail_block(journal);
journal->waiting_to_commit = false;
/** * commit_tail() - Commit the tail block of the slab journal. * @journal: The journal whose tail block should be committed.
*/ staticvoid commit_tail(struct slab_journal *journal)
{ if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) { /* * There are no entries at the moment, but there are some waiters, so defer * initiating the flush until those entries are ready to write.
*/ return;
}
if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
journal->waiting_to_commit ||
(journal->tail_header.entry_count == 0)) { /* * There is nothing to do since the tail block is empty, or writing, or the journal * is in read-only mode.
*/ return;
}
/* * Since we are about to commit the tail block, this journal no longer needs to be on the * list of journals which the recovery journal might ask to commit.
*/
mark_slab_journal_clean(journal);
/** * encode_slab_journal_entry() - Encode a slab journal entry. * @tail_header: The unpacked header for the block. * @payload: The journal block payload to hold the entry. * @sbn: The slab block number of the entry to encode. * @operation: The type of the entry. * @increment: True if this is an increment. * * Exposed for unit tests.
*/ staticvoid encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
slab_journal_payload *payload,
slab_block_number sbn, enum journal_operation operation, bool increment)
{
journal_entry_count_t entry_number = tail_header->entry_count++;
if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { if (!tail_header->has_block_map_increments) {
memset(payload->full_entries.entry_types, 0,
VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
tail_header->has_block_map_increments = true;
}
/** * expand_journal_point() - Convert a recovery journal journal_point which refers to both an * increment and a decrement to a single point which refers to one or the * other. * @recovery_point: The journal point to convert. * @increment: Whether the current entry is an increment. * * Return: The expanded journal point * * Because each data_vio has but a single recovery journal point, but may need to make both * increment and decrement entries in the same slab journal. In order to distinguish the two * entries, the entry count of the expanded journal point is twice the actual recovery journal * entry count for increments, and one more than that for decrements.
*/ staticstruct journal_point expand_journal_point(struct journal_point recovery_point, bool increment)
{
recovery_point.entry_count *= 2; if (!increment)
recovery_point.entry_count++;
return recovery_point;
}
/** * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a * block becomes full. * @journal: The slab journal to append to. * @pbn: The pbn being adjusted. * @operation: The type of entry to make. * @increment: True if this is an increment. * @recovery_point: The expanded recovery point. * * This function is synchronous.
*/ staticvoid add_entry(struct slab_journal *journal, physical_block_number_t pbn, enum journal_operation operation, bool increment, struct journal_point recovery_point)
{ struct packed_slab_journal_block *block = journal->block; int result;
result = VDO_ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
&recovery_point), "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
(unsignedlonglong) recovery_point.sequence_number,
recovery_point.entry_count,
(unsignedlonglong) journal->tail_header.recovery_point.sequence_number,
journal->tail_header.recovery_point.entry_count); if (result != VDO_SUCCESS) {
vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); return;
}
if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
result = VDO_ASSERT((journal->tail_header.entry_count <
journal->full_entries_per_block), "block has room for full entries"); if (result != VDO_SUCCESS) {
vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo,
result); return;
}
}
/** * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal. * @slab: The slab to play into. * @pbn: The PBN for the entry. * @operation: The type of entry to add. * @increment: True if this entry is an increment. * @recovery_point: The recovery journal point corresponding to this entry. * @parent: The completion to notify when there is space to add the entry if the entry could not be * added immediately. * * Return: true if the entry was added immediately.
*/ bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn, enum journal_operation operation, bool increment, struct journal_point *recovery_point, struct vdo_completion *parent)
{ struct slab_journal *journal = &slab->journal; struct slab_journal_block_header *header = &journal->tail_header; struct journal_point expanded = expand_journal_point(*recovery_point, increment);
/* Only accept entries after the current recovery point. */ if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded)) returntrue;
if ((header->entry_count >= journal->full_entries_per_block) &&
(header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) { /* * The tail block does not have room for the entry we are attempting to add so * commit the tail block now.
*/
commit_tail(journal);
}
if (journal->waiting_to_commit) {
vdo_start_operation_with_waiter(&journal->slab->state,
VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
parent, NULL); returnfalse;
}
if (journal_length(journal) >= journal->size) { /* * We must have reaped the current head before the crash, since the blocked * threshold keeps us from having more entries than fit in a slab journal; hence we * can just advance the head (and unreapable block), as needed.
*/
journal->head++;
journal->unreapable++;
}
if (journal->slab->status == VDO_SLAB_REBUILT)
journal->slab->status = VDO_SLAB_REPLAYING;
/** * requires_reaping() - Check whether the journal must be reaped before adding new entries. * @journal: The journal to check. * * Return: true if the journal must be reaped.
*/ staticbool requires_reaping(conststruct slab_journal *journal)
{ return (journal_length(journal) >= journal->blocking_threshold);
}
/** finish_summary_update() - A waiter callback that resets the writing state of a slab. */ staticvoid finish_summary_update(struct vdo_waiter *waiter, void *context)
{ struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter); int result = *((int *) context);
slab->active_count--;
if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
vdo_log_error_strerror(result, "failed to update slab summary");
vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
}
/** * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring * a VIO for it from the pool. * @waiter: The waiter of the block which is starting to write. * @context: The parent slab of the block. * * This can be asynchronous since the writer will have to wait if all VIOs in the pool are * currently in use.
*/ staticvoid launch_reference_block_write(struct vdo_waiter *waiter, void *context)
{ struct vdo_slab *slab = context;
if (vdo_is_read_only(slab->allocator->depot->vdo)) return;
/** * finish_reference_block_write() - After a reference block has written, clean it, release its * locks, and return its VIO to the pool. * @completion: The VIO that just finished writing.
*/ staticvoid finish_reference_block_write(struct vdo_completion *completion)
{ struct vio *vio = as_vio(completion); struct pooled_vio *pooled = vio_as_pooled_vio(vio); struct reference_block *block = completion->parent; struct vdo_slab *slab = block->slab;
tail_block_offset_t offset;
/* * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause * us to be dirtied again, but we don't want to double enqueue.
*/
block->is_writing = false;
if (vdo_is_read_only(completion->vdo)) {
check_if_slab_drained(slab); return;
}
/* Re-queue the block if it was re-dirtied while it was writing. */ if (block->is_dirty) {
vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter); if (vdo_is_state_draining(&slab->state)) { /* We must be saving, and this block will otherwise not be relaunched. */
save_dirty_reference_blocks(slab);
}
return;
}
/* * Mark the slab as clean in the slab summary if there are no dirty or writing blocks * and no summary update in progress.
*/ if ((slab->active_count > 0) || vdo_waitq_has_waiters(&slab->dirty_blocks)) {
check_if_slab_drained(slab); return;
}
/** * get_reference_counters_for_block() - Find the reference counters for a given block. * @block: The reference_block in question. * * Return: A pointer to the reference counters for this block.
*/ static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
{
size_t block_index = block - block->slab->reference_blocks;
/** * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out. * @block: The block to copy. * @buffer: The char buffer to fill with the packed block.
*/ staticvoid pack_reference_block(struct reference_block *block, void *buffer)
{ struct packed_reference_block *packed = buffer;
vdo_refcount_t *counters = get_reference_counters_for_block(block);
sector_count_t i; struct packed_journal_point commit_point;
/** * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy * its counters and associated data into the VIO, and launch the write. * @waiter: The waiter of the dirty block. * @context: The VIO returned by the pool.
*/ staticvoid write_reference_block(struct vdo_waiter *waiter, void *context)
{
size_t block_offset;
physical_block_number_t pbn; struct pooled_vio *pooled = context; struct vdo_completion *completion = &pooled->vio.completion; struct reference_block *block = container_of(waiter, struct reference_block,
waiter);
/* * Mark the block as clean, since we won't be committing any updates that happen after this * moment. As long as VIO order is preserved, two VIOs updating this block at once will not * cause complications.
*/
block->is_dirty = false;
/* * Flush before writing to ensure that the recovery journal and slab journal entries which * cover this reference update are stable. This prevents data corruption that can be caused * by out of order writes.
*/
WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
block->slab->allocator->ref_counts_statistics.blocks_written + 1);
if ((length < journal->flushing_threshold) || (write_count == 0)) return;
/* The slab journal is over the first threshold, schedule some reference block writes. */
WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1); if (length < journal->flushing_deadline) { /* Schedule more writes the closer to the deadline we get. */
write_count /= journal->flushing_deadline - length + 1;
write_count = max_t(block_count_t, write_count, 1);
}
for (written = 0; written < write_count; written++) {
vdo_waitq_notify_next_waiter(&slab->dirty_blocks,
launch_reference_block_write, slab);
}
}
/** * reference_count_to_status() - Convert a reference count to a reference status. * @count: The count to convert. * * Return: The appropriate reference status.
*/ staticenum reference_status __must_check reference_count_to_status(vdo_refcount_t count)
{ if (count == EMPTY_REFERENCE_COUNT) return RS_FREE; elseif (count == 1) return RS_SINGLE; elseif (count == PROVISIONAL_REFERENCE_COUNT) return RS_PROVISIONAL; else return RS_SHARED;
}
/**
 * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue
 *                 if it wasn't already dirty.
 * @block: The reference block to mark as dirty.
 *
 * A block which is already dirty is left untouched. A block which is currently being written
 * is marked dirty but is not enqueued here.
 */
static void dirty_block(struct reference_block *block)
{
	if (block->is_dirty)
		return;

	block->is_dirty = true;
	if (!block->is_writing)
		vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
}
/** * get_reference_block() - Get the reference block that covers the given block index.
*/ staticstruct reference_block * __must_check get_reference_block(struct vdo_slab *slab,
slab_block_number index)
{ return &slab->reference_blocks[index / COUNTS_PER_BLOCK];
}
/** * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical * block number. * @slab: The slab. * @pbn: The physical block number. * @slab_block_number_ptr: A pointer to the slab block number. * * Return: VDO_SUCCESS or an error code.
*/ staticint __must_check slab_block_number_from_pbn(struct vdo_slab *slab,
physical_block_number_t pbn,
slab_block_number *slab_block_number_ptr)
{
u64 slab_block_number;
/** * get_reference_counter() - Get the reference counter that covers the given physical block number. * @slab: The slab to query. * @pbn: The physical block number. * @counter_ptr: A pointer to the reference counter.
*/ staticint __must_check get_reference_counter(struct vdo_slab *slab,
physical_block_number_t pbn,
vdo_refcount_t **counter_ptr)
{
slab_block_number index; int result = slab_block_number_from_pbn(slab, pbn, &index);
/* * Wholly full slabs must be the only ones with lowest priority, 0. * * Slabs that have never been opened (empty, newly initialized, and never been written to) * have lower priority than previously opened slabs that have a significant number of free * blocks. This ranking causes VDO to avoid writing physical blocks for the first time * unless there are very few free blocks that have been previously written to. * * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO * a better client of any underlying storage that is thinly-provisioned (though discarding * would be better). * * For all other slabs, the priority is derived from the logarithm of the number of free * blocks. Slabs with the same order of magnitude of free blocks have the same priority. * With 2^23 blocks, the priority will range from 1 to 25. The reserved * unopened_slab_priority divides the range and is skipped by the logarithmic mapping.
*/
if (free_blocks == 0) return 0;
if (is_slab_journal_blank(slab)) return unopened_slab_priority;
/*
 * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab
 * so slabs with lots of free blocks will be opened for allocation before slabs that have few free
 * blocks.
 *
 * The slab's priority is recomputed and the slab is enqueued in its allocator's priority table.
 * The slab must not already be on a list.
 */
static void prioritize_slab(struct vdo_slab *slab)
{
	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
			    "a slab must not already be on a list when prioritizing");
	slab->priority = calculate_slab_priority(slab);
	vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
				   slab->priority, &slab->allocq_entry);
}
/**
 * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab.
 * @slab: The slab whose free block count changed.
 * @incremented: true if the free block count went up.
 *
 * NOTE(review): @incremented is not read anywhere in this body; confirm whether an allocator
 * counter update that depends on it is expected here.
 */
static void adjust_free_block_count(struct vdo_slab *slab, bool incremented)
{
	struct block_allocator *allocator = slab->allocator;

	/* The open slab doesn't need to be reprioritized until it is closed. */
	if (slab == allocator->open_slab)
		return;

	/* Don't bother adjusting the priority table if unneeded. */
	if (slab->priority == calculate_slab_priority(slab))
		return;

	/*
	 * Reprioritize the slab to reflect the new free block count by removing it from the table
	 * and re-enqueuing it with the new priority.
	 */
	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
	prioritize_slab(slab);
}
/**
 * increment_for_data() - Increment the reference count for a data block.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @old_status: The reference status of the data block before this increment.
 * @lock: The pbn_lock associated with this increment (may be NULL).
 * @counter_ptr: A pointer to the count for the data block (in, out).
 * @adjust_block_count: Whether to update the allocator's free block count.
 *
 * A free block becomes singly referenced and is counted as allocated; a provisionally
 * referenced block becomes singly referenced; any other block has its count bumped unless it
 * is already at MAXIMUM_REFERENCE_COUNT. On success, any provisional reference held by @lock
 * is released.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int increment_for_data(struct vdo_slab *slab, struct reference_block *block,
			      slab_block_number block_number,
			      enum reference_status old_status,
			      struct pbn_lock *lock, vdo_refcount_t *counter_ptr,
			      bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		*counter_ptr = 1;
		block->allocated_count++;
		slab->free_blocks--;
		if (adjust_block_count)
			adjust_free_block_count(slab, false);

		break;

	case RS_PROVISIONAL:
		*counter_ptr = 1;
		break;

	default:
		/* Single or shared */
		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) {
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Incrementing a block already having 254 references (slab %u, offset %u)",
						      slab->slab_number, block_number);
		}
		(*counter_ptr)++;
	}

	if (lock != NULL)
		vdo_unassign_pbn_lock_provisional_reference(lock);

	return VDO_SUCCESS;
}
/**
 * decrement_for_data() - Decrement the reference count for a data block.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @old_status: The reference status of the data block before this decrement.
 * @updater: The reference updater doing this operation in case we need to look up the pbn lock.
 * @counter_ptr: A pointer to the count for the data block (in, out).
 * @adjust_block_count: Whether to update the allocator's free block count.
 *
 * Decrementing a free block is an error. A provisionally or singly referenced block becomes
 * free unless a pbn lock is found for it, in which case it is left with a provisional
 * reference assigned to that lock. A shared block simply has its count reduced by one.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block,
			      slab_block_number block_number,
			      enum reference_status old_status,
			      struct reference_updater *updater,
			      vdo_refcount_t *counter_ptr, bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
					      "Decrementing free block at offset %u in slab %u",
					      block_number, slab->slab_number);

	case RS_PROVISIONAL:
	case RS_SINGLE:
		if (updater->zpbn.zone != NULL) {
			struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone,
									       updater->zpbn.pbn);

			if (lock != NULL) {
				/*
				 * There is a read lock on this block, so the block must not become
				 * unreferenced.
				 */
				*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
				vdo_assign_pbn_lock_provisional_reference(lock);
				break;
			}
		}

		*counter_ptr = EMPTY_REFERENCE_COUNT;
		block->allocated_count--;
		slab->free_blocks++;
		if (adjust_block_count)
			adjust_free_block_count(slab, true);

		break;

	default:
		/* Shared */
		(*counter_ptr)--;
	}

	return VDO_SUCCESS;
}
/**
 * increment_for_block_map() - Increment the reference count for a block map page.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @old_status: The reference status of the block before this increment.
 * @lock: The pbn_lock associated with this increment (may be NULL).
 * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
 * @counter_ptr: A pointer to the count for the block (in, out).
 * @adjust_block_count: Whether to update the allocator's free block count.
 *
 * In normal operation, all block map increments should be from provisional to
 * MAXIMUM_REFERENCE_COUNT. Outside of normal operation, the only accepted transition is from
 * free to MAXIMUM_REFERENCE_COUNT. Since block map blocks never dedupe they should never be
 * adjusted from any other state. The adjustment always results in MAXIMUM_REFERENCE_COUNT as
 * this value is used to prevent dedupe against block map blocks.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block,
				   slab_block_number block_number,
				   enum reference_status old_status,
				   struct pbn_lock *lock, bool normal_operation,
				   vdo_refcount_t *counter_ptr, bool adjust_block_count)
{
	switch (old_status) {
	case RS_FREE:
		if (normal_operation) {
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Incrementing unallocated block map block (slab %u, offset %u)",
						      slab->slab_number, block_number);
		}

		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
		block->allocated_count++;
		slab->free_blocks--;
		if (adjust_block_count)
			adjust_free_block_count(slab, false);

		return VDO_SUCCESS;

	case RS_PROVISIONAL:
		if (!normal_operation)
			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
						      "Block map block had provisional reference during replay (slab %u, offset %u)",
						      slab->slab_number, block_number);

		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
		if (lock != NULL)
			vdo_unassign_pbn_lock_provisional_reference(lock);

		return VDO_SUCCESS;

	default:
		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
					      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
					      *counter_ptr, slab->slab_number,
					      block_number);
	}
}
/**
 * update_reference_count() - Update the reference count of a block.
 * @slab: The slab which owns the block.
 * @block: The reference block which contains the block being updated.
 * @block_number: The block to update.
 * @slab_journal_point: The slab journal point at which this update is journaled.
 * @updater: The reference updater.
 * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
 * @adjust_block_count: Whether to update the slab's free block count.
 * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement
 *                             of a provisional reference.
 *
 * Dispatches to decrement_for_data(), increment_for_data(), or increment_for_block_map()
 * depending on @updater. A successful decrement of a provisional reference returns immediately
 * after setting *@provisional_decrement_ptr (when that pointer is not NULL). Otherwise, a valid
 * @slab_journal_point is recorded as the slab's journal point before the block's lock state is
 * examined.
 *
 * NOTE(review): in this copy of the file the braces of this body do not balance and entry_lock
 * is never used, so some lines appear to be missing; the keywords in the declaration are also
 * fused. Confirm against the reference source before changing the code.
 *
 * Return: VDO_SUCCESS or an error.
 */ staticint update_reference_count(struct vdo_slab *slab, struct reference_block *block,
slab_block_number block_number, conststruct journal_point *slab_journal_point, struct reference_updater *updater, bool normal_operation, bool adjust_block_count, bool *provisional_decrement_ptr)
{
vdo_refcount_t *counter_ptr = &slab->counters[block_number]; enum reference_status old_status = reference_count_to_status(*counter_ptr); int result;
if (!updater->increment) {
result = decrement_for_data(slab, block, block_number, old_status,
updater, counter_ptr, adjust_block_count); if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) { if (provisional_decrement_ptr != NULL)
*provisional_decrement_ptr = true; return VDO_SUCCESS;
}
} elseif (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
result = increment_for_data(slab, block, block_number, old_status,
updater->lock, counter_ptr, adjust_block_count);
} else {
result = increment_for_block_map(slab, block, block_number, old_status,
updater->lock, normal_operation,
counter_ptr, adjust_block_count);
}
if (result != VDO_SUCCESS) return result;
if (is_valid_journal_point(slab_journal_point))
slab->slab_journal_point = *slab_journal_point;
if (block->is_dirty && (block->slab_journal_lock > 0)) {
sequence_number_t entry_lock = slab_journal_point->sequence_number; /* * This block is already dirty and a slab journal entry has been made for it since * the last time it was clean. We must release the per-entry slab journal lock for * the entry associated with the update we are now doing.
*/
result = VDO_ASSERT(is_valid_journal_point(slab_journal_point), "Reference count adjustments need slab journal points."); if (result != VDO_SUCCESS) return result;
/* * This may be the first time we are applying an update for which there is a slab journal * entry to this block since the block was cleaned. Therefore, we convert the per-entry * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock.
*/ if (is_valid_journal_point(slab_journal_point))
block->slab_journal_lock = slab_journal_point->sequence_number; else
block->slab_journal_lock = 0;
dirty_block(block); return VDO_SUCCESS;
}
/**
 * add_entry_from_waiter() - Add an entry to the slab journal.
 * @waiter: The vio which should make an entry now.
 * @context: The slab journal to make an entry in.
 *
 * This callback is invoked by add_entries() once it has determined that we are ready to make
 * another entry in the slab journal. Implements waiter_callback_fn.
 *
 * The reference_updater is recovered from @waiter with container_of(), and the journal point for
 * the new entry is taken from the current tail header. Once the entry has been handled, the
 * data_vio is continued with the result: through continue_data_vio_with_error() for an
 * increment, or through its decrement_completion otherwise.
 *
 * NOTE(review): in this copy of the file zone_number is never used and the braces do not
 * balance, so some lines of this body appear to be missing. Confirm against the reference
 * source before changing the code.
 */ staticvoid add_entry_from_waiter(struct vdo_waiter *waiter, void *context)
{ int result; struct reference_updater *updater =
container_of(waiter, struct reference_updater, waiter); struct data_vio *data_vio = data_vio_from_reference_updater(updater); struct slab_journal *journal = context; struct slab_journal_block_header *header = &journal->tail_header; struct journal_point slab_journal_point = {
.sequence_number = header->sequence_number,
.entry_count = header->entry_count,
};
sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;
if (header->entry_count == 0) { /* * This is the first entry in the current tail block, so get a lock on the recovery * journal which we will hold until this tail block is committed.
*/
get_lock(journal, header->sequence_number)->recovery_start = recovery_block; if (journal->recovery_journal != NULL) {
zone_count_t zone_number = journal->slab->allocator->zone_number;
if (journal->slab->status != VDO_SLAB_REBUILT) { /* * If the slab is unrecovered, scrubbing will take care of the count since the * update is now recorded in the journal.
*/
adjust_slab_journal_block_reference(journal,
slab_journal_point.sequence_number, -1);
result = VDO_SUCCESS;
} else { /* Now that an entry has been made in the slab journal, update the counter. */
result = adjust_reference_count(journal->slab, updater,
&slab_journal_point);
}
if (updater->increment)
continue_data_vio_with_error(data_vio, result); else
vdo_continue_completion(&data_vio->decrement_completion, result);
}
/**
 * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map
 *                                         increment.
 * @journal: The journal.
 *
 * Looks at the first waiter on the journal's entry_waiters queue and recovers its
 * reference_updater with container_of().
 *
 * NOTE(review): the return statement and closing brace are not present in this copy of the
 * file; the body appears truncated.
 *
 * Return: true if the first entry waiter's operation is a block map increment.
 */ staticinlinebool is_next_entry_a_block_map_increment(struct slab_journal *journal)
{ struct vdo_waiter *waiter = vdo_waitq_get_first_waiter(&journal->entry_waiters); struct reference_updater *updater =
container_of(waiter, struct reference_updater, waiter);
/**
 * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries.
 * @journal: The journal to which entries may be added.
 *
 * By processing the queue in order, we ensure that slab journal entries are made in the same order
 * as recovery journal entries for the same increment or decrement.
 *
 * The visible code stops adding entries when a partial write is in progress, the slab is
 * rebuilding, the journal is waiting to commit, or the journal requires reaping. Afterwards the
 * tail is committed if the slab is draining (but not suspending) and no entry waiters remain.
 *
 * NOTE(review): in this copy of the file the loop header and the declarations of header, lock,
 * slab and i are not present, and the braces do not balance; the body appears to be missing
 * lines. Confirm against the reference source before changing the code.
 */ staticvoid add_entries(struct slab_journal *journal)
{ if (journal->adding_entries) { /* Protect against re-entrancy. */ return;
}
if (journal->partial_write_in_progress ||
(journal->slab->status == VDO_SLAB_REBUILDING)) { /* * Don't add entries while rebuilding or while a partial write is * outstanding, as it could result in reference count corruption.
*/ break;
}
if (journal->waiting_to_commit) { /* * If we are waiting for resources to write the tail block, and the tail * block is full, we can't make another entry.
*/
WRITE_ONCE(journal->events->tail_busy_count,
journal->events->tail_busy_count + 1); break;
} elseif (is_next_entry_a_block_map_increment(journal) &&
(header->entry_count >= journal->full_entries_per_block)) { /* * The tail block does not have room for a block map increment, so commit * it now.
*/
commit_tail(journal); if (journal->waiting_to_commit) {
WRITE_ONCE(journal->events->tail_busy_count,
journal->events->tail_busy_count + 1); break;
}
}
/* If the slab is over the blocking threshold, make the vio wait. */ if (requires_reaping(journal)) {
WRITE_ONCE(journal->events->blocked_count,
journal->events->blocked_count + 1);
save_dirty_reference_blocks(journal->slab); break;
}
/* * Check if the on disk slab journal is full. Because of the blocking and * scrubbing thresholds, this should never happen.
*/ if (lock->count > 0) {
VDO_ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, "New block has locks, but journal is not full");
/* * The blocking threshold must let the journal fill up if the new * block has locks; if the blocking threshold is smaller than the * journal size, the new block cannot possibly have locks already.
*/
VDO_ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size), "New block can have locks already iff blocking threshold is at the end of the journal");
/* * Don't allow the new block to be reaped until all of the reference count * blocks are written and the journal block has been fully committed as * well.
*/
lock->count = journal->entries_per_block + 1;
/* * This is the first entry in this slab journal, ever. Dirty all of * the reference count blocks. Each will acquire a lock on the tail * block so that the journal won't be reaped until the reference * counts are initialized. The lock acquisition must be done by the * ref_counts since here we don't know how many reference blocks * the ref_counts has.
*/ for (i = 0; i < slab->reference_block_count; i++) {
slab->reference_blocks[i].slab_journal_lock = 1;
dirty_block(&slab->reference_blocks[i]);
}
/* If there are no waiters, and we are flushing or saving, commit the tail block. */ if (vdo_is_state_draining(&journal->slab->state) &&
!vdo_is_state_suspending(&journal->slab->state) &&
!vdo_waitq_has_waiters(&journal->entry_waiters))
commit_tail(journal);
}
/**
 * reset_search_cursor() - Reset the free block search back to the first reference counter in the
 *                         first reference block of a slab.
 * @slab: The slab whose search cursor is to be reset.
 *
 * After the reset the cursor covers counter indexes from 0 up to the smaller of
 * COUNTS_PER_BLOCK and the slab's block count.
 */
static void reset_search_cursor(struct vdo_slab *slab)
{
	struct search_cursor *cursor = &slab->search_cursor;

	cursor->block = cursor->first_block;
	cursor->index = 0;
	/* Unit tests have slabs with only one reference block (and it's a runt). */
	cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
}
/**
 * advance_search_cursor() - Advance the search cursor to the start of the next reference block in
 *                           a slab.
 * @slab: The slab whose search cursor is to be advanced.
 *
 * Wraps around to the first reference block if the current block is the last reference block.
 *
 * Return: true unless the cursor was at the last reference block.
 */
static bool advance_search_cursor(struct vdo_slab *slab)
{
	struct search_cursor *cursor = &slab->search_cursor;

	/*
	 * If we just finished searching the last reference block, then wrap back around to the
	 * start of the array.
	 */
	if (cursor->block == cursor->last_block) {
		reset_search_cursor(slab);
		return false;
	}

	/* We're not already at the end, so advance the cursor to the next block. */
	cursor->block++;
	cursor->index = cursor->end_index;

	if (cursor->block == cursor->last_block) {
		/* The last reference block will usually be a runt. */
		cursor->end_index = slab->block_count;
	} else {
		cursor->end_index += COUNTS_PER_BLOCK;
	}

	return true;
}
/**
 * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild.
 * @depot: The slab depot used to look up the slab containing the block.
 * @pbn: The physical block number whose reference count is to be incremented.
 * @operation: The journal operation recorded in the reference updater for this increment.
 *
 * The update is always an increment, is applied with no slab journal point, and does not adjust
 * the free block count. On success the affected reference block is marked dirty.
 *
 * Return: VDO_SUCCESS or an error.
 */
int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
					   physical_block_number_t pbn,
					   enum journal_operation operation)
{
	struct vdo_slab *owner = vdo_get_slab(depot, pbn);
	struct reference_updater rebuild_update = {
		.operation = operation,
		.increment = true,
	};
	struct reference_block *ref_block;
	slab_block_number sbn;
	int status;

	status = slab_block_number_from_pbn(owner, pbn, &sbn);
	if (status != VDO_SUCCESS)
		return status;

	ref_block = get_reference_block(owner, sbn);
	status = update_reference_count(owner, ref_block, sbn, NULL,
					&rebuild_update, !NORMAL_OPERATION, false, NULL);
	if (status == VDO_SUCCESS)
		dirty_block(ref_block);

	return status;
}
/** * replay_reference_count_change() - Replay the reference count adjustment from a slab journal * entry into the reference count for a block. * @slab: The slab. * @entry_point: The slab journal point for the entry. * @entry: The slab journal entry being replayed. * * The adjustment will be ignored if it was already recorded in the reference count. * * Return: VDO_SUCCESS or an error code.
*/ staticint replay_reference_count_change(struct vdo_slab *slab, conststruct journal_point *entry_point, struct slab_journal_entry entry)
{ int result; struct reference_block *block = get_reference_block(slab, entry.sbn);
sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; struct reference_updater updater = {
.operation = entry.operation,
.increment = entry.increment,
};
if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) { /* This entry is already reflected in the existing counts, so do nothing. */ return VDO_SUCCESS;
}
/* This entry is not yet counted in the reference counts. */
result = update_reference_count(slab, block, entry.sbn, entry_point,
&updater, !NORMAL_OPERATION, false, NULL); if (result != VDO_SUCCESS) return result;
dirty_block(block); return VDO_SUCCESS;
}
/** * find_zero_byte_in_word() - Find the array index of the first zero byte in word-sized range of * reference counters. * @word_ptr: A pointer to the eight counter bytes to check. * @start_index: The array index corresponding to word_ptr[0]. * @fail_index: The array index to return if no zero byte is found. * * The search does no bounds checking; the function relies on the array being sufficiently padded. * * Return: The array index of the first zero byte in the word, or the value passed as fail_index if * no zero byte was found.
*/ staticinline slab_block_number find_zero_byte_in_word(const u8 *word_ptr,
slab_block_number start_index,
slab_block_number fail_index)
{
u64 word = get_unaligned_le64(word_ptr);
/* This looks like a loop, but GCC will unroll the eight iterations for us. */ unsignedint offset;
for (offset = 0; offset < BYTES_PER_WORD; offset++) { /* Assumes little-endian byte order, which we have on X86. */ if ((word & 0xFF) == 0) return (start_index + offset);
word >>= 8;
}
return fail_index;
}
/**
 * find_free_block() - Find the first block with a reference count of zero in the specified
 *                     range of reference counter indexes.
 * @slab: The slab counters to scan.
 * @index_ptr: A pointer to hold the array index of the free block.
 *
 * The range searched runs from the slab's search cursor index up to (but not including) the
 * cursor's end_index. A zero byte found at or beyond end_index is not reported.
 *
 * Exposed for unit testing.
 *
 * NOTE(review): the end of the while loop and of the function are not present in this copy of
 * the file; the body appears truncated.
 *
 * Return: true if a free block was found in the specified range.
 */ staticbool find_free_block(conststruct vdo_slab *slab, slab_block_number *index_ptr)
{
slab_block_number zero_index;
slab_block_number next_index = slab->search_cursor.index;
slab_block_number end_index = slab->search_cursor.end_index;
u8 *next_counter = &slab->counters[next_index];
u8 *end_counter = &slab->counters[end_index];
/* * Search every byte of the first unaligned word. (Array is padded so reading past end is * safe.)
*/
zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); if (zero_index < end_index) {
*index_ptr = zero_index; returntrue;
}
/* * On architectures where unaligned word access is expensive, this would be a good place to * advance to an alignment boundary.
*/
next_index += BYTES_PER_WORD;
next_counter += BYTES_PER_WORD;
/* * Now we're word-aligned; check a word at a time until we find a word containing a zero. * (Array is padded so reading past end is safe.)
*/ while (next_counter < end_counter) { /* * The following code is currently an exact copy of the code preceding the loop, * but if you try to merge them by using a do loop, it runs slower because a jump * instruction gets added at the start of the iteration.
*/
zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); if (zero_index < end_index) {
*index_ptr = zero_index; returntrue;
}
/**
 * search_current_reference_block() - Search the reference block currently saved in the search
 *                                    cursor for a reference count of zero, starting at the saved
 *                                    counter index.
 * @slab: The slab to search.
 * @free_index_ptr: A pointer to receive the array index of the zero reference count.
 *
 * Return: true if an unreferenced counter was found.
 */
static bool search_current_reference_block(const struct vdo_slab *slab,
					   slab_block_number *free_index_ptr)
{
	/* Don't bother searching if the current block is known to be full. */
	return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) &&
		find_free_block(slab, free_index_ptr));
}
/**
 * search_reference_blocks() - Search each reference block for a reference count of zero.
 * @slab: The slab to search.
 * @free_index_ptr: A pointer to receive the array index of the zero reference count.
 *
 * Searches each reference block for a reference count of zero, starting at the reference block and
 * counter index saved in the search cursor and searching up to the end of the last reference
 * block. The search does not wrap.
 *
 * Return: true if an unreferenced counter was found.
 */
static bool search_reference_blocks(struct vdo_slab *slab,
				    slab_block_number *free_index_ptr)
{
	/* Start searching at the saved search position in the current block. */
	if (search_current_reference_block(slab, free_index_ptr))
		return true;

	/* Search each reference block up to the end of the slab. */
	while (advance_search_cursor(slab)) {
		if (search_current_reference_block(slab, free_index_ptr))
			return true;
	}

	return false;
}
/**
 * make_provisional_reference() - Do the bookkeeping for making a provisional reference.
 * @slab: The slab which owns the block.
 * @block_number: The slab-relative number of the block being provisionally referenced.
 *
 * Sets the block's counter to PROVISIONAL_REFERENCE_COUNT and accounts for the allocation in
 * both the reference block and the slab. The previous counter value is not checked here.
 */
static void make_provisional_reference(struct vdo_slab *slab,
				       slab_block_number block_number)
{
	struct reference_block *block = get_reference_block(slab, block_number);

	/*
	 * Make the initial transition from an unreferenced block to a
	 * provisionally allocated block.
	 */
	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;

	/* Account for the allocation. */
	block->allocated_count++;
	slab->free_blocks--;
}
/**
 * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty.
 * @slab: The slab whose reference blocks are to be dirtied.
 */
static void dirty_all_reference_blocks(struct vdo_slab *slab)
{
	block_count_t i;

	for (i = 0; i < slab->reference_block_count; i++)
		dirty_block(&slab->reference_blocks[i]);
}
/** * match_bytes() - Check an 8-byte word for bytes matching the value specified * @input: A word to examine the bytes of * @match: The byte value sought * * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise
*/ staticinline u64 match_bytes(u64 input, u8 match)
{
u64 temp = input ^ (match * 0x0101010101010101ULL); /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */
u64 test_top_bits = ~temp & 0x8080808080808080ULL; /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */
u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); /* return 1 when both tests indicate temp byte is 0 */ return (test_top_bits & test_low_bits) >> 7;
}
/**
 * count_valid_references() - Process a newly loaded refcount array.
 * @counters: The array of counters from a metadata block.
 *
 * Scan an 8-byte-aligned array of counters, fixing up any "provisional" values that weren't
 * cleaned up at shutdown, changing them internally to "empty".
 *
 * Return: The number of blocks that are referenced (counters not "empty").
 */
static unsigned int count_valid_references(vdo_refcount_t *counters)
{
	u64 *words = (u64 *)counters;
	/* It's easier to count occurrences of a specific byte than its absences. */
	unsigned int empty_count = 0;
	/* For speed, we process 8 bytes at once. */
	unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64);

	/*
	 * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter
	 * array is a multiple of the word size.
	 */
	BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1);
	BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0);

	while (words_left > 0) {
		/*
		 * This is used effectively as 8 byte-size counters. Byte 0 counts how many words
		 * had the target value found in byte 0, etc. We just have to avoid overflow.
		 */
		u64 split_count = 0;
		/*
		 * The counter "% 255" trick used below to fold split_count into empty_count
		 * imposes a limit of 254 bytes examined each iteration of the outer loop. We
		 * process a word at a time, so that limit gets rounded down to 31 u64 words.
		 */
		const unsigned int max_words_per_iteration = 254 / sizeof(u64);
		unsigned int iter_words_left = min_t(unsigned int, words_left,
						     max_words_per_iteration);

		words_left -= iter_words_left;

		while (iter_words_left--) {
			u64 word = *words;
			u64 temp;

			/* First, if we have any provisional refcount values, clear them. */
			temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT);
			if (temp) {
				/*
				 * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor
				 * will alter just those bytes, changing PROVISIONAL to EMPTY.
				 */
				word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT);
				*words = word;
			}

			/* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */
			split_count += match_bytes(word, EMPTY_REFERENCE_COUNT);
			words++;
		}

		/* Sum the eight byte-wide counters held in split_count. */
		empty_count += split_count % 255;
	}

	return COUNTS_PER_BLOCK - empty_count;
}
/**
 * unpack_reference_block() - Unpack reference counts blocks into the internal memory structure.
 * @packed: The written reference block to be unpacked.
 * @block: The internal reference block to be loaded.
 *
 * For each sector, the commit point is unpacked into @block, the sector's counts are copied into
 * the block's counters, and the slab's journal point is advanced to the sector's commit point
 * if that point is later.
 *
 * NOTE(review): the first argument of vdo_unpack_journal_point() below reads "§or", which
 * looks like an encoding corruption of "&sector" -- confirm and repair. The end of the loop and
 * of the function are also not present in this copy of the file.
 */ staticvoid unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block)
{
sector_count_t i; struct vdo_slab *slab = block->slab;
vdo_refcount_t *counters = get_reference_counters_for_block(block);
for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { struct packed_reference_sector *sector = &packed->sectors[i];
vdo_unpack_journal_point(§or->commit_point, &block->commit_points[i]);
memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
(sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR)); /* The slab_journal_point must be the latest point found in any sector. */ if (vdo_before_journal_point(&slab->slab_journal_point,
&block->commit_points[i]))
slab->slab_journal_point = block->commit_points[i];
/**
 * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load
 *                                a set of blocks.
 * @waiter: The waiter of the first block to load.
 * @context: The VIO returned by the pool.
 *
 * The number of blocks to load is the smaller of the vio's block_count and the number of
 * reference blocks remaining in the slab from the first block onward.
 *
 * NOTE(review): the rest of the body (after block_count is computed) is not present in this
 * copy of the file. Also, min_t(int, ...) is applied to u32 operands here; confirm the intended
 * comparison type.
 */ staticvoid load_reference_block_group(struct vdo_waiter *waiter, void *context)
{ struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio; struct reference_block *block =
container_of(waiter, struct reference_block, waiter);
u32 block_offset = block - block->slab->reference_blocks;
u32 max_block_count = block->slab->reference_block_count - block_offset;
u32 block_count = min_t(int, vio->block_count, max_block_count);
/**
 * drain_slab() - Drain all reference count I/O.
 * @slab: The slab to drain.
 *
 * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the
 * reference blocks may be loaded from disk or dirty reference blocks may be written out.
 *
 * Nothing is done when suspending. Otherwise the slab journal tail is committed unless the
 * state is rebuilding or save-for-scrubbing, and nothing further is done when recovering or
 * when the slab has no counters.
 *
 * NOTE(review): the remainder of the body (after data_blocks is declared) is not present in
 * this copy of the file.
 */ staticvoid drain_slab(struct vdo_slab *slab)
{ bool save; bool load; conststruct admin_state_code *state = vdo_get_admin_state_code(&slab->state);
if (state == VDO_ADMIN_STATE_SUSPENDING) return;
if ((state != VDO_ADMIN_STATE_REBUILDING) &&
(state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING))
commit_tail(&slab->journal);
if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL)) return;
save = false;
load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts; if (state == VDO_ADMIN_STATE_SCRUBBING) { if (load) {
load_reference_blocks(slab); return;
}
} elseif (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) { if (!load) { /* These reference counts were never written, so mark them all dirty. */
dirty_all_reference_blocks(slab);
}
save = true;
} elseif (state == VDO_ADMIN_STATE_REBUILDING) { /* * Write out the counters if the slab has written them before, or it has any * non-zero reference counts, or there are any slab journal blocks.
*/
block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;
/*
 * allocate_slab_counters() - Allocate the reference block array and the reference counter array
 * for a slab.
 *
 * Asserts that the slab has no reference blocks yet, allocates reference_block_count reference
 * blocks, then allocates the counter array. If the counter allocation fails, the reference
 * block array is freed before returning the error.
 *
 * NOTE(review): `index` is declared but not used in the visible code, and the end of this
 * function is not present in this copy of the file.
 */
staticint allocate_slab_counters(struct vdo_slab *slab)
{ int result;
size_t index, bytes;
result = VDO_ASSERT(slab->reference_blocks == NULL, "vdo_slab %u doesn't allocate refcounts twice",
slab->slab_number); if (result != VDO_SUCCESS) return result;
result = vdo_allocate(slab->reference_block_count, struct reference_block,
__func__, &slab->reference_blocks); if (result != VDO_SUCCESS) return result;
/* * Allocate such that the runt slab has a full-length memory array, plus a little padding * so we can word-search even at the very end.
*/
bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array",
&slab->counters); if (result != VDO_SUCCESS) {
vdo_free(vdo_forget(slab->reference_blocks)); return result;
}
/*
 * NOTE(review): the statements from here to the closing brace use `header` and `journal`, which
 * allocate_slab_counters() does not declare; they appear to be the tail of a different function
 * whose beginning is missing from this copy of the file.
 */
/* FIXME: should it be an error if the following conditional fails? */ if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
(header.nonce == slab->allocator->nonce)) {
journal->tail = header.sequence_number + 1;
/* * If the slab is clean, this implies the slab journal is empty, so advance the * head appropriately.
*/
journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ?
header.head : journal->tail);
journal->tail_header = header;
initialize_journal_state(journal);
}
/**
 * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio
 *                            pool.
 * @waiter: The vio pool waiter which has just been notified.
 * @context: The vio pool entry given to the waiter.
 *
 * This is the success callback from acquire_vio_from_pool() when loading a slab journal.
 *
 * The tail block offset is one less than the commit point offset stored in the slab summary,
 * wrapping to (journal->size - 1) when that offset is 0.
 *
 * NOTE(review): the rest of the body (after tail_block is computed) is not present in this copy
 * of the file.
 */ staticvoid read_slab_journal_tail(struct vdo_waiter *waiter, void *context)
{ struct slab_journal *journal =
container_of(waiter, struct slab_journal, resource_waiter); struct vdo_slab *slab = journal->slab; struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio;
tail_block_offset_t last_commit_point =
slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
/* * Slab summary keeps the commit point offset, so the tail block is the block before that. * Calculation supports small journals in unit tests.
*/
tail_block_offset_t tail_block = ((last_commit_point == 0) ?
(tail_block_offset_t)(journal->size - 1) :
(last_commit_point - 1));
/**
 * load_slab_journal() - Load a slab's journal by reading the journal's tail.
 * @slab: The slab whose journal is to be loaded.
 *
 * When the summary's tail block offset is 0 and the summary does not ask for reference counts
 * to be loaded, no read is issued: loading finishes immediately with the result of
 * allocate_counters_if_clean().
 *
 * NOTE(review): the remainder of the body (the path that actually reads the tail) is not
 * present in this copy of the file.
 */ staticvoid load_slab_journal(struct vdo_slab *slab)
{ struct slab_journal *journal = &slab->journal;
tail_block_offset_t last_commit_point;
last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset; if ((last_commit_point == 0) &&
!slab->allocator->summary_entries[slab->slab_number].load_ref_counts) { /* * This slab claims that it has a tail block at (journal->size - 1), but a head of * 1. This is impossible, due to the scrubbing threshold, on a real system, so * don't bother reading the (bogus) data off disk.
*/
VDO_ASSERT_LOG_ONLY(((journal->size < 16) ||
(journal->scrubbing_threshold < (journal->size - 1))), "Scrubbing threshold protects against reads of unwritten slab journal blocks");
vdo_finish_loading_with_result(&slab->state,
allocate_counters_if_clean(slab)); return;
}
/*
 * Queue a slab for allocation or scrubbing.
 *
 * Does nothing if the vdo is read-only. Enters read-only mode if the slab's free block count
 * exceeds the configured number of data blocks. A slab whose status is not VDO_SLAB_REBUILT is
 * registered for scrubbing instead of being queued for allocation.
 *
 * NOTE(review): the end of this function (after the reopen_slab_journal() call) is not present
 * in this copy of the file.
 */ staticvoid queue_slab(struct vdo_slab *slab)
{ struct block_allocator *allocator = slab->allocator;
block_count_t free_blocks; int result;
VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), "a requeued slab must not already be on a list");
if (vdo_is_read_only(allocator->depot->vdo)) return;
free_blocks = slab->free_blocks;
result = VDO_ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks), "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)",
slab->slab_number, (unsignedlonglong) free_blocks,
(unsignedlonglong) allocator->depot->slab_config.data_blocks); if (result != VDO_SUCCESS) {
vdo_enter_read_only_mode(allocator->depot->vdo, result); return;
}
if (slab->status != VDO_SLAB_REBUILT) {
register_slab_for_scrubbing(slab, false); return;
}
if (!vdo_is_state_resuming(&slab->state)) { /* * If the slab is resuming, we've already accounted for it here, so don't do it * again. * FIXME: under what situation would the slab be resuming here?
*/
WRITE_ONCE(allocator->allocated_blocks,
allocator->allocated_blocks - free_blocks); if (!is_slab_journal_blank(slab)) {
WRITE_ONCE(allocator->statistics.slabs_opened,
allocator->statistics.slabs_opened + 1);
}
}
if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING)
reopen_slab_journal(slab);
/**
 * get_next_slab() - Get the next slab to scrub.
 * @scrubber: The slab scrubber.
 *
 * NOTE(review): only the declaration of the local `slab` is present in this copy of the file;
 * the selection logic and the return statement are missing.
 *
 * Return: The next slab to scrub or NULL if there are none.
 */ staticstruct vdo_slab *get_next_slab(struct slab_scrubber *scrubber)
{ struct vdo_slab *slab;
/**
 * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub.
 * @scrubber: The scrubber to check.
 *
 * Return: true if the scrubber has slabs to scrub, i.e. get_next_slab() returns non-NULL.
 */
static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber)
{
	return (get_next_slab(scrubber) != NULL);
}
/**
 * uninitialize_scrubber_vio() - Clean up the slab_scrubber's vio.
 * @scrubber: The scrubber.
 *
 * Frees the vio's data buffer (clearing the pointer via vdo_forget()) and then releases the
 * vio's components.
 */
static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber)
{
	vdo_free(vdo_forget(scrubber->vio.data));
	free_vio_components(&scrubber->vio);
}
/** * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because * there's been an error. * @scrubber: The scrubber.
*/ staticvoid finish_scrubbing(struct slab_scrubber *scrubber, int result)
{ bool notify = vdo_waitq_has_waiters(&scrubber->waiters); bool done = !has_slabs_to_scrub(scrubber); struct block_allocator *allocator =
container_of(scrubber, struct block_allocator, scrubber);
if (done)
uninitialize_scrubber_vio(scrubber);
if (scrubber->high_priority_only) {
scrubber->high_priority_only = false;
vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result);
} elseif (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) { /* All of our slabs were scrubbed, and we're the last allocator to finish. */ enum vdo_state prior_state =
atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING,
VDO_DIRTY);
/* * To be safe, even if the CAS failed, ensure anything that follows is ordered with * respect to whatever state change did happen.
*/
smp_mb__after_atomic();
/* * We must check the VDO state here and not the depot's read_only_notifier since * the compare-swap-above could have failed due to a read-only entry which our own * thread does not yet know about.
*/ if (prior_state == VDO_DIRTY)
vdo_log_info("VDO commencing normal operation"); elseif (prior_state == VDO_RECOVERING)
vdo_log_info("Exiting recovery mode");
free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool));
}
/* * Note that the scrubber has stopped, and inform anyone who might be waiting for that to * happen.
*/ if (!vdo_finish_draining(&scrubber->admin_state))
WRITE_ONCE(scrubber->admin_state.current_state,
VDO_ADMIN_STATE_SUSPENDED);
/* * We can't notify waiters until after we've finished draining or they'll just requeue. * Fortunately if there were waiters, we can't have been freed yet.
*/ if (notify)
vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
}
/** * apply_block_entries() - Apply all the entries in a block to the reference counts. * @block: A block with entries to apply. * @entry_count: The number of entries to apply. * @block_number: The sequence number of the block. * @slab: The slab to apply the entries to. * * Return: VDO_SUCCESS or an error code.
*/ staticint apply_block_entries(struct packed_slab_journal_block *block,
journal_entry_count_t entry_count,
sequence_number_t block_number, struct vdo_slab *slab)
{ struct journal_point entry_point = {
.sequence_number = block_number,
.entry_count = 0,
}; int result;
slab_block_number max_sbn = slab->end - slab->start;
if (entry.sbn > max_sbn) { /* This entry is out of bounds. */ return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL, "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)",
(unsignedlonglong) block_number,
entry_point.entry_count,
entry.sbn, max_sbn);
}
result = replay_reference_count_change(slab, &entry_point, entry); if (result != VDO_SUCCESS) {
vdo_log_error_strerror(result, "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u",
(unsignedlonglong) block_number,
entry_point.entry_count,
vdo_get_journal_operation_name(entry.operation),
entry.sbn, slab->slab_number); return result;
}
entry_point.entry_count++;
}
return VDO_SUCCESS;
}
/** * apply_journal_entries() - Find the relevant vio of the slab journal and apply all valid entries. * @completion: The metadata read vio completion. * * This is a callback registered in start_scrubbing().
*/ staticvoid apply_journal_entries(struct vdo_completion *completion)
{ int result; struct slab_scrubber *scrubber =
container_of(as_vio(completion), struct slab_scrubber, vio); struct vdo_slab *slab = scrubber->slab; struct slab_journal *journal = &slab->journal;
/* Find the boundaries of the useful part of the journal. */
sequence_number_t tail = journal->tail;
tail_block_offset_t end_index = (tail - 1) % journal->size; char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE); struct packed_slab_journal_block *end_block =
(struct packed_slab_journal_block *) end_data;
sequence_number_t head = __le64_to_cpu(end_block->header.head);
tail_block_offset_t head_index = head % journal->size;
block_count_t index = head_index;
if ((header.nonce != slab->allocator->nonce) ||
(header.metadata_type != VDO_METADATA_SLAB_JOURNAL) ||
(header.sequence_number != sequence) ||
(header.entry_count > journal->entries_per_block) ||
(header.has_block_map_increments &&
(header.entry_count > journal->full_entries_per_block))) { /* The block is not what we expect it to be. */
vdo_log_error("vdo_slab journal block for slab %u was invalid",
slab->slab_number);
abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL); return;
}
result = apply_block_entries(block, header.entry_count, sequence, slab); if (result != VDO_SUCCESS) {
abort_scrubbing(scrubber, result); return;
}
last_entry_applied.sequence_number = sequence;
last_entry_applied.entry_count = header.entry_count - 1;
index++; if (index == journal->size)
index = 0;
}
/* * At the end of rebuild, the reference counters should be accurate to the end of the * journal we just applied.
*/
result = VDO_ASSERT(!vdo_before_journal_point(&last_entry_applied,
&ref_counts_point), "Refcounts are not more accurate than the slab journal"); if (result != VDO_SUCCESS) {
abort_scrubbing(scrubber, result); return;
}
/* Save out the rebuilt reference blocks. */
vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error,
slab->allocator->thread_id, completion->parent);
vdo_start_operation_with_waiter(&slab->state,
VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING,
completion, initiate_slab_action);
}
/** * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed. * @completion: The scrubber's vio completion. * * This callback is registered in scrub_next_slab().
*/ staticvoid start_scrubbing(struct vdo_completion *completion)
{ struct slab_scrubber *scrubber =
container_of(as_vio(completion), struct slab_scrubber, vio); struct vdo_slab *slab = scrubber->slab;
if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) {
slab_scrubbed(completion); return;
}
/** * scrub_next_slab() - Scrub the next slab if there is one. * @scrubber: The scrubber.
*/ staticvoid scrub_next_slab(struct slab_scrubber *scrubber)
{ struct vdo_completion *completion = &scrubber->vio.completion; struct vdo_slab *slab;
/* * Note: this notify call is always safe only because scrubbing can only be started when * the VDO is quiescent.
*/
vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
if (vdo_is_read_only(completion->vdo)) {
finish_scrubbing(scrubber, VDO_READ_ONLY); return;
}
/** * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing. * @allocator: The block_allocator to scrub. * @parent: The completion to notify when scrubbing is done, implies high_priority, may be NULL.
*/ staticvoid scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent)
{ struct slab_scrubber *scrubber = &allocator->scrubber;
/** * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot. * @depot: The depot over which to iterate. * @start: The number of the slab to start iterating from. * @end: The number of the last slab which may be returned. * @stride: The difference in slab number between successive slabs. * * Iteration always occurs from higher to lower numbered slabs. * * Return: An initialized iterator structure.
*/ staticstruct slab_iterator get_depot_slab_iterator(struct slab_depot *depot,
slab_count_t start, slab_count_t end,
slab_count_t stride)
{ struct vdo_slab **slabs = depot->slabs;
/** * next_slab() - Get the next slab from a slab_iterator and advance the iterator * @iterator: The slab_iterator. * * Return: The next slab or NULL if the iterator is exhausted.
*/ staticstruct vdo_slab *next_slab(struct slab_iterator *iterator)
{ struct vdo_slab *slab = iterator->next;
/** * abort_waiter() - Abort vios waiting to make journal entries when read-only. * * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone * into read-only mode. Implements waiter_callback_fn.
*/ staticvoid abort_waiter(struct vdo_waiter *waiter, void *context __always_unused)
{ struct reference_updater *updater =
container_of(waiter, struct reference_updater, waiter); struct data_vio *data_vio = data_vio_from_reference_updater(updater);
if (updater->increment) {
continue_data_vio_with_error(data_vio, VDO_READ_ONLY); return;
}
/** * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if * the block it locks is unreferenced. * @slab: The slab which contains the block. * @pbn: The physical block to reference. * @lock: The lock. * * Return: VDO_SUCCESS or an error.
*/ int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn, struct pbn_lock *lock)
{
slab_block_number block_number; int result;
if (vdo_pbn_lock_has_provisional_reference(lock)) return VDO_SUCCESS;
if (!is_slab_open(slab)) return VDO_INVALID_ADMIN_STATE;
result = slab_block_number_from_pbn(slab, pbn, &block_number); if (result != VDO_SUCCESS) return result;
if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) {
make_provisional_reference(slab, block_number); if (lock != NULL)
vdo_assign_pbn_lock_provisional_reference(lock);
}
if (vdo_pbn_lock_has_provisional_reference(lock))
adjust_free_block_count(slab, false);
if (!is_slab_open(slab)) return VDO_INVALID_ADMIN_STATE;
if (!search_reference_blocks(slab, &free_index)) return VDO_NO_SPACE;
VDO_ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT), "free block must have ref count of zero");
make_provisional_reference(slab, free_index);
adjust_free_block_count(slab, false);
/* * Update the search hint so the next search will start at the array index just past the * free block we just found.
*/
slab->search_cursor.index = (free_index + 1);
/**
 * open_slab() - Prepare a slab to be allocated from.
 * @slab: The slab.
 *
 * Resets the slab's search cursor, bumps the allocator's slabs_opened or slabs_reopened
 * statistic depending on whether the slab journal is blank, and records the slab as the
 * allocator's open slab. A slab with a blank journal also has all of its reference blocks
 * dirtied.
 */
static void open_slab(struct vdo_slab *slab)
{
	reset_search_cursor(slab);
	if (is_slab_journal_blank(slab)) {
		WRITE_ONCE(slab->allocator->statistics.slabs_opened,
			   slab->allocator->statistics.slabs_opened + 1);
		dirty_all_reference_blocks(slab);
	} else {
		WRITE_ONCE(slab->allocator->statistics.slabs_reopened,
			   slab->allocator->statistics.slabs_reopened + 1);
	}

	slab->allocator->open_slab = slab;
}
/**
 * vdo_allocate_block() - Allocate a block from an allocator's open slab, opening the next
 *                        highest priority slab if the current one is exhausted.
 * @allocator: The block allocator to allocate from.
 * @block_number_ptr: A pointer passed through to allocate_slab_block() to receive the result.
 *
 * The block allocated will have a provisional reference and the reference must be either confirmed
 * with a subsequent increment or vacated with a subsequent decrement via
 * vdo_release_block_reference().
 *
 * Return: Whatever allocate_slab_block() returns for the slab finally tried.
 */
int vdo_allocate_block(struct block_allocator *allocator,
		       physical_block_number_t *block_number_ptr)
{
	int result;

	if (allocator->open_slab != NULL) {
		/* Try to allocate the next block in the currently open slab. */
		result = allocate_slab_block(allocator->open_slab, block_number_ptr);

		/* Success and every error other than "slab is full" are final. */
		if (result != VDO_NO_SPACE)
			return result;

		/* Put the exhausted open slab back into the priority table. */
		prioritize_slab(allocator->open_slab);
	}

	/* Remove the highest priority slab from the priority table and make it the open slab. */
	open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs),
			     struct vdo_slab, allocq_entry));

	/*
	 * Try allocating again. If we're out of space immediately after opening a slab, then every
	 * slab must be fully allocated.
	 */
	return allocate_slab_block(allocator->open_slab, block_number_ptr);
}
/** * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab. * @allocator: The block_allocator on which to wait. * @waiter: The waiter. * * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, and * some other error otherwise.
*/ int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, struct vdo_waiter *waiter)
{ if (vdo_is_read_only(allocator->depot->vdo)) return VDO_READ_ONLY;
if (vdo_is_state_quiescent(&allocator->scrubber.admin_state)) return VDO_NO_SPACE;
/** * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab * journal entry and then updating the reference counter. * @completion: The data_vio completion for which to add the entry. * @updater: Which of the data_vio's reference updaters is being submitted.
*/ void vdo_modify_reference_count(struct vdo_completion *completion, struct reference_updater *updater)
{ struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn);
if (!is_slab_open(slab)) {
vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE); return;
}
if (vdo_is_read_only(completion->vdo)) {
vdo_continue_completion(completion, VDO_READ_ONLY); return;
}
vdo_waitq_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter); if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal))
register_slab_for_scrubbing(slab, true);
add_entries(&slab->journal);
}
/* Release an unused provisional reference. */ int vdo_release_block_reference(struct block_allocator *allocator,
physical_block_number_t pbn)
{ struct reference_updater updater;
/*
 * This is a min_heap callback function which orders slab_status structures using the 'is_clean'
 * field as the primary key, the 'emptiness' field as the secondary key, and the slab number as
 * the final tie-breaker.
 *
 * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping
 * should always get the most empty first, so pushing should be from most empty to least empty.
 * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
 * before larger ones.
 */
static bool slab_status_is_less_than(const void *item1, const void *item2,
				     void __always_unused *args)
{
	const struct slab_status *info1 = item1;
	const struct slab_status *info2 = item2;

	/* Clean slabs sort ahead of dirty ones. */
	if (info1->is_clean != info2->is_clean)
		return info1->is_clean;

	/* Among equally clean slabs, the emptier one sorts first. */
	if (info1->emptiness != info2->emptiness)
		return info1->emptiness > info2->emptiness;

	return info1->slab_number < info2->slab_number;
}
/*
 * Inform the slab actor that an action has finished on some slab; used by apply_to_slabs().
 *
 * Decrements the actor's outstanding action count. When the count reaches zero the actor's
 * callback is invoked with the completion; otherwise the completion is reset.
 */
static void slab_action_callback(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);
	struct slab_actor *actor = &allocator->slab_actor;

	if (--actor->slab_action_count == 0) {
		actor->callback(completion);
		return;
	}

	vdo_reset_completion(completion);
}
/*
 * Preserve the error from part of an action and continue.
 *
 * If the allocator's admin state has a waiter, the completion's result is recorded on that
 * waiter. The completion's regular callback is then invoked.
 */
static void handle_operation_error(struct vdo_completion *completion)
{
	struct block_allocator *allocator = vdo_as_block_allocator(completion);

	if (allocator->state.waiter != NULL)
		vdo_set_completion_result(allocator->state.waiter, completion->result);

	completion->callback(completion);
}
/* Perform an action on each of an allocator's slabs in parallel. */ staticvoid apply_to_slabs(struct block_allocator *allocator, vdo_action_fn callback)
{ struct slab_iterator iterator;
if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) { /* * Must requeue because the kcopyd client cannot be freed in the same stack frame * as the kcopyd callback, lest it deadlock.
*/
vdo_prepare_completion_for_requeue(&allocator->completion,
finish_loading_allocator,
handle_operation_error,
allocator->thread_id, NULL);
allocator->eraser = dm_kcopyd_client_create(NULL); if (IS_ERR(allocator->eraser)) {
vdo_fail_completion(&allocator->completion,
PTR_ERR(allocator->eraser));
allocator->eraser = NULL; return;
}
allocator->slabs_to_erase = get_slab_iterator(allocator);
/** * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have * been recovered from the recovery journal. * @completion: The allocator completion.
*/ void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion)
{ struct block_allocator *allocator = vdo_as_block_allocator(completion);
vdo_log_info("block_allocator zone %u", allocator->zone_number); while (iterator.next != NULL) { struct vdo_slab *slab = next_slab(&iterator); struct slab_journal *journal = &slab->journal;
if (slab->reference_blocks != NULL) { /* Terse because there are a lot of slabs to dump and syslog is lossy. */
vdo_log_info("slab %u: P%u, %llu free", slab->slab_number,
slab->priority,
(unsignedlonglong) slab->free_blocks);
} else {
vdo_log_info("slab %u: status %s", slab->slab_number,
status_to_string(slab->status));
}
vdo_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s",
vdo_waitq_num_waiters(&journal->entry_waiters),
vdo_bool_to_string(journal->waiting_to_commit),
vdo_bool_to_string(journal->updating_slab_summary),
(unsignedlonglong) journal->head,
(unsignedlonglong) journal->unreapable,
(unsignedlonglong) journal->tail,
(unsignedlonglong) journal->next_commit,
(unsignedlonglong) journal->summarized,
(unsignedlonglong) journal->last_summarized,
(unsignedlonglong) journal->recovery_lock,
vdo_bool_to_string(journal->recovery_lock != 0)); /* * Given the frequency with which the locks are just a tiny bit off, it might be * worth dumping all the locks, but that might be too much logging.
*/
if (slab->counters != NULL) { /* Terse because there are a lot of slabs to dump and syslog is lossy. */
vdo_log_info(" slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)",
slab->free_blocks, slab->block_count,
slab->reference_block_count,
vdo_waitq_num_waiters(&slab->dirty_blocks),
slab->active_count,
(unsignedlonglong) slab->slab_journal_point.sequence_number,
slab->slab_journal_point.entry_count);
} else {
vdo_log_info(" no counters");
}
/* * Wait for a while after each batch of 32 slabs dumped, an arbitrary number, * allowing the kernel log a chance to be flushed instead of being overrun.
*/ if (pause_counter++ == 31) {
pause_counter = 0;
vdo_pause_for_logger();
}
}
journal->flushing_deadline = journal->flushing_threshold; /* * Set there to be some time between the deadline and the blocking threshold, so that * hopefully all are done before blocking.
*/ if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
journal->flushing_deadline = journal->blocking_threshold - 5;
/**
 * make_slab() - Construct a new, empty slab.
 * @slab_origin: The physical block number within the block allocator partition of the first block
 *               in the slab.
 * @allocator: The block allocator to which the slab belongs.
 * @slab_number: The slab number of the slab.
 * @is_new: true if this slab is being allocated as part of a resize.
 * @slab_ptr: A pointer to receive the new slab.
 *
 * On any failure after the slab has been allocated, the slab is freed before returning.
 *
 * Return: VDO_SUCCESS or an error code.
 */
static int __must_check make_slab(physical_block_number_t slab_origin,
				  struct block_allocator *allocator,
				  slab_count_t slab_number, bool is_new,
				  struct vdo_slab **slab_ptr)
{
	/*
	 * NOTE(review): slab_origin, slab_number and slab_config are not referenced below;
	 * confirm that the slab's fields are populated before initialize_slab_journal() runs.
	 */
	const struct slab_config *slab_config = &allocator->depot->slab_config;
	struct vdo_slab *slab;
	int result;

	result = vdo_allocate(1, struct vdo_slab, __func__, &slab);
	if (result != VDO_SUCCESS)
		return result;

	result = initialize_slab_journal(slab);
	if (result != VDO_SUCCESS) {
		free_slab(slab);
		return result;
	}

	if (is_new) {
		/* Only slabs created for a resize get their counters allocated here. */
		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW);
		result = allocate_slab_counters(slab);
		if (result != VDO_SUCCESS) {
			free_slab(slab);
			return result;
		}
	} else {
		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
	}

	*slab_ptr = slab;
	return VDO_SUCCESS;
}
/** * allocate_slabs() - Allocate a new slab pointer array. * @depot: The depot. * @slab_count: The number of slabs the depot should have in the new array. * * Any existing slab pointers will be copied into the new array, and slabs will be allocated as * needed. The newly allocated slabs will not be distributed for use by the block allocators. * * Return: VDO_SUCCESS or an error code.
*/ staticint allocate_slabs(struct slab_depot *depot, slab_count_t slab_count)
{
block_count_t slab_size; bool resizing = false;
physical_block_number_t slab_origin; int result;
result = vdo_allocate(slab_count, struct vdo_slab *, "slab pointer array", &depot->new_slabs); if (result != VDO_SUCCESS) return result;
result = make_slab(slab_origin, allocator, depot->new_slab_count,
resizing, slab_ptr); if (result != VDO_SUCCESS) return result;
}
return VDO_SUCCESS;
}
/**
 * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them as needed.
 * @depot: The depot.
 *
 * Does nothing when the depot has no new slab array. Otherwise frees every slab in the new
 * array from index slab_count up to new_slab_count, zeroes the new-slab bookkeeping, and frees
 * the array itself.
 */
void vdo_abandon_new_slabs(struct slab_depot *depot)
{
	slab_count_t slab_index;

	if (depot->new_slabs != NULL) {
		slab_index = depot->slab_count;
		while (slab_index < depot->new_slab_count) {
			free_slab(vdo_forget(depot->new_slabs[slab_index]));
			slab_index++;
		}

		depot->new_size = 0;
		depot->new_slab_count = 0;
		vdo_free(vdo_forget(depot->new_slabs));
	}
}
/**
 * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates.
 * @context: The slab depot, passed as an untyped pointer.
 * @zone_number: The index of the allocator within the depot.
 *
 * Implements vdo_zone_thread_getter_fn.
 *
 * Return: The thread_id recorded in the selected allocator.
 */
static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number)
{
	struct slab_depot *depot = context;

	return depot->allocators[zone_number].thread_id;
}
/**
 * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock
 *                                   it may hold on a specified recovery journal block.
 * @journal: The slab journal.
 * @recovery_lock: The sequence number of the recovery journal block whose locks should be
 *                 released.
 *
 * Return: true if the journal does hold a lock on the specified block (which it will release).
 */
static bool __must_check release_recovery_journal_lock(struct slab_journal *journal,
							sequence_number_t recovery_lock)
{
	if (recovery_lock > journal->recovery_lock) {
		/*
		 * The asserted condition can never hold inside this branch, so getting here
		 * always trips the assertion; the request is then ignored.
		 */
		VDO_ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock),
				    "slab journal recovery lock is not older than the recovery journal head");
		return false;
	}

	/* Nothing to do unless the journal's lock is exactly this block and the VDO is writable. */
	if ((recovery_lock < journal->recovery_lock) ||
	    vdo_is_read_only(journal->slab->allocator->depot->vdo))
		return false;

	/* All locks are held by the block which is in progress; write it. */
	commit_tail(journal);
	return true;
}
/* * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot * is seeking to release. * * Implements vdo_zone_action_fn.
*/ staticvoid release_tail_block_locks(void *context, zone_count_t zone_number, struct vdo_completion *parent)
{ struct slab_journal *journal, *tmp; struct slab_depot *depot = context; struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals;
list_for_each_entry_safe(journal, tmp, list, dirty_entry) { if (!release_recovery_journal_lock(journal,
depot->active_release_request)) break;
}
/** * schedule_tail_block_commit() - Schedule a tail block commit if necessary. * * This method should not be called directly. Rather, call vdo_schedule_default_action() on the * depot's action manager. * * Implements vdo_action_scheduler_fn.
*/ staticbool schedule_tail_block_commit(void *context)
{ struct slab_depot *depot = context;
if (depot->new_release_request == depot->active_release_request) returnfalse;
/** * initialize_slab_summary_block() - Initialize a slab_summary_block. * @allocator: The allocator which owns the block. * @index: The index of this block in its zone's summary. * * Return: VDO_SUCCESS or an error.
*/ staticint __must_check initialize_slab_summary_block(struct block_allocator *allocator,
block_count_t index)
{ struct slab_summary_block *block = &allocator->summary_blocks[index]; int result;
result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); if (result != VDO_SUCCESS) return result;
result = allocate_vio_components(allocator->depot->vdo, VIO_TYPE_SLAB_SUMMARY,
VIO_PRIORITY_METADATA, NULL, 1,
block->outgoing_entries, &block->vio); if (result != VDO_SUCCESS) return result;
/* Initialize each summary block. */ for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
result = initialize_slab_summary_block(allocator, i); if (result != VDO_SUCCESS) return result;
}
/* * Performing well atop thin provisioned storage requires either that VDO discards freed * blocks, or that the block allocator try to use slabs that already have allocated blocks * in preference to slabs that have never been opened. For reasons we have not been able to * fully understand, some SSD machines have been very sensitive (50% reduction in * test throughput) to very slight differences in the timing and locality of block * allocation. Assigning a low priority to unopened slabs (max_priority/2, say) would be * ideal for the story, but anything less than a very high threshold (max_priority - 1) * hurts on these machines. * * This sets the free block threshold for preferring to open an unopened slab to the binary * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate * to about half the slab size.
*/
allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4));
/** * vdo_decode_slab_depot() - Make a slab depot and configure it with the state read from the super * block. * @state: The slab depot state from the super block. * @vdo: The VDO which will own the depot. * @summary_partition: The partition which holds the slab summary. * @depot_ptr: A pointer to hold the depot. * * Return: A success or error code.
*/ int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, struct partition *summary_partition, struct slab_depot **depot_ptr)
{ unsignedint slab_size_shift; struct slab_depot *depot; int result;
/* * Calculate the bit shift for efficiently mapping block numbers to slabs. Using a shift * requires that the slab size be a power of two.
*/
block_count_t slab_size = state.slab_config.slab_blocks;
if (!is_power_of_2(slab_size)) { return vdo_log_error_strerror(UDS_INVALID_ARGUMENT, "slab size must be a power of two");
}
slab_size_shift = ilog2(slab_size);
result = vdo_allocate_extended(struct slab_depot,
vdo->thread_config.physical_zone_count, struct block_allocator, __func__, &depot); if (result != VDO_SUCCESS) return result;
for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
free_vio_components(&allocator->summary_blocks[i].vio);
vdo_free(vdo_forget(allocator->summary_blocks[i].outgoing_entries));
}
/** * vdo_record_slab_depot() - Record the state of a slab depot for encoding into the super block. * @depot: The depot to encode. * * Return: The depot state.
*/ struct slab_depot_state_2_0 vdo_record_slab_depot(conststruct slab_depot *depot)
{ /* * If this depot is currently using 0 zones, it must have been synchronously loaded by a * tool and is now being saved. We did not load and combine the slab summary, so we still * need to do that next time we load with the old zone count rather than 0.
*/ struct slab_depot_state_2_0 state;
zone_count_t zones_to_record = depot->zone_count;
if (depot->zone_count == 0)
zones_to_record = depot->old_zone_count;
/** * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot. * * Context: This method may be called only before entering normal operation from the load thread. * * Return: VDO_SUCCESS or an error.
*/ int vdo_allocate_reference_counters(struct slab_depot *depot)
{ struct slab_iterator iterator =
get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1);
while (iterator.next != NULL) { int result = allocate_slab_counters(next_slab(&iterator));
if (result != VDO_SUCCESS) return result;
}
return VDO_SUCCESS;
}
/** * get_slab_number() - Get the number of the slab that contains a specified block. * @depot: The slab depot. * @pbn: The physical block number. * @slab_number_ptr: A pointer to hold the slab number. * * Return: VDO_SUCCESS or an error.
*/ staticint __must_check get_slab_number(conststruct slab_depot *depot,
physical_block_number_t pbn,
slab_count_t *slab_number_ptr)
{
slab_count_t slab_number;
if (pbn < depot->first_block) return VDO_OUT_OF_RANGE;
/**
 * vdo_get_slab() - Get the slab object for the slab that contains a specified block.
 * @depot: The slab depot.
 * @pbn: The physical block number.
 *
 * Will put the VDO in read-only mode if the PBN is not a valid data block nor the zero block.
 *
 * Return: The slab containing the block, or NULL if the block number is the zero block or
 *         otherwise out of range.
 */
struct vdo_slab *vdo_get_slab(const struct slab_depot *depot,
			      physical_block_number_t pbn)
{
	slab_count_t slab_number;
	int result;

	/* The zero block belongs to no slab, and asking about it is not an error. */
	if (pbn == VDO_ZERO_BLOCK)
		return NULL;

	result = get_slab_number(depot, pbn, &slab_number);
	if (result != VDO_SUCCESS) {
		vdo_enter_read_only_mode(depot->vdo, result);
		return NULL;
	}

	return depot->slabs[slab_number];
}
/** * vdo_get_increment_limit() - Determine how many new references a block can acquire. * @depot: The slab depot. * @pbn: The physical block number that is being queried. * * Context: This method must be called from the physical zone thread of the PBN. * * Return: The number of available references.
*/
u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn)
{ struct vdo_slab *slab = vdo_get_slab(depot, pbn);
vdo_refcount_t *counter_ptr = NULL; int result;
if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT)) return 0;
result = get_reference_counter(slab, pbn, &counter_ptr); if (result != VDO_SUCCESS) return 0;
if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT) return (MAXIMUM_REFERENCE_COUNT - 1);
/** * vdo_is_physical_data_block() - Determine whether the given PBN refers to a data block. * @depot: The depot. * @pbn: The physical block number to ask about. * * Return: True if the PBN corresponds to a data block.
*/ bool vdo_is_physical_data_block(conststruct slab_depot *depot,
physical_block_number_t pbn)
{
slab_count_t slab_number;
slab_block_number sbn;
/** * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks allocated across all * the slabs in the depot. * @depot: The slab depot. * * This is the total number of blocks with a non-zero reference count. * * Context: This may be called from any thread. * * Return: The total number of blocks with a non-zero reference count.
*/
block_count_t vdo_get_slab_depot_allocated_blocks(conststruct slab_depot *depot)
{
block_count_t total = 0;
zone_count_t zone;
for (zone = 0; zone < depot->zone_count; zone++) { /* The allocators are responsible for thread safety. */
total += READ_ONCE(depot->allocators[zone].allocated_blocks);
}
return total;
}
/** * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in all the slabs in the * depot. * @depot: The slab depot. * * Context: This may be called from any thread. * * Return: The total number of data blocks in all slabs.
*/
block_count_t vdo_get_slab_depot_data_blocks(conststruct slab_depot *depot)
{ return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks);
}
/** * finish_combining_zones() - Clean up after saving out the combined slab summary. * @completion: The vio which was used to write the summary data.
*/ staticvoid finish_combining_zones(struct vdo_completion *completion)
{ int result = completion->result; struct vdo_completion *parent = completion->parent;
/** * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones, * update every zone to the correct values for every slab. * @depot: The depot whose summary entries should be combined.
*/ staticvoid combine_summaries(struct slab_depot *depot)
{ /* * Combine all the old summary data into the portion of the buffer corresponding to the * first zone.
*/
zone_count_t zone = 0; struct slab_summary_entry *entries = depot->summary_entries;
if (depot->old_zone_count > 1) {
slab_count_t entry_number;
zone++; if (zone == depot->old_zone_count)
zone = 0;
}
}
/* Copy the combined data to each zones's region of the buffer. */ for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) {
memcpy(entries + (zone * MAX_VDO_SLABS), entries,
MAX_VDO_SLABS * sizeof(struct slab_summary_entry));
}
}
/**
 * finish_loading_summary() - Finish loading slab summary data.
 * @completion: The vio which was used to read the summary data.
 *
 * Combines the slab summary data from all the previously written zones and copies the combined
 * summary to each partition's data region. Then writes the combined summary back out to disk. This
 * callback is registered in load_summary_endio().
 */
static void finish_loading_summary(struct vdo_completion *completion)
{
	struct slab_depot *depot = completion->vdo->depot;

	/* Combine the summary from each zone so each zone is correct for all slabs. */
	combine_summaries(depot);

	/* Write the combined summary back out. */
	vdo_submit_metadata_vio(as_vio(completion), depot->summary_origin,
				write_summary_endio, handle_combining_error,
				REQ_OP_WRITE);
}
/**
 * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't included in the
 *                         super_block component.
 * @depot: The depot to load.
 * @operation: The type of load to perform.
 * @parent: The completion to notify when the load is complete.
 * @context: Additional context for the load operation; may be NULL.
 *
 * This method may be called only before entering normal operation from the load thread.
 *
 * NOTE(review): after the load-operation guard, the visible body uses `result` and `allocator`,
 * neither of which is declared or passed in here, and it never touches @depot or @context.
 * Text appears to have been lost between the guard and those statements (possibly merging the
 * tail of a different function into this one). `conststruct` in the signature also looks like
 * `const struct` with its whitespace dropped. Confirm against the authoritative source.
 */
void vdo_load_slab_depot(struct slab_depot *depot, conststruct admin_state_code *operation, struct vdo_completion *parent, void *context)
{ if (!vdo_assert_load_operation(operation, parent)) return;
/* Prepare the allocator's slabs; on any result other than VDO_SUCCESS, fail the parent with it. */
result = vdo_prepare_slabs_for_allocation(allocator); if (result != VDO_SUCCESS) {
vdo_fail_completion(parent, result); return;
}
/* Preparation succeeded: hand the allocator and the parent completion to scrub_slabs(). */
scrub_slabs(allocator, parent);
}
/**
 * vdo_prepare_slab_depot_to_allocate() - Get the slab depot ready to come online and begin
 *                                        allocating blocks.
 * @depot: The depot to prepare.
 * @load_type: The load type; recorded on the depot before the action is scheduled.
 * @parent: The completion to notify when the operation is complete.
 *
 * Seeds the count of zones still to scrub with the depot's zone count, then schedules
 * prepare_to_allocate on the depot's action manager.
 *
 * This method may be called only before entering normal operation from the load thread. It must
 * be called before allocation may proceed.
 */
void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent)
{
	/* Both fields are set up before the action is handed to the manager. */
	atomic_set(&depot->zones_to_scrub, depot->zone_count);
	depot->load_type = load_type;

	vdo_schedule_action(depot->action_manager,
			    NULL,
			    prepare_to_allocate,
			    NULL,
			    parent);
}
/**
 * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory.
 * @depot: The depot to update.
 *
 * Adopts the pending new_last_block as the depot's last_block. This size is saved to disk as
 * part of the super block.
 */
void vdo_update_slab_depot_size(struct slab_depot *depot)
{
	/* Promote the pending end-of-depot marker to the live one. */
	depot->last_block = depot->new_last_block;
}
/**
 * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to
 *                                    the given size.
 * @depot: The depot to prepare to resize.
 * @partition: The new depot partition.
 *
 * Return: VDO_SUCCESS or an error. VDO_INCREMENT_TOO_SMALL is returned when the new partition
 *         would not yield more slabs than the depot already has; errors from
 *         vdo_configure_slab_depot() and allocate_slabs() are passed through.
 *
 * NOTE(review): the visible text stops after the allocate_slabs() failure branch, with no final
 * return and no closing brace for the function, so the tail of this function appears to be
 * missing here. `conststruct` in the signature also looks like `const struct` with its
 * whitespace dropped. Confirm against the authoritative source.
 */
int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, conststruct partition *partition)
{ struct slab_depot_state_2_0 new_state; int result;
slab_count_t new_slab_count;
/* Quick rejection: the partition's block count, shifted down by the slab size shift, must exceed the current slab count. */
if ((partition->count >> depot->slab_size_shift) <= depot->slab_count) return VDO_INCREMENT_TOO_SMALL;
/* Generate the depot configuration for the new block count. */
/* The assertion only logs; it expects the new partition to start where the depot already starts. */
VDO_ASSERT_LOG_ONLY(depot->first_block == partition->offset, "New slab depot partition doesn't change origin");
result = vdo_configure_slab_depot(partition, depot->slab_config,
depot->zone_count, &new_state); if (result != VDO_SUCCESS) return result;
/* Recompute the slab count from the configured last block and re-check that it really grows. */
new_slab_count = vdo_compute_slab_count(depot->first_block,
new_state.last_block,
depot->slab_size_shift); if (new_slab_count <= depot->slab_count) return vdo_log_error_strerror(VDO_INCREMENT_TOO_SMALL, "Depot can only grow"); if (new_slab_count == depot->new_slab_count) { /* Check it out, we've already got all the new slabs allocated! */ return VDO_SUCCESS;
}
/* A different target size than before: abandon any previously prepared new slabs, then allocate afresh. */
vdo_abandon_new_slabs(depot);
result = allocate_slabs(depot, new_slab_count); if (result != VDO_SUCCESS) {
/* Allocation failed: abandon the new slabs again before reporting the error. */
vdo_abandon_new_slabs(depot); return result;
}
/**
 * finish_registration() - Finish registering new slabs now that all of the allocators have
 *                         received their new slabs.
 * @context: The slab depot, passed as an untyped pointer.
 *
 * Implements vdo_action_conclusion_fn.
 *
 * NOTE(review): the visible body uses `i`, `allocator` and `parent` without declaring them, and
 * this int-returning function contains no return statement. The loop and the completion call
 * look like they belong to a different (per-allocator) function whose header was lost, while
 * the real tail of this function is missing. `staticint` also looks like `static int` with its
 * whitespace dropped. Confirm against the authoritative source.
 */
staticint finish_registration(void *context)
{ struct slab_depot *depot = context;
/* Walk the new_slabs entries from the current slab count up to the new slab count. */
for (i = depot->slab_count; i < depot->new_slab_count; i++) { struct vdo_slab *slab = depot->new_slabs[i];
/* Only slabs whose allocator field matches this allocator are registered with it. */
if (slab->allocator == allocator)
register_slab_with_allocator(allocator, slab);
}
vdo_finish_completion(parent);
}
/**
 * vdo_use_new_slabs() - Use the new slabs allocated for resize.
 * @depot: The depot.
 * @parent: The object to notify when complete.
 *
 * Logs an assertion failure if the depot has no new slabs, then schedules a
 * VDO_ADMIN_STATE_SUSPENDED_OPERATION on the depot's action manager with register_new_slabs as
 * the per-zone action and finish_registration as the conclusion.
 */
void vdo_use_new_slabs(struct slab_depot *depot,
		       struct vdo_completion *parent)
{
	/* Log-only check: the operation is scheduled regardless of the outcome. */
	VDO_ASSERT_LOG_ONLY(depot->new_slabs != NULL,
			    "Must have new slabs to use");

	vdo_schedule_operation(depot->action_manager,
			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
			       NULL,
			       register_new_slabs,
			       finish_registration,
			       parent);
}
/**
 * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is
 *                    currently working on.
 * @allocator: The block allocator owning the scrubber to stop.
 *
 * NOTE(review): after the first declaration the visible text jumps straight into `case` labels
 * with no enclosing switch, refers to an undeclared `completion`, never uses `scrubber`, and
 * never closes the function. The case arms look like the tail of a separate drain-step
 * function whose header was lost, and the real body of stop_scrubbing() is missing.
 * `staticvoid` also looks like `static void` with its whitespace dropped. Confirm against the
 * authoritative source.
 */
staticvoid stop_scrubbing(struct block_allocator *allocator)
{ struct slab_scrubber *scrubber = &allocator->scrubber;
/* Slabs step: run do_drain_step over the allocator's slabs via apply_to_slabs(). */
case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
apply_to_slabs(allocator, do_drain_step); return;
/* Summary step: start draining the summary state using the allocator's current admin state code. */
case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
vdo_start_draining(&allocator->summary_state,
vdo_get_admin_state_code(&allocator->state),
completion, initiate_summary_drain); return;
/* Finished step: log if the vio pool is still busy, then finish the drain with the completion's result. */
case VDO_DRAIN_ALLOCATOR_STEP_FINISHED:
VDO_ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), "vio pool not busy");
vdo_finish_draining_with_result(&allocator->state, completion->result); return;
/*
 * Drain all allocator I/O. Depending upon the type of drain, some or all dirty metadata may be
 * written to disk. The type of drain will be determined from the state of the allocator's depot.
 *
 * Implements vdo_zone_action_fn.
 *
 * NOTE(review): only the first declaration of the body is visible here; the rest of the function,
 * including its closing brace, is missing from this text. `staticvoid` also looks like
 * `static void` with its whitespace dropped. Confirm against the authoritative source.
 */
staticvoid drain_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent)
{ struct slab_depot *depot = context;
/** * vdo_drain_slab_depot() - Drain all slab depot I/O. * @depot: The depot to drain. * @operation: The drain operation (flush, rebuild, suspend, or save). * @parent: The completion to finish when the drain is complete. * * If saving, or flushing, all dirty depot metadata will be written out. If saving or suspending, * the depot will be left in a suspended state.
*/ void vdo_drain_slab_depot(struct slab_depot *depot, conststruct admin_state_code *operation, struct vdo_completion *parent)
{
vdo_schedule_operation(depot->action_manager, operation,
NULL, drain_allocator, NULL, parent);
}
/**
 * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped.
 * @allocator: The allocator being resumed.
 *
 * NOTE(review): the visible text ends after the resume-failure branch, with no closing brace
 * for the function, so the success path appears to be missing here. `staticvoid` also looks
 * like `static void` with its whitespace dropped. Confirm against the authoritative source.
 */
staticvoid resume_scrubbing(struct block_allocator *allocator)
{ int result; struct slab_scrubber *scrubber = &allocator->scrubber;
/* has_slabs_to_scrub() reports nothing to do: finish the allocator's completion and stop. */
if (!has_slabs_to_scrub(scrubber)) {
vdo_finish_completion(&allocator->completion); return;
}
/* Try to resume the scrubber's admin state; on failure, fail the allocator's completion with the error. */
result = vdo_resume_if_quiescent(&scrubber->admin_state); if (result != VDO_SUCCESS) {
vdo_fail_completion(&allocator->completion, result); return;
}
/**
 * vdo_resume_slab_depot() - Resume a suspended slab depot.
 * @depot: The depot to resume.
 * @parent: The completion to finish when the depot has resumed.
 *
 * NOTE(review): only the read-only guard is visible; the remainder of the function, including
 * its closing brace, is missing from this text. Confirm against the authoritative source.
 */
void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent)
/* A read-only vdo is not resumed: the parent is continued with VDO_READ_ONLY instead. */
{ if (vdo_is_read_only(depot->vdo)) {
vdo_continue_completion(parent, VDO_READ_ONLY); return;
}
/**
 * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a
 *                                                given recovery journal block.
 * @depot: The depot. A NULL depot is accepted and makes the call a no-op.
 * @recovery_block_number: The sequence number of the recovery journal block whose locks should be
 *                         released.
 *
 * Context: This method must be called from the journal zone thread.
 *
 * NOTE(review): only the NULL guard is visible; the remainder of the function, including its
 * closing brace, is missing from this text. Confirm against the authoritative source.
 */
void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
sequence_number_t recovery_block_number)
{ if (depot == NULL) return;
/**
 * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs.
 * @depot: The depot to scrub.
 * @parent: The object to notify when scrubbing has been launched for all zones.
 *
 * Schedules scrub_all_unrecovered_slabs as the per-zone action on the depot's action manager,
 * with no preamble and no conclusion.
 */
void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent)
{
	vdo_schedule_action(depot->action_manager,
			    NULL,
			    scrub_all_unrecovered_slabs,
			    NULL,
			    parent);
}
/**
 * get_block_allocator_statistics() - Get the total of the statistics from all the block allocators
 *                                    in the depot.
 * @depot: The slab depot.
 *
 * Return: The statistics from all block allocators in the depot.
 *
 * NOTE(review): the visible text stops right after the per-zone pointers are set up; the
 * accumulation into `totals`, the return statement and the closing braces are missing here.
 * `staticstruct` / `conststruct` also look like `static struct` / `const struct` with their
 * whitespace dropped. Confirm against the authoritative source.
 */
staticstruct block_allocator_statistics __must_check
get_block_allocator_statistics(conststruct slab_depot *depot)
{ struct block_allocator_statistics totals;
zone_count_t zone;
/* Start from all-zero totals. */
memset(&totals, 0, sizeof(totals));
/* Visit every zone's allocator and take a pointer to its statistics. */
for (zone = 0; zone < depot->zone_count; zone++) { conststruct block_allocator *allocator = &depot->allocators[zone]; conststruct block_allocator_statistics *stats = &allocator->statistics;
/** * get_ref_counts_statistics() - Get the cumulative ref_counts statistics for the depot. * @depot: The slab depot. * * Return: The cumulative statistics for all ref_counts in the depot.
*/ staticstruct ref_counts_statistics __must_check
get_ref_counts_statistics(conststruct slab_depot *depot)
{ struct ref_counts_statistics totals;
zone_count_t zone;
memset(&totals, 0, sizeof(totals));
for (zone = 0; zone < depot->zone_count; zone++) {
totals.blocks_written +=
READ_ONCE(depot->allocators[zone].ref_counts_statistics.blocks_written);
}
return totals;
}
/**
 * get_slab_journal_statistics() - Get the aggregated slab journal statistics for the depot.
 * @depot: The slab depot.
 *
 * Return: The aggregated statistics for all slab journals in the depot.
 *
 * NOTE(review): the visible text stops right after the per-zone statistics pointer is taken; the
 * accumulation into `totals`, the return statement and the closing braces are missing here.
 * `staticstruct` / `conststruct` also look like `static struct` / `const struct` with their
 * whitespace dropped. Confirm against the authoritative source.
 */
staticstruct slab_journal_statistics __must_check
get_slab_journal_statistics(conststruct slab_depot *depot)
{ struct slab_journal_statistics totals;
zone_count_t zone;
/* Start from all-zero totals. */
memset(&totals, 0, sizeof(totals));
/* Visit every zone and take a pointer to its allocator's slab journal statistics. */
for (zone = 0; zone < depot->zone_count; zone++) { conststruct slab_journal_statistics *stats =
&depot->allocators[zone].slab_journal_statistics;
/**
 * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the
 *                                   slab depot.
 * @depot: The slab depot.
 * @stats: The vdo statistics structure to partially fill.
 *
 * NOTE(review): the visible text ends after the loop that totals the unrecovered slabs; nothing
 * is written to @stats yet and the function's closing brace is missing, so the tail of this
 * function appears to be lost. `conststruct` also looks like `const struct` with its whitespace
 * dropped. Confirm against the authoritative source.
 */
void vdo_get_slab_depot_statistics(conststruct slab_depot *depot, struct vdo_statistics *stats)
{
/* Take a single READ_ONCE() sample of the depot's slab count. */
slab_count_t slab_count = READ_ONCE(depot->slab_count);
slab_count_t unrecovered = 0;
zone_count_t zone;
/* Total each zone's scrubber slab count into `unrecovered`. */
for (zone = 0; zone < depot->zone_count; zone++) { /* The allocators are responsible for thread safety. */
unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count);
}
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.103Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-29)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.