/* * fs/ext4/fast_commit.c * * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> * * Ext4 fast commits routines.
*/ #include"ext4.h" #include"ext4_jbd2.h" #include"ext4_extents.h" #include"mballoc.h"
#include <linux/lockdep.h>

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record delta in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Prepare all the inodes to write out their data by setting
 *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
 *     deleted while it is being flushed.
 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
 *     state.
 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
 *     all the existing handles finish and no new handles can start.
 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
 *     by setting "EXT4_STATE_FC_COMMITTING" state.
 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
 *     starting of new handles. If new handles try to start an update on
 *     any of the inodes that are being committed, ext4_fc_track_inode()
 *     will block until those inodes have finished the fast commit.
 * [6] Commit all the directory entry updates in the fast commit space.
 * [7] Commit all the changed inodes in the fast commit space and clear
 *     "EXT4_STATE_FC_COMMITTING" for these inodes.
 * [8] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 *
 * All the inode updates must be enclosed within jbd2_journal_start()
 * and jbd2_journal_stop() similar to JBD2 journaling.
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): This makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space.
Here's an example: * * - Create a new file A and remove existing file B * - fsync() * - Append contents to file A * - Truncate file A * - fsync() * * The fast commit space at the end of above operations would look like this: * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| * * Replay code should thus check for all the valid tails in the FC area. * * Fast Commit Replay Idempotence * ------------------------------ * * Fast commits tags are idempotent in nature provided the recovery code follows * certain rules. The guiding principle that the commit path follows while * committing is that it stores the result of a particular operation instead of * storing the procedure. * * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' * was associated with inode 10. During fast commit, instead of storing this * operation as a procedure "rename a to b", we store the resulting file system * state as a "series" of outcomes: * * - Link dirent b to inode 10 * - Unlink dirent a * - Inode <10> with valid refcount * * Now when recovery code runs, it needs "enforce" this state on the file * system. This is what guarantees idempotence of fast commit replay. * * Let's take an example of a procedure that is not idempotent and see how fast * commits make it idempotent. Consider following sequence of operations: * * rm A; mv B A; read A * (x) (y) (z) * * (x), (y) and (z) are the points at which we can crash. If we store this * sequence of operations as is then the replay is not idempotent. Let's say * while in replay, we crash at (z). During the second replay, file A (which was * actually created as a result of "mv B A" operation) would get deleted. Thus, * file named A would be absent when we try to read A. So, this sequence of * operations is not idempotent. However, as mentioned above, instead of storing * the procedure fast commits store the outcome of each procedure. 
Thus the fast * commit log for above procedure would be as follows: * * (Let's assume dirent A was linked to inode 10 and dirent B was linked to * inode 11 before the replay) * * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] * (w) (x) (y) (z) * * If we crash at (z), we will have file A linked to inode 11. During the second * replay, we will remove file A (inode 11). But we will create it back and make * it point to inode 11. We won't find B, so we'll just skip that step. At this * point, the refcount for inode 11 is not reliable, but that gets fixed by the * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * * Locking * ------- * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit * dentry queue. ei->i_fc_lock protects the fast commit related info in a given * inode. Most of the code avoids acquiring both the locks, but if one must do * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. * * TODOs * ----- * * 0) Fast commit replay path hardening: Fast commit replay code should use * journal handles to make sure all the updates it does during the replay * path are atomic. With that if we crash during fast commit replay, after * trying to do recovery again, we will find a file system where fast commit * area is invalid (because new full commit would be found). In order to deal * with that, fast commit replay code should ensure that the "FC_REPLAY" * superblock state is persisted before starting the replay, so that after * the crash, fast commit recovery code can look at that flag and perform * fast commit recovery even if that area is invalidated by later full * commits. * * 1) Handle more ineligible cases. * * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent * status tree. 
This would get rid of the need to call ext4_fc_track_inode() * before acquiring i_data_sem. To do that we would need to ensure that * modified extents from the extent status tree are not evicted from memory.
*/
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;
	wait_queue_head_t *wq;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	mutex_lock(&sbi->s_fc_lock);
	/* Not tracked anywhere: nothing to remove. */
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		mutex_unlock(&sbi->s_fc_lock);
		return;
	}

	/*
	 * Since ext4_fc_del is called from ext4_evict_inode while having a
	 * handle open, there is no need for us to wait here even if a fast
	 * commit is going on. That is because, if this inode is being
	 * committed, ext4_mark_inode_dirty would have waited for inode commit
	 * operation to finish before we come here. So, by the time we come
	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
	 * here.
	 *
	 * We may come here without any handles open in the "no_delete" case of
	 * ext4_evict_inode as well. However, if that happens, we first mark the
	 * file system as fast commit ineligible anyway. So, even in that case,
	 * it is okay to remove the inode from the fc list.
	 */
	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
	/*
	 * If the commit path is still flushing this inode's data, wait for it
	 * to finish before unlinking the inode from the fc list. The wait bit
	 * lives in a different field depending on word size (see the
	 * BITS_PER_LONG test below).
	 */
	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_FLUSHING_DATA);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_FLUSHING_DATA);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
			/* Drop s_fc_lock so the flusher can make progress. */
			mutex_unlock(&sbi->s_fc_lock);
			schedule();
			mutex_lock(&sbi->s_fc_lock);
		}
		finish_wait(wq, &wait.wq_entry);
	}
	list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since it is not needed to log it anyways.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		mutex_unlock(&sbi->s_fc_lock);
		return;
	}
	/*
	 * NOTE(review): the remainder of this function (teardown of the
	 * fc_dentry create references and the closing brace) is truncated
	 * in this chunk — confirm against the full source.
	 */
/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool tid_found = true;
	bool was_ineligible;
	tid_t tid = 0;

	if (ext4_fc_disabled(sb))
		return;

	/*
	 * Pick the tid this ineligibility applies to: prefer the caller's
	 * handle, otherwise fall back to the journal's currently running
	 * transaction (if any).
	 */
	if (handle && !IS_ERR(handle)) {
		tid = handle->h_transaction->t_tid;
	} else {
		journal_t *journal = sbi->s_journal;

		read_lock(&journal->j_state_lock);
		if (journal->j_running_transaction)
			tid = journal->j_running_transaction->t_tid;
		else
			tid_found = false;
		read_unlock(&journal->j_state_lock);
	}

	mutex_lock(&sbi->s_fc_lock);
	/*
	 * Only move s_fc_ineligible_tid forward: overwrite it when we were
	 * not already ineligible, or when the new tid is strictly newer than
	 * the recorded one.
	 */
	was_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (tid_found &&
	    (!was_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
		sbi->s_fc_ineligible_tid = tid;
	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	mutex_unlock(&sbi->s_fc_lock);

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
/*
 * Generic fast commit tracking function. If this is the first time this we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	spin_lock(&ei->i_fc_lock);
	/*
	 * A matching i_sync_tid means this inode was already tracked during
	 * the current transaction, so the callback only needs to update the
	 * previously recorded value; otherwise reset fc state and start
	 * tracking afresh for this tid.
	 */
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(handle, inode, args, update);
	spin_unlock(&ei->i_fc_lock);
	if (!enqueue)
		return ret;
	/*
	 * NOTE(review): the enqueue path (adding the inode to the fast commit
	 * queue) and the closing brace are truncated in this chunk.
	 */
/* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates().
*/ if (dentry_update->op == EXT4_FC_TAG_CREAT) {
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
mutex_unlock(&sbi->s_fc_lock);
spin_lock(&ei->i_fc_lock);
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return;
}
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return;
/* * If we come here, we may sleep while waiting for the inode to * commit. We shouldn't be holding i_data_sem when we go to sleep since * the commit path needs to grab the lock while committing the inode.
*/
lockdep_assert_not_held(&ei->i_data_sem);
/* * From this point on, this inode will not be committed either * by fast or full commit as long as the handle is open.
*/
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(handle, inode, ret);
}
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 * new block is from jbd2 and CRC is updated to reflect the padding
 * we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
	 * cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	/* Lazily grab the first fast commit buffer from jbd2. */
	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a new
	 * block and allocate the bytes at the start of that new block.
	 *
	 * NOTE(review): the statements that actually write the PAD tlv into
	 * *dst and fold the block into *crc appear to be missing from this
	 * chunk (tl is otherwise unused here) — confirm against full source.
	 */
	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;
	/*
	 * NOTE(review): the remainder of this function (filling in the tail
	 * TLV and committing it) is truncated in this chunk.
	 */
/*
 * Writes inode in the fast commit space under TLV with tag @tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	/* Locate the on-disk copy of the inode. */
	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;
	/*
	 * NOTE(review): the remainder of this function (sizing and copying
	 * the inode into the fast commit area) is truncated in this chunk.
	 */
/* Flushes data of all the inodes in the commit queue. */
static int ext4_fc_flush_data(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	/* First submit data writeback for every inode on the main queue... */
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ret = jbd2_submit_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
	}

	/* ...then wait for all the submitted writeback to complete. */
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ret = jbd2_wait_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
	}
	/*
	 * NOTE(review): the function's final "return 0;" and closing brace
	 * are truncated in this chunk.
	 */
if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0;
list_for_each_entry_safe(fc_dentry, fc_dentry_n,
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; continue;
} /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode. Also, the corresponding inode could have been * deleted, in which case, we don't need to do anything.
*/ if (list_empty(&fc_dentry->fcd_dilist)) continue;
ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
/* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code.
*/
ret = ext4_fc_write_inode(inode, crc); if (ret) return ret;
ret = ext4_fc_write_inode_data(inode, crc); if (ret) return ret; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC;
} return 0;
}
/* * Step 1: Mark all inodes on s_fc_q[MAIN] with * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over.
*/
mutex_lock(&sbi->s_fc_lock);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA);
}
mutex_unlock(&sbi->s_fc_lock);
/* Step 2: Flush data for all the eligible inodes. */
ret = ext4_fc_flush_data(journal);
/* * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume.
*/
mutex_lock(&sbi->s_fc_lock);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_FLUSHING_DATA); #if (BITS_PER_LONG < 64)
wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); #else
wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); #endif
}
/* * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before * the waiter checks the bit. Pairs with implicit barrier in * prepare_to_wait() in ext4_fc_del().
*/
smp_mb();
mutex_unlock(&sbi->s_fc_lock);
/* * If we encountered error in Step 2, return it now after clearing * EXT4_STATE_FC_FLUSHING_DATA bit.
*/ if (ret) return ret;
/* Step 4: Mark all inodes as being committed. */
jbd2_journal_lock_updates(journal); /* * The journal is now locked. No more handles can start and all the * previous handles are now drained. We now mark the inodes on the * commit queue as being committed.
*/
mutex_lock(&sbi->s_fc_lock);
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
}
mutex_unlock(&sbi->s_fc_lock);
jbd2_journal_unlock_updates(journal);
/* * Step 5: If file system device is different from journal device, * issue a cache flush before we start writing fast commit blocks.
*/ if (journal->j_fs_dev != journal->j_dev)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug); /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* * Step 6.1: Add a head tag only if this is the first fast * commit in this TID.
*/
head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
head.fc_tid = cpu_to_le32(
sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
(u8 *)&head, &crc)) {
ret = -ENOSPC; goto out;
}
}
/* Step 6.2: Now write all the dentry updates. */
mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) goto out;
/* Step 6.3: Now write all the changed inodes to disk. */
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue;
ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out;
ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out;
} /* Step 6.4: Finally write tail tag to conclude this fast commit. */
ret = ext4_fc_write_tail(sb, crc);
/*
 * Fold the outcome of one fast commit attempt into this super block's
 * fast commit statistics and emit the commit-stop tracepoint.
 */
static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
		   status, commit_tid);

	switch (status) {
	case EXT4_FC_STATUS_OK:
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		/*
		 * Moving average weighted 3:1 towards history, so a single
		 * outlier commit does not swing the estimate too strongly.
		 */
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
		break;
	case EXT4_FC_STATUS_FAILED:
		/* A failed commit counts as both failed and ineligible. */
		stats->fc_failed_commits++;
		fallthrough;
	case EXT4_FC_STATUS_INELIGIBLE:
		stats->fc_ineligible_commits++;
		break;
	default:
		stats->fc_skipped_commits++;
		break;
	}

	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;
	int old_ioprio, journal_ioprio;

	/* Without fast commit enabled, just do a regular full commit. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    tid_gt(commit_tid, journal->j_commit_sequence))
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				     commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				     commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	/*
	 * Now that we know that this thread is going to do a fast commit,
	 * elevate the priority to match that of the journal thread.
	 */
	if (journal->j_task->io_context)
		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
	else
		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
	set_task_ioprio(current, journal_ioprio);

	/* Count buffers before/after so we know how many blocks we wrote. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	set_task_ioprio(current, old_ioprio);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;
	/*
	 * NOTE(review): the "fallback:" label targeted by the gotos above,
	 * the initialization of start_time/old_ioprio, and the closing brace
	 * are truncated in this chunk — confirm against the full source.
	 */
/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct ext4_fc_dentry_update *fc_dentry;

	mutex_lock(&sbi->s_fc_lock);
	/* Drain the main queue, resetting per-inode fast commit state. */
	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
				      struct ext4_inode_info, i_fc_list);
		list_del_init(&ei->i_fc_list);
		ext4_clear_inode_state(&ei->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (tid_geq(tid, ei->i_sync_tid)) {
			ext4_fc_reset_inode(&ei->vfs_inode);
		} else if (full) {
			/*
			 * We are called after a full commit, inode has been
			 * modified while the commit was running. Re-enqueue
			 * the inode into STAGING, which will then be splice
			 * back into MAIN. This cannot happen during
			 * fastcommit because the journal is locked all the
			 * time in that case (and tid doesn't increase so
			 * tid check above isn't reliable).
			 */
			list_add_tail(&ei->i_fc_list,
				      &sbi->s_fc_q[FC_Q_STAGING]);
		}

		/*
		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
		 * visible before we send the wakeup. Pairs with implicit
		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
		 */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}
	/*
	 * NOTE(review): the remainder of this function (dentry queue cleanup
	 * and closing brace) is truncated in this chunk.
	 */
if (IS_ERR(inode)) {
ext4_debug("Inode %d not found", darg.ino); return 0;
}
old_parent = ext4_iget(sb, darg.parent_ino,
EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) {
ext4_debug("Dir with inode %d not found", darg.parent_ino);
iput(inode); return 0;
}
ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT)
ret = 0;
iput(old_parent);
iput(inode); return ret;
}
dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) {
ext4_debug("Dir with inode %d not found.", darg->parent_ino);
dir = NULL; goto out;
}
dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) {
ext4_debug("Failed to obtain dentry");
dentry_dir = NULL; goto out;
}
dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) {
ext4_debug("Inode dentry not created.");
ret = -ENOMEM; goto out;
}
ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete.
*/ if (ret && ret != -EEXIST) {
ext4_debug("Failed to link\n"); goto out;
}
ret = 0;
out: if (dentry_dir) {
d_drop(dentry_dir);
dput(dentry_dir);
} elseif (dir) {
iput(dir);
} if (dentry_inode) {
d_drop(dentry_inode);
dput(dentry_inode);
}
return ret;
}
/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb,
			       struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	/*
	 * NOTE(review): the call that parses @tl/@val into darg appears to be
	 * missing from this chunk; as written, darg would be read
	 * uninitialized below — confirm against the full source.
	 */
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		/* Inode may legitimately be gone; replay just skips the tag. */
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}
/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/* Already recorded: nothing more to do. */
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	/* Array is full: grow it before appending. */
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;
	/*
	 * NOTE(review): the remainder of this function (reallocation and
	 * append of @ino) is truncated in this chunk.
	 */
/* Immediately update the inode on disk. */
ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out;
ret = sync_dirty_buffer(iloc.bh); if (ret) goto out;
ret = ext4_mark_inode_used(sb, ino); if (ret) goto out;
/* Given that we just wrote the inode on disk, this SHOULD succeed. */
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) {
ext4_debug("Inode not found."); return -EFSCORRUPTED;
}
/* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode.
*/ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
ext4_ext_replay_set_iblocks(inode);
ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
sync_dirty_buffer(iloc.bh);
brelse(iloc.bh);
out:
iput(inode); if (!ret)
blkdev_issue_flush(sb->s_bdev);
return 0;
}
/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	/*
	 * NOTE(review): parsing of @tl/@val into darg appears to be missing
	 * from this chunk — confirm against the full source.
	 */

	/* This takes care of update group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			/* Treated as non-fatal: replay continues. */
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}
/*
 * Record physical disk regions which are in use as per fast commit area,
 * and used by inodes during replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
			   ext4_lblk_t lblk, ext4_fsblk_t pblk, int len,
			   int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	/* Region array is full: grow it before appending. */
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;
	/*
	 * NOTE(review): the remainder of this function (reallocation and
	 * append of the new region) is truncated in this chunk.
	 */
while (remaining > 0) {
map.m_lblk = cur;
map.m_len = remaining;
map.m_pblk = 0;
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0) goto out;
if (ret == 0) { /* Range is not mapped */
path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out;
memset(&newex, 0, sizeof(newex));
newex.ee_block = cpu_to_le32(cur);
ext4_ext_store_pblock(
&newex, start_pblk + cur - start);
newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex))
ext4_ext_mark_unwritten(&newex);
down_write(&EXT4_I(inode)->i_data_sem);
path = ext4_ext_insert_extent(NULL, inode,
path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem)); if (IS_ERR(path)) goto out; goto next;
}
if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit.
*/
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
ext4_ext_is_unwritten(ex),
start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes.
*/
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next;
}
/* Range is mapped and needs a state change */
ext4_debug("Converting from %ld to %d %lld",
map.m_flags & EXT4_MAP_UNWRITTEN,
ext4_ext_is_unwritten(ex), map.m_pblk);
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now.
*/
ext4_ext_replay_shrink_inode(inode, start + len);
next:
cur += map.m_len;
remaining -= map.m_len;
}
ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
sb->s_blocksize_bits);
out:
ext4_free_ext_path(path);
iput(inode); return 0;
}
state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) {
inode = ext4_iget(sb, state->fc_modified_inodes[i],
EXT4_IGET_NORMAL); if (IS_ERR(inode)) {
ext4_debug("Inode %d not found.",
state->fc_modified_inodes[i]); continue;
}
cur = 0;
end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
iput(inode); continue;
} while (cur < end) {
map.m_lblk = cur;
map.m_len = end - cur;
ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break;
if (ret > 0) {
path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, true);
} else {
path = NULL;
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
map.m_len, true);
} else {
cur = cur + (map.m_len ? map.m_len : 1);
}
}
iput(inode);
}
ext4_free_ext_path(path);
}
/*
 * Check if a physical block is inside one of the excluded regions recorded
 * during the fast commit scan phase. The simple allocator that runs during
 * the replay phase calls this function to see if it is okay to use a block.
 *
 * Returns true when @blk falls inside any valid recorded region (and must
 * therefore not be handed out by the replay allocator), false otherwise.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		/* Skip slots that were never filled or were zeroed out. */
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
					state->fc_regions[i].len))
			return true;
	}
	return false;
}
/* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb)
{ struct ext4_sb_info *sbi = EXT4_SB(sb);
staticbool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len)
{ switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK:
len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE:
len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE &&
len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: returntrue; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head);
} returnfalse;
}
/* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase.
*/ staticint ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off,
tid_t expected_tid)
{ struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail;
__u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex;
state = &sbi->s_fc_replay_state;
start = (u8 *)bh->b_data;
end = start + journal->j_blocksize;
if (state->fc_replay_expected_off == 0) {
state->fc_cur_tag = 0;
state->fc_replay_num_tags = 0;
state->fc_crc = 0;
state->fc_regions = NULL;
state->fc_regions_valid = state->fc_regions_used =
state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
!= EXT4_FC_TAG_HEAD) return 0;
}
if (off != state->fc_replay_expected_off) {
ret = -EFSCORRUPTED; goto out_err;
}
state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val ||
!ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err;
}
ext4_debug("Scan phase, tag:%s, blk %lld\n",
tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE:
memcpy(&ext, val, sizeof(ext));
ex = (struct ext4_extent *)&ext.fc_ex;
ret = ext4_fc_record_regions(sb,
le32_to_cpu(ext.fc_ino),
le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
ext4_ext_get_actual_len(ex), 0); if (ret < 0) break;
ret = JBD2_FC_REPLAY_CONTINUE;
fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid &&
le32_to_cpu(tail.fc_crc) == state->fc_crc) {
state->fc_replay_num_tags = state->fc_cur_tag;
state->fc_regions_valid =
state->fc_regions_used;
} else {
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -EFSBADCRC;
}
state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD:
memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) &
~EXT4_FC_SUPPORTED_FEATURES) {
ret = -EOPNOTSUPP; break;
} if (le32_to_cpu(head.fc_tid) != expected_tid) {
ret = JBD2_FC_REPLAY_STOP; break;
}
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default:
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -ECANCELED;
} if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break;
}
#ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
pr_warn("Dropping fc block %d because max_replay set\n", off);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.59 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.