Ziele Untersuchung
mit Columbo Integrität von
Datenbanken Interaktion und
Portierbarkeit Ergonomie der
Schnittstellen

Angebot Produkte Projekt Beratung

Mittel Analytik Modellierung Sprachen Algebra Logik Hardware Denken Kreativität

Zusammenhänge Gesellschaft Wirtschaft Branche Firma


products/sources/formale Sprachen/C/Linux/fs/jbd2/ (Open Source Betriebssystem Version 6.17.9^©) Datei vom 24.10.2025 mit Größe 89 kB

Quelle journal.c Sprache: C

// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/journal.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
* Copyright 1998 Red Hat corp --- All Rights Reserved
*
* Generic filesystem journal-writing code; part of the ext2fs
* journaling system.
*
* This file manages journals: areas of disk reserved for logging
* transactional updates.  This includes the kernel journaling thread
* which is responsible for scheduling updates to the log.
*
* We do not actually manage the physical storage of the journal in this
* file: that is left to a per-journal policy function, which allows us
* to store the journal within a filesystem-specified area for ext2
* journaling (ext2 can use a reserved inode for storing the log).
*/

#include <linux/module.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/freezer.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/vmalloc.h>
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>

#include <linux/uaccess.h>
#include <asm/page.h>

#ifdef CONFIG_JBD2_DEBUG
/*
 * Runtime debug verbosity, exposed as the "jbd2_debug" module parameter
 * (mode 0644).  __jbd2_debug() drops every message whose level is greater
 * than this value.
 */
static ushort jbd2_journal_enable_debug __read_mostly;

module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
#endif

/*
 * jbd2 symbols exported for use by other kernel code.  A few more exports
 * sit directly below the functions they belong to.
 */
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
EXPORT_SYMBOL(jbd2_journal_lock_updates);
EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
EXPORT_SYMBOL(jbd2_journal_flush);
EXPORT_SYMBOL(jbd2_journal_revoke);

EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_folio);
EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);

/* Forward declaration so the helper can be called ahead of its definition. */
static int jbd2_journal_create_slab(size_t slab_size);

#ifdef CONFIG_JBD2_DEBUG
void __jbd2_debug(int level, const char *file, const char *func,
    unsigned int line, const char *fmt, ...)
{
struct va_format vaf;
va_list args;

if (level > jbd2_journal_enable_debug)
  return;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
va_end(args);
}
#endif

/* Checksumming functions */

/*
 * Compute the checksum of a journal superblock.  The s_checksum field is
 * zeroed while the checksum is calculated and put back afterwards, so the
 * superblock is left unmodified on return.  The result is big-endian.
 */
static __be32 jbd2_superblock_csum(journal_superblock_t *sb)
{
	__be32 saved = sb->s_checksum;
	__u32 crc;

	sb->s_checksum = 0;
	crc = jbd2_chksum(~0, (char *)sb, sizeof(*sb));
	sb->s_checksum = saved;

	return cpu_to_be32(crc);
}

/*
 * Commit timer callback: recover the journal that embeds the expired
 * timer and wake up its journal thread.
 */

static void commit_timeout(struct timer_list *t)
{
	journal_t *owner = timer_container_of(owner, t, j_commit_timer);
	struct task_struct *thread = owner->j_task;

	wake_up_process(thread);
}

/*
 * kjournald2: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk. If a fast commit is ongoing
 *    journal thread waits until it's done and then continues from
 *    there on.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 *
 * @arg is the journal_t this thread serves.  The thread runs until
 * JBD2_UNMOUNT is set in j_flags and then returns 0.
 */

static int kjournald2(void *arg)
{
journal_t *journal = arg;
transaction_t *transaction;

/*
 * Set up an interval timer which can be used to trigger a commit wakeup
 * after the commit interval expires
 */
timer_setup(&journal->j_commit_timer, commit_timeout, 0);

set_freezable();

/*
 * Record that the journal thread is running.  jbd2_journal_start_thread()
 * sleeps on j_wait_done_commit until j_task becomes non-NULL.
 */
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);

/*
 * Make sure that no allocations from this kernel thread will ever
 * recurse to the fs layer because we are responsible for the
 * transaction commit and any fs involvement might get stuck waiting for
 * the transaction commit.
 */
memalloc_nofs_save();

/*
 * And now, wait forever for commit wakeup events.
 */
write_lock(&journal->j_state_lock);

/* j_state_lock is write-held every time control reaches "loop". */
loop:
if (journal->j_flags & JBD2_UNMOUNT)
  goto end_loop;

jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
  journal->j_commit_sequence, journal->j_commit_request);

/* A commit has been requested that has not been performed yet. */
if (journal->j_commit_sequence != journal->j_commit_request) {
  jbd2_debug(1, "OK, requests differ\n");
  /* The commit itself runs without j_state_lock held. */
  write_unlock(&journal->j_state_lock);
  timer_delete_sync(&journal->j_commit_timer);
  jbd2_journal_commit_transaction(journal);
  write_lock(&journal->j_state_lock);
  goto loop;
}

/* Nothing left to commit: let anyone waiting for a commit to finish go. */
wake_up(&journal->j_wait_done_commit);
if (freezing(current)) {
  /*
   * The simpler the better. Flushing journal isn't a
   * good idea, because that depends on threads that may
   * be already stopped.
   */
  jbd2_debug(1, "Now suspending kjournald2\n");
  write_unlock(&journal->j_state_lock);
  try_to_freeze();
  write_lock(&journal->j_state_lock);
} else {
  /*
   * We assume on resume that commits are already there,
   * so we don't sleep
   */
  DEFINE_WAIT(wait);

  prepare_to_wait(&journal->j_wait_commit, &wait,
    TASK_INTERRUPTIBLE);
  /*
   * Sleep only when there is no running transaction or the running
   * transaction has not reached its expiry time yet.
   */
  transaction = journal->j_running_transaction;
  if (transaction == NULL ||
      time_before(jiffies, transaction->t_expires)) {
   write_unlock(&journal->j_state_lock);
   schedule();
   write_lock(&journal->j_state_lock);
  }
  finish_wait(&journal->j_wait_commit, &wait);
}

jbd2_debug(1, "kjournald2 wakes\n");

/*
 * Were we woken up by a commit wakeup event?  If the running transaction
 * has expired, request its commit; the next pass through the loop then
 * sees that j_commit_request differs from j_commit_sequence.
 */
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
  journal->j_commit_request = transaction->t_tid;
  jbd2_debug(1, "woke because of timeout\n");
}
goto loop;

end_loop:
timer_delete_sync(&journal->j_commit_timer);
/* journal_kill_thread() waits on j_wait_done_commit for j_task == NULL. */
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jbd2_debug(1, "Journal thread exiting.\n");
write_unlock(&journal->j_state_lock);
return 0;
}

/*
 * Spawn the kjournald2 thread for @journal (named "jbd2/<devname>") and
 * wait until it has registered itself in journal->j_task.
 *
 * Returns 0 on success or the error reported by kthread_run().
 */
static int jbd2_journal_start_thread(journal_t *journal)
{
	struct task_struct *thread = kthread_run(kjournald2, journal,
						 "jbd2/%s",
						 journal->j_devname);

	if (IS_ERR(thread))
		return PTR_ERR(thread);

	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
	return 0;
}

/*
 * Ask the journal thread to exit by setting JBD2_UNMOUNT, then keep waking
 * it and waiting until it has cleared journal->j_task.
 */
static void journal_kill_thread(journal_t *journal)
{
	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_UNMOUNT;

	for (;;) {
		if (!journal->j_task)
			break;

		/* Wake and wait without the state lock held. */
		write_unlock(&journal->j_state_lock);
		wake_up(&journal->j_wait_commit);
		wait_event(journal->j_wait_done_commit,
			   journal->j_task == NULL);
		write_lock(&journal->j_state_lock);
	}

	write_unlock(&journal->j_state_lock);
}

static inline bool jbd2_data_needs_escaping(char *data)
{
return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER);
}

/* Escape a block by overwriting the first word of @data with zero. */
static inline void jbd2_data_do_escape(char *data)
{
	unsigned int *first_word = (unsigned int *)data;

	*first_word = 0;
}

/*
 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO. If we end up using the existing buffer_head's data
 * for the write, then we have to make sure nobody modifies it while the
 * IO is in progress. do_get_write_access() handles this.
 *
 * @transaction: transaction the buffer is filed under (as BJ_Shadow)
 * @jh_in:       journal head of the metadata buffer to be logged
 * @bh_out:      receives the newly built buffer_head to be used for IO
 * @blocknr:     destination block, stored in the new buffer_head's b_blocknr
 *
 * Return value:
 *  =0: Finished OK without escape
 *  =1: Finished OK with escape
 */

int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
      struct journal_head  *jh_in,
      struct buffer_head **bh_out,
      sector_t blocknr)
{
int do_escape = 0;
struct buffer_head *new_bh;
struct folio *new_folio;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;

/*
 * The buffer really shouldn't be locked: only the current committing
 * transaction is allowed to write it, so nobody else is allowed
 * to do any IO.
 *
 * akpm: except if we're journalling data, and write() output is
 * also part of a shared mapping, and another thread has
 * decided to launch a writepage() against this buffer.
 */
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

/* __GFP_NOFAIL: the result is used below without a NULL check. */
new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);

spin_lock(&jh_in->b_state_lock);
/*
 * If a new transaction has already done a buffer copy-out, then
 * we use that version of the data for the commit.
 */
if (jh_in->b_frozen_data) {
  new_folio = virt_to_folio(jh_in->b_frozen_data);
  new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
  do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data);
  if (do_escape)
   jbd2_data_do_escape(jh_in->b_frozen_data);
} else {
  char *tmp;
  char *mapped_data;

  /* No frozen copy yet: start out pointing at the live buffer data. */
  new_folio = bh_in->b_folio;
  new_offset = offset_in_folio(new_folio, bh_in->b_data);
  mapped_data = kmap_local_folio(new_folio, new_offset);
  /*
   * Fire data frozen trigger if data already wasn't frozen. Do
   * this before checking for escaping, as the trigger may modify
   * the magic offset.  If a copy-out happens afterwards, it will
   * have the correct data in the buffer.
   */
  jbd2_buffer_frozen_trigger(jh_in, mapped_data,
        jh_in->b_triggers);
  do_escape = jbd2_data_needs_escaping(mapped_data);
  kunmap_local(mapped_data);
  /*
   * Do we need to do a data copy?
   */
  if (!do_escape)
   goto escape_done;

  /* b_state_lock is dropped around the allocation and retaken after. */
  spin_unlock(&jh_in->b_state_lock);
  tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
  spin_lock(&jh_in->b_state_lock);
  /*
   * Re-check after retaking the lock: if a frozen copy appeared while
   * it was dropped, discard our allocation and use that copy instead.
   */
  if (jh_in->b_frozen_data) {
   jbd2_free(tmp, bh_in->b_size);
   goto copy_done;
  }

  jh_in->b_frozen_data = tmp;
  memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
  /*
   * This isn't strictly necessary, as we're using frozen
   * data for the escaping, but it keeps consistency with
   * b_frozen_data usage.
   */
  jh_in->b_frozen_triggers = jh_in->b_triggers;

copy_done:
  /* Point the IO at the frozen copy and escape the copy. */
  new_folio = virt_to_folio(jh_in->b_frozen_data);
  new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
  jbd2_data_do_escape(jh_in->b_frozen_data);
}

escape_done:
/* Describe the journal write: journal device, target block, source data. */
folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
/* Remember the original buffer_head behind this IO buffer. */
new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);

*bh_out = new_bh;

/*
 * The to-be-written buffer needs to get moved to the io queue,
 * and the original buffer whose contents we are shadowing or
 * copying is moved to the transaction's shadow queue.
 */
JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
spin_unlock(&jh_in->b_state_lock);

return do_escape;
}

/*
* Allocation code for the journal file.  Manage the space left in the
* journal, so that we can begin checkpointing when appropriate.
*/

/*
 * Request a commit of transaction @target.
 *
 * Called with j_state_lock locked for writing.
 * Returns 1 if a transaction commit was started, 0 otherwise.
 */
static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
	transaction_t *running = journal->j_running_transaction;

	/* Return if the txn has already requested to be committed */
	if (journal->j_commit_request == target)
		return 0;

	/*
	 * The only transaction we can possibly wait upon is the
	 * currently running transaction (if it exists).  Otherwise,
	 * the target tid must be an old one.
	 */
	if (running && running->t_tid == target) {
		/*
		 * We want a new commit: OK, mark the request and wakeup the
		 * commit thread.  We do _not_ do the commit ourselves.
		 */
		journal->j_commit_request = target;
		jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
			   journal->j_commit_request,
			   journal->j_commit_sequence);
		running->t_requested = jiffies;
		wake_up(&journal->j_wait_commit);
		return 1;
	}

	/*
	 * This should never happen, but if it does, preserve the evidence
	 * before kjournald goes into a loop and increments
	 * j_commit_sequence beyond all recognition.
	 */
	if (!tid_geq(journal->j_commit_request, target))
		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
			  journal->j_commit_request,
			  journal->j_commit_sequence,
			  target, running ? running->t_tid : 0);

	return 0;
}

/*
 * Locked wrapper around __jbd2_log_start_commit(): takes j_state_lock for
 * writing, requests the commit of @tid and passes the result through.
 */
int jbd2_log_start_commit(journal_t *journal, tid_t tid)
{
	int started;

	write_lock(&journal->j_state_lock);
	started = __jbd2_log_start_commit(journal, tid);
	write_unlock(&journal->j_state_lock);

	return started;
}

/*
 * Force and wait for any uncommitted transaction.  The running transaction
 * can only be forced when the caller has no active handle (otherwise we
 * would deadlock); in that case only a committing transaction is waited
 * for.
 *
 * Returns: <0 in case of error,
 *           0 if nothing to commit,
 *           1 if transaction was successfully committed.
 */
static int __jbd2_journal_force_commit(journal_t *journal)
{
	transaction_t *target = NULL;
	bool must_request = false;
	tid_t tid;
	int err;

	read_lock(&journal->j_state_lock);
	if (journal->j_running_transaction && !current->journal_info) {
		target = journal->j_running_transaction;
		/* Only ask for a commit that has not been requested yet. */
		must_request = !tid_geq(journal->j_commit_request,
					target->t_tid);
	} else if (journal->j_committing_transaction) {
		target = journal->j_committing_transaction;
	}

	if (!target) {
		/* Nothing to commit */
		read_unlock(&journal->j_state_lock);
		return 0;
	}

	tid = target->t_tid;
	read_unlock(&journal->j_state_lock);

	if (must_request)
		jbd2_log_start_commit(journal, tid);

	err = jbd2_log_wait_commit(journal, tid);
	return err ? err : 1;
}

/**
 * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
 * calling process is not within transaction.
 *
 * @journal: journal to force
 * Returns true if progress was made.
 *
 * This is used for forcing out undo-protected data which contains
 * bitmaps, when the fs is running out of space.
 */
int jbd2_journal_force_commit_nested(journal_t *journal)
{
	/* Only a positive result means a transaction was committed. */
	return __jbd2_journal_force_commit(journal) > 0;
}

/**
 * jbd2_journal_force_commit() - force any uncommitted transactions
 * @journal: journal to force
 *
 * The caller wants an unconditional commit.  We can only force the running
 * transaction if we don't have an active handle, otherwise we will
 * deadlock; this is asserted on entry.
 *
 * Returns 0 on success (including "nothing to commit") or a negative error.
 */
int jbd2_journal_force_commit(journal_t *journal)
{
	int status;

	J_ASSERT(!current->journal_info);

	status = __jbd2_journal_force_commit(journal);
	/* Collapse the helper's "committed" (1) result into plain success. */
	return status > 0 ? 0 : status;
}

/*
 * Start a commit of the current running transaction (if any).  Returns 1
 * if a transaction is going to be committed (or is currently already
 * committing), and stores its tid in *ptid when @ptid is not NULL.
 * Returns 0 when there is neither a running nor a committing transaction.
 */
int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
{
	transaction_t *txn;
	int pending = 0;

	write_lock(&journal->j_state_lock);

	txn = journal->j_running_transaction;
	if (txn) {
		/* Make sure the running transaction's commit is scheduled. */
		__jbd2_log_start_commit(journal, txn->t_tid);
	} else {
		/*
		 * If commit has been started, then we have to wait for
		 * completion of that transaction.
		 */
		txn = journal->j_committing_transaction;
	}

	if (txn) {
		if (ptid)
			*ptid = txn->t_tid;
		pending = 1;
	}

	write_unlock(&journal->j_state_lock);
	return pending;
}

/*
 * Return 1 if the transaction @tid has not yet sent the barrier request
 * connected with its commit.  If 0 is returned, the transaction may or may
 * not have sent the barrier.  Used to avoid sending a barrier twice in
 * common cases.
 */
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
	transaction_t *committing, *running;
	int will_send = 0;

	if (!(journal->j_flags & JBD2_BARRIER))
		return 0;

	read_lock(&journal->j_state_lock);

	/* Transaction already committed? */
	if (tid_geq(journal->j_commit_sequence, tid))
		goto unlock;

	committing = journal->j_committing_transaction;
	if (!committing || committing->t_tid != tid) {
		/*
		 * The queried transaction hasn't started committing,
		 * it must still be running.
		 */
		running = journal->j_running_transaction;
		if (!WARN_ON_ONCE(!running || running->t_tid != tid)) {
			running->t_need_data_flush = 1;
			will_send = 1;
		}
		goto unlock;
	}

	/*
	 * Transaction is being committed and we already proceeded to
	 * submitting a flush to fs partition?
	 */
	if (journal->j_fs_dev != journal->j_dev)
		will_send = committing->t_need_data_flush &&
			    committing->t_state < T_COMMIT_DFLUSH;
	else
		will_send = committing->t_state < T_COMMIT_JFLUSH;

unlock:
	read_unlock(&journal->j_state_lock);
	return will_send;
}
EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);

/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 *
 * @journal: journal the transaction belongs to
 * @tid:     transaction ID to wait for
 *
 * Returns 0, or -EIO if the journal is found aborted after the wait.
 */
int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
int err = 0;

read_lock(&journal->j_state_lock);
#ifdef CONFIG_PROVE_LOCKING
/*
 * Some callers make sure transaction is already committing and in that
 * case we cannot block on open handles anymore. So don't warn in that
 * case.
 */
if (tid_gt(tid, journal->j_commit_sequence) &&
     (!journal->j_committing_transaction ||
      journal->j_committing_transaction->t_tid != tid)) {
  /* The annotation is issued with j_state_lock dropped. */
  read_unlock(&journal->j_state_lock);
  jbd2_might_wait_for_commit(journal);
  read_lock(&journal->j_state_lock);
}
#endif
#ifdef CONFIG_JBD2_DEBUG
/* Debug aid: complain when no commit covering @tid has been requested. */
if (!tid_geq(journal->j_commit_request, tid)) {
  printk(KERN_ERR
         "%s: error: j_commit_request=%u, tid=%u\n",
         __func__, journal->j_commit_request, tid);
}
#endif
/* Loop until j_commit_sequence has caught up with @tid. */
while (tid_gt(tid, journal->j_commit_sequence)) {
  jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
      tid, journal->j_commit_sequence);
  read_unlock(&journal->j_state_lock);
  /* Kick the journal thread, then sleep with the lock released. */
  wake_up(&journal->j_wait_commit);
  wait_event(journal->j_wait_done_commit,
    !tid_gt(tid, journal->j_commit_sequence));
  read_lock(&journal->j_state_lock);
}
read_unlock(&journal->j_state_lock);

if (unlikely(is_journal_aborted(journal)))
  err = -EIO;
return err;
}

/*
 * Start a fast commit.  If a fast or full commit is already ongoing, wait
 * for a wakeup on j_fc_wait and then report -EALREADY.
 *
 * Returns 0 if a new fast commit was started, -EALREADY if a fast commit
 * is not needed (either a commit was already in progress or @tid has
 * already been committed), -EINVAL if no jbd2 commit has yet been
 * performed, and -EIO if the journal is aborted.
 */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
{
	if (unlikely(is_journal_aborted(journal)))
		return -EIO;

	/*
	 * Fast commits only allowed if at least one full commit has
	 * been processed.
	 */
	if (!journal->j_stats.ts_tid)
		return -EINVAL;

	write_lock(&journal->j_state_lock);

	if (tid_geq(journal->j_commit_sequence, tid)) {
		write_unlock(&journal->j_state_lock);
		return -EALREADY;
	}

	if (journal->j_flags &
	    (JBD2_FULL_COMMIT_ONGOING | JBD2_FAST_COMMIT_ONGOING)) {
		DEFINE_WAIT(wait);

		/* Queue ourselves before dropping the lock, then sleep. */
		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		finish_wait(&journal->j_fc_wait, &wait);
		return -EALREADY;
	}

	journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
	write_unlock(&journal->j_state_lock);

	return 0;
}
EXPORT_SYMBOL(jbd2_fc_begin_commit);

/*
 * Stop a fast commit.  The cleanup callback (if any) runs first, then the
 * FAST_COMMIT_ONGOING flag is cleared and waiters on j_fc_wait are woken.
 * If @fallback is set, FULL_COMMIT_ONGOING is raised under the same lock
 * hold, so the commit of @tid is started before any other fast commit can
 * start, and the result of completing that transaction is returned.
 */
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 0, tid);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	if (fallback)
		journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	write_unlock(&journal->j_state_lock);

	wake_up(&journal->j_fc_wait);

	return fallback ? jbd2_complete_transaction(journal, tid) : 0;
}

/* End a fast commit without falling back to a full commit. */
int jbd2_fc_end_commit(journal_t *journal)
{
	const bool fallback = false;

	return __jbd2_fc_end_commit(journal, 0, fallback);
}
EXPORT_SYMBOL(jbd2_fc_end_commit);

/*
 * End a fast commit and fall back to a full commit of the currently
 * running transaction (tid 0 is passed on when none is running).
 */
int jbd2_fc_end_commit_fallback(journal_t *journal)
{
	transaction_t *running;
	tid_t tid = 0;

	read_lock(&journal->j_state_lock);
	running = journal->j_running_transaction;
	if (running)
		tid = running->t_tid;
	read_unlock(&journal->j_state_lock);

	return __jbd2_fc_end_commit(journal, tid, true);
}
EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);

/*
 * Return 1 when the transaction with the given tid has already committed.
 * j_commit_sequence is sampled once, without taking j_state_lock.
 */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
	tid_t committed = READ_ONCE(journal->j_commit_sequence);

	return tid_geq(committed, tid);
}
EXPORT_SYMBOL(jbd2_transaction_committed);

/*
 * When this function returns, the transaction corresponding to @tid will
 * be completed.  If the transaction is currently running, start committing
 * it before waiting for it to complete.  If the transaction id is stale
 * (neither the running nor the committing transaction), it is by
 * definition already completed, so just return 0.
 */
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
	transaction_t *running, *committing;

	read_lock(&journal->j_state_lock);
	running = journal->j_running_transaction;
	committing = journal->j_committing_transaction;

	if (running && running->t_tid == tid) {
		if (journal->j_commit_request != tid) {
			/* transaction not yet started, so request it */
			read_unlock(&journal->j_state_lock);
			jbd2_log_start_commit(journal, tid);
			return jbd2_log_wait_commit(journal, tid);
		}
		/* Commit already requested: just wait for it below. */
	} else if (!committing || committing->t_tid != tid) {
		read_unlock(&journal->j_state_lock);
		return 0;
	}

	read_unlock(&journal->j_state_lock);
	return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);

/*
* Log buffer allocation routines:
*/

/*
 * Take the next block at the head of the log: advance j_head (wrapping
 * from j_last back to j_first), consume one free block, and translate the
 * block taken into a physical block number stored in *retp.
 *
 * Returns the result of jbd2_journal_bmap().
 */
int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
{
	unsigned long taken;

	write_lock(&journal->j_state_lock);
	J_ASSERT(journal->j_free > 1);

	taken = journal->j_head++;
	journal->j_free--;
	if (journal->j_head == journal->j_last)
		journal->j_head = journal->j_first;
	write_unlock(&journal->j_state_lock);

	return jbd2_journal_bmap(journal, taken, retp);
}

/*
 * Map one fast commit buffer for use by the file system.
 *
 * @journal: journal whose fast commit area is used
 * @bh_out:  receives the buffer_head for the next fast commit block, or
 *           NULL on failure
 *
 * Returns 0 on success, -EINVAL when the fast commit area is exhausted,
 * -ENOMEM when no buffer_head could be obtained, or the error reported by
 * jbd2_journal_bmap().
 *
 * j_fc_off is advanced only after the buffer has been stored in
 * j_fc_wbuf[].  Advancing it before the fallible steps would leave a NULL
 * slot below j_fc_off on failure; jbd2_fc_release_bufs() stops at the
 * first NULL slot, so the buffers handed out earlier would never be put.
 */
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
{
	unsigned long long pblock;
	unsigned long blocknr;
	struct buffer_head *bh;
	int fc_off;
	int ret;

	*bh_out = NULL;

	if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last)
		return -EINVAL;

	fc_off = journal->j_fc_off;
	blocknr = journal->j_fc_first + fc_off;
	ret = jbd2_journal_bmap(journal, blocknr, &pblock);
	if (ret)
		return ret;

	bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
	if (!bh)
		return -ENOMEM;

	/* Publish the buffer first, then account for the consumed slot. */
	journal->j_fc_wbuf[fc_off] = bh;
	journal->j_fc_off++;

	*bh_out = bh;

	return 0;
}
EXPORT_SYMBOL(jbd2_fc_get_buf);

/*
 * Wait for IO completion on the last @num_blks fast commit buffers that
 * were allocated by jbd2_fc_get_buf(), dropping each buffer's reference
 * once it has completed successfully.
 *
 * Returns 0 on success or -EIO if a buffer is not up to date after the
 * wait.
 */
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
{
	const int end = journal->j_fc_off;
	const int first = end - num_blks;
	int idx;

	/*
	 * Wait in reverse order to minimize chances of us being woken up
	 * before all IOs have completed
	 */
	for (idx = end - 1; idx >= first; idx--) {
		struct buffer_head *bh = journal->j_fc_wbuf[idx];

		wait_on_buffer(bh);
		if (unlikely(!buffer_uptodate(bh))) {
			/*
			 * Update j_fc_off so jbd2_fc_release_bufs() can
			 * release the remaining buffer heads.
			 */
			journal->j_fc_off = idx + 1;
			return -EIO;
		}
		put_bh(bh);
		journal->j_fc_wbuf[idx] = NULL;
	}

	return 0;
}
EXPORT_SYMBOL(jbd2_fc_wait_bufs);

/*
 * Drop the references on fast commit buffers still recorded in
 * j_fc_wbuf[], walking down from j_fc_off - 1 and stopping at the first
 * empty slot.
 */
void jbd2_fc_release_bufs(journal_t *journal)
{
	int idx = journal->j_fc_off;

	while (--idx >= 0) {
		struct buffer_head *bh = journal->j_fc_wbuf[idx];

		if (!bh)
			break;
		put_bh(bh);
		journal->j_fc_wbuf[idx] = NULL;
	}
}
EXPORT_SYMBOL(jbd2_fc_release_bufs);

/*
 * Conversion of logical to physical block numbers for the journal.
 *
 * Three cases, in order of precedence:
 *  - a j_bmap callback is set: it performs the translation;
 *  - the journal has an inode: bmap() on that inode is used, and a failed
 *    or zero mapping aborts the journal with -EIO;
 *  - otherwise the blocks are identity-mapped, so this is a no-op.  If
 *    needed, we can use j_blk_offset - everything is ready.
 *
 * On success the physical block is stored in *retp and 0 is returned.
 */
int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
		      unsigned long long *retp)
{
	sector_t block = blocknr;
	unsigned long long ret;
	int err;

	if (journal->j_bmap) {
		err = journal->j_bmap(journal, &block);
		if (!err)
			*retp = block;
		return err;
	}

	if (!journal->j_inode) {
		*retp = blocknr; /* +journal->j_blk_offset */
		return 0;
	}

	ret = bmap(journal->j_inode, &block);
	if (!ret && block) {
		*retp = block;
		return 0;
	}

	printk(KERN_ALERT "%s: journal block not found "
			  "at offset %lu on %s\n",
	       __func__, blocknr, journal->j_devname);
	jbd2_journal_abort(journal, -EIO);
	return -EIO;
}

/*
 * We play buffer_head aliasing tricks to write data/metadata blocks to
 * the journal without copying their contents, but for journal
 * descriptor blocks we do need to generate bona fide buffers.
 *
 * Takes the next log block, zeroes it and fills in a journal header of the
 * given @type carrying the transaction's tid.  One outstanding credit of
 * the transaction is consumed.  Returns the buffer, or NULL if no log
 * block or buffer could be obtained.
 *
 * After the caller of jbd2_journal_get_descriptor_buffer() has finished
 * modifying the buffer's contents they really should run
 * flush_dcache_folio(bh->b_folio).  But we don't bother doing that, so
 * there will be coherency problems with mmaps of blockdevs which hold live
 * JBD-controlled filesystems.
 */
struct buffer_head *
jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
{
	journal_t *journal = transaction->t_journal;
	unsigned long long blocknr;
	struct buffer_head *bh;
	journal_header_t *hdr;

	if (jbd2_journal_next_log_block(journal, &blocknr))
		return NULL;

	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
	if (!bh)
		return NULL;
	atomic_dec(&transaction->t_outstanding_credits);

	lock_buffer(bh);
	memset(bh->b_data, 0, journal->j_blocksize);
	hdr = (journal_header_t *)bh->b_data;
	hdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	hdr->h_blocktype = cpu_to_be32(type);
	hdr->h_sequence = cpu_to_be32(transaction->t_tid);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	BUFFER_TRACE(bh, "return this buffer");
	return bh;
}

/*
 * Store the checksum of a descriptor block in the block tail located at
 * the very end of the block.  The tail's checksum field is zeroed while
 * the checksum over the whole block is computed.  Does nothing unless the
 * journal uses v2 or v3 checksums.
 */
void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct jbd2_journal_block_tail *tail;
	__u32 crc;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	tail = (struct jbd2_journal_block_tail *)
		(bh->b_data + j->j_blocksize - sizeof(*tail));
	tail->t_checksum = 0;
	crc = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
	tail->t_checksum = cpu_to_be32(crc);
}

/*
 * Return tid of the oldest transaction in the journal and block in the
 * journal where the transaction starts.
 *
 * The oldest transaction is the first checkpoint transaction if there is
 * one, else the committing transaction, else the running transaction (for
 * which the start block is the current log head).  If the journal is now
 * empty, return the next transaction ID we will write and where that
 * transaction will start.
 *
 * The return value is 0 if journal tail cannot be pushed any further, 1 if
 * it can.
 */
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
			      unsigned long *block)
{
	transaction_t *oldest;
	int can_push;

	read_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);

	oldest = journal->j_checkpoint_transactions;
	if (!oldest)
		oldest = journal->j_committing_transaction;

	if (oldest) {
		*tid = oldest->t_tid;
		*block = oldest->t_log_start;
	} else {
		/* Nothing logged yet: the tail sits at the log head. */
		oldest = journal->j_running_transaction;
		*tid = oldest ? oldest->t_tid :
				journal->j_transaction_sequence;
		*block = journal->j_head;
	}

	can_push = tid_gt(*tid, journal->j_tail_sequence);

	spin_unlock(&journal->j_list_lock);
	read_unlock(&journal->j_state_lock);

	return can_push;
}

/*
 * Update information in journal structure and in on disk journal superblock
 * about log tail. This function does not check whether information passed in
 * really pushes log tail further. It's responsibility of the caller to make
 * sure provided log tail information is valid (e.g. by holding
 * j_checkpoint_mutex all the time between computing log tail and calling this
 * function as is the case with jbd2_cleanup_journal_tail()).
 *
 * Returns 0 on success or the error from the superblock update, in which
 * case the in-memory tail is left unchanged.
 *
 * Requires j_checkpoint_mutex
 */
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
	unsigned long freed;
	int err;

	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

	/*
	 * We cannot afford for write to remain in drive's caches since as
	 * soon as we update j_tail, next transaction can start reusing journal
	 * space and if we lose sb update during power failure we'd replay
	 * old transaction with possibly newly overwritten data.
	 */
	err = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
	if (err)
		return err;

	write_lock(&journal->j_state_lock);

	/* Blocks between old and new tail, allowing for log wrap-around. */
	freed = block - journal->j_tail;
	if (block < journal->j_tail)
		freed += journal->j_last - journal->j_first;

	trace_jbd2_update_log_tail(journal, tid, block, freed);
	jbd2_debug(1,
		   "Cleaning journal tail from %u to %u (offset %lu), "
		   "freeing %lu\n",
		   journal->j_tail_sequence, tid, block, freed);

	journal->j_free += freed;
	journal->j_tail_sequence = tid;
	journal->j_tail = block;

	write_unlock(&journal->j_state_lock);
	return 0;
}

/*
 * This is a variation of __jbd2_update_log_tail which checks for validity of
 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
 * with other threads updating log tail.  The tail is only moved when @tid
 * is newer than the current tail sequence.
 */
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
	struct mutex *lock = &journal->j_checkpoint_mutex;

	mutex_lock_io(lock);
	if (tid_gt(tid, journal->j_tail_sequence)) {
		__jbd2_update_log_tail(journal, tid, block);
	}
	mutex_unlock(lock);
}

/*
 * Private data of the jbd2 statistics seq_file: jbd2_seq_info_show() reads
 * it from seq->private.
 */
struct jbd2_stats_proc_session {
journal_t *journal;	/* journal being reported on */
struct transaction_stats_s *stats;	/* statistics printed by the show routine */
int start;	/* not used in the code visible here */
int max;	/* not used in the code visible here */
};

static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
{
	/* Only position 0 yields a record; any other offset ends the walk. */
	if (*pos != 0)
		return NULL;

	return SEQ_START_TOKEN;
}

static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
	/* Advance the position and report that there is no further record. */
	*pos += 1;

	return NULL;
}

static int jbd2_seq_info_show(struct seq_file *seq, void *v)
{
struct jbd2_stats_proc_session *s = seq->private;

if (v != SEQ_START_TOKEN)
  return 0;
seq_printf(seq, "%lu transactions (%lu requested), "
     "each up to %u blocks\n",
     s->stats->ts_tid, s->stats->ts_requested,
     s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
  return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
     jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
seq_printf(seq, " %ums request delay\n",
     (s->stats->ts_requested == 0) ? 0 :
     jiffies_to_msecs(s->stats->run.rs_request_delay /
        s->stats->ts_requested));
seq_printf(seq, " %ums running transaction\n",
     jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
     jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
seq_printf(seq, " %ums flushing data (in ordered mode)\n",
     jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
seq_printf(seq, " %ums logging transaction\n",
     jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
seq_printf(seq, " %lluus average transaction commit time\n",
     div_u64(s->journal->j_average_commit_time, 1000));
seq_printf(seq, " %lu handles per transaction\n",
     s->stats->run.rs_handle_count / s->stats->ts_tid);
seq_printf(seq, " %lu blocks per transaction\n",
     s->stats->run.rs_blocks / s->stats->ts_tid);
seq_printf(seq, " %lu logged blocks per transaction\n",
     s->stats->run.rs_blocks_logged / s->stats->ts_tid);
return 0;
}

static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
/* Nothing to undo: jbd2_seq_info_start() takes no lock and allocates nothing. */
}

/* seq_file iterator callbacks backing the per-journal "info" proc file. */
static const struct seq_operations jbd2_seq_info_ops = {
	.start	= jbd2_seq_info_start,
	.show	= jbd2_seq_info_show,
	.next	= jbd2_seq_info_next,
	.stop	= jbd2_seq_info_stop,
};

static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
	journal_t *journal = pde_data(inode);
	struct jbd2_stats_proc_session *session;
	struct seq_file *m;
	int stats_size = sizeof(struct transaction_stats_s);
	int err;

	session = kmalloc(sizeof(*session), GFP_KERNEL);
	if (!session)
		return -ENOMEM;

	session->stats = kmalloc(stats_size, GFP_KERNEL);
	if (!session->stats) {
		err = -ENOMEM;
		goto free_session;
	}

	/* Take a snapshot of the statistics under j_history_lock. */
	spin_lock(&journal->j_history_lock);
	memcpy(session->stats, &journal->j_stats, stats_size);
	session->journal = journal;
	spin_unlock(&journal->j_history_lock);

	err = seq_open(file, &jbd2_seq_info_ops);
	if (err)
		goto free_stats;

	/* Hand the session to the seq_file; released in ->proc_release. */
	m = file->private_data;
	m->private = session;
	return 0;

free_stats:
	kfree(session->stats);
free_session:
	kfree(session);
	return err;
}

static int jbd2_seq_info_release(struct inode *inode, struct file *file)
{
	struct seq_file *m = file->private_data;
	struct jbd2_stats_proc_session *session = m->private;

	/* Drop the snapshot and the session allocated at open time. */
	kfree(session->stats);
	kfree(session);

	return seq_release(inode, file);
}

/* proc file operations for the "info" entry; reading goes through seq_file. */
static const struct proc_ops jbd2_info_proc_ops = {
	.proc_open	= jbd2_seq_info_open,
	.proc_release	= jbd2_seq_info_release,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
};

/*
 * Parent proc directory under which jbd2_stats_proc_init() creates one
 * directory per journal. It is not assigned in the code visible here.
 */
static struct proc_dir_entry *proc_jbd2_stats;

static void jbd2_stats_proc_init(journal_t *journal)
{
	/* One directory per journal, named after j_devname. */
	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
	if (!journal->j_proc_entry)
		return;

	/* The journal is the entry's private data (see pde_data() at open). */
	proc_create_data("info", S_IRUGO, journal->j_proc_entry,
			 &jbd2_info_proc_ops, journal);
}

static void jbd2_stats_proc_exit(journal_t *journal)
{
	struct proc_dir_entry *journal_dir = journal->j_proc_entry;

	/* Remove the file first, then the per-journal directory holding it. */
	remove_proc_entry("info", journal_dir);
	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}

/* Minimum size of descriptor tag */
static int jbd2_min_tag_size(void)
{
	/*
	 * A tag carrying a 32-bit block number leaves the final four bytes of
	 * journal_block_tag_t unused, so they do not count towards the minimum.
	 */
	const int unused_tail_bytes = 4;

	return sizeof(journal_block_tag_t) - unused_tail_bytes;
}

/**
* jbd2_journal_shrink_scan()
* @shrink: shrinker to work on
* @sc: reclaim request to process
*
* Scan the checkpointed buffer on the checkpoint list and release the
* journal_head.
*/
static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
					      struct shrink_control *sc)
{
	journal_t *journal = shrink->private_data;
	unsigned long remaining = sc->nr_to_scan;
	unsigned long released;
	unsigned long nr_jh;

	nr_jh = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, nr_jh);

	/*
	 * The scan budget is handed over by address; whatever value the
	 * checkpoint-list shrinker leaves there is reported by the exit
	 * tracepoint below.
	 */
	released = jbd2_journal_shrink_checkpoint_list(journal, &remaining);

	nr_jh = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
	trace_jbd2_shrink_scan_exit(journal, remaining, released, nr_jh);

	return released;
}

/**
* jbd2_journal_shrink_count()
* @shrink: shrinker to work on
* @sc: reclaim request to process
*
* Count the number of checkpoint buffers on the checkpoint list.
*/
static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	journal_t *journal = shrink->private_data;
	unsigned long nr_jh;

	/* Reported value is the (non-negative) checkpoint jh counter reading. */
	nr_jh = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
	trace_jbd2_shrink_count(journal, sc->nr_to_scan, nr_jh);

	return nr_jh;
}

/*
* If the journal init or create aborts, we need to mark the journal
* superblock as being NULL to prevent the journal destroy from writing
* back a bogus superblock.
*/
static void journal_fail_superblock(journal_t *journal)
{
	/* Drop our reference and forget the buffer so nobody writes it back. */
	brelse(journal->j_sb_buffer);
	journal->j_sb_buffer = NULL;
}

/*
* Check the superblock for a given journal, performing initial
* validation of the format.
*/
static int journal_check_superblock(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	unsigned int maxlen, first;
	int num_fc_blks;

	/* Magic number and block size must match what the caller expects. */
	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
		printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
		return -EINVAL;
	}

	/* Only the V1 and V2 superblock layouts are understood. */
	switch (be32_to_cpu(sb->s_header.h_blocktype)) {
	case JBD2_SUPERBLOCK_V1:
	case JBD2_SUPERBLOCK_V2:
		break;
	default:
		printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
		return -EINVAL;
	}

	/* The recorded journal length must fit in the space we were given. */
	maxlen = be32_to_cpu(sb->s_maxlen);
	if (maxlen > journal->j_total_len) {
		printk(KERN_WARNING "JBD2: journal file too short\n");
		return -EINVAL;
	}

	/* The first log block must be non-zero and inside the journal. */
	first = be32_to_cpu(sb->s_first);
	if (first == 0 || first >= journal->j_total_len) {
		printk(KERN_WARNING
		       "JBD2: Invalid start block of journal: %u\n",
		       first);
		return -EINVAL;
	}

	/*
	 * Feature flags only exist in formats that support them; for the
	 * others the basic checks above are all there is to verify.
	 */
	if (!jbd2_format_support_feature(journal))
		return 0;

	/* Refuse any ro-compat or incompat bit we do not know about. */
	if ((sb->s_feature_ro_compat &
	     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
	    (sb->s_feature_incompat &
	     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
		printk(KERN_WARNING "JBD2: Unrecognised features on journal\n");
		return -EINVAL;
	}

	/*
	 * The journal must hold the minimum number of blocks plus, when fast
	 * commit is enabled, the fast-commit area.
	 */
	num_fc_blks = 0;
	if (jbd2_has_feature_fast_commit(journal))
		num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
	if (maxlen < JBD2_MIN_JOURNAL_BLOCKS ||
	    maxlen - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) {
		printk(KERN_ERR "JBD2: journal file too short %u,%d\n",
		       maxlen, num_fc_blks);
		return -EINVAL;
	}

	/* Checksum v2 and v3 are mutually exclusive. */
	if (jbd2_has_feature_csum2(journal) &&
	    jbd2_has_feature_csum3(journal)) {
		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
		       "at the same time!\n");
		return -EINVAL;
	}

	/* Likewise checksum v1 cannot be combined with v2/v3. */
	if (jbd2_journal_has_csum_v2or3(journal) &&
	    jbd2_has_feature_checksum(journal)) {
		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
		       "at the same time!\n");
		return -EINVAL;
	}

	/* Without v2/v3 checksums there is nothing left to validate. */
	if (!jbd2_journal_has_csum_v2or3(journal))
		return 0;

	if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
		printk(KERN_ERR "JBD2: Unknown checksum type\n");
		return -EINVAL;
	}

	/* A superblock checksum mismatch gets its own error code. */
	if (sb->s_checksum != jbd2_superblock_csum(sb)) {
		printk(KERN_ERR "JBD2: journal checksum error\n");
		return -EFSBADCRC;
	}

	return 0;
}

static int journal_revoke_records_per_block(journal_t *journal)
{
	/* Bytes left in a block once the revoke header is accounted for. */
	int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
	/* A revoke record is 8 bytes with the 64bit feature, 4 otherwise. */
	int record_size = jbd2_has_feature_64bit(journal) ? 8 : 4;

	/* With v2/v3 checksums the block also carries a block tail. */
	if (jbd2_journal_has_csum_v2or3(journal))
		space -= sizeof(struct jbd2_journal_block_tail);

	return space / record_size;
}

/*
 * Maximum number of buffers a single transaction may use: one third
 * (integer division) of what is left of j_total_len after subtracting
 * j_fc_wbufsize.
 */
static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
return (journal->j_total_len - journal->j_fc_wbufsize) / 3;
}

/*
* Base amount of descriptor blocks we reserve for each transaction.
*/
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
{
	int max_bufs = jbd2_journal_get_max_txn_bufs(journal);
	int tags_per_block;
	int tag_space;

	/* Start from the block size minus the descriptor block header ... */
	tag_space = journal->j_blocksize - sizeof(journal_header_t);
	/* ... minus the 16-byte UUID ... */
	tag_space -= 16;
	/* ... and minus the block tail when v2/v3 checksums are in use. */
	if (jbd2_journal_has_csum_v2or3(journal))
		tag_space -= sizeof(struct jbd2_journal_block_tail);

	/* Commit code leaves a slack space of 16 bytes at the end of block */
	tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);

	/*
	 * One block for the commit record plus enough descriptor blocks to
	 * tag a maximally sized transaction. Revoke descriptors are accounted
	 * for separately.
	 */
	return 1 + DIV_ROUND_UP(max_bufs, tags_per_block);
}

/*
* Initialize number of blocks each transaction reserves for its bookkeeping
* and maximum number of blocks a transaction can use. This needs to be called
* after the journal size and the fastcommit area size are initialized.
*/
static void jbd2_journal_init_transaction_limits(journal_t *journal)
{
	/* Upper bound on the buffers one transaction may use. */
	journal->j_max_transaction_buffers =
		jbd2_journal_get_max_txn_bufs(journal);

	/* Descriptor/commit blocks reserved for every transaction. */
	journal->j_transaction_overhead_buffers =
		jbd2_descriptor_blocks_per_trans(journal);

	/* Number of revoke records that fit into one revoke block. */
	journal->j_revoke_records_per_block =
		journal_revoke_records_per_block(journal);
}

/*
* Load the on-disk journal superblock and read the key fields into the
* journal_t.
*/
static int journal_load_superblock(journal_t *journal)
{
	struct buffer_head *sb_bh;
	journal_superblock_t *sb;
	unsigned int maxlen;
	int err;

	/* Both a missing buffer and a failed read are reported as -EIO. */
	sb_bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset,
				 journal->j_blocksize);
	if (!sb_bh || bh_read(sb_bh, 0) < 0) {
		pr_err("%s: Cannot read journal superblock\n", __func__);
		brelse(sb_bh);
		return -EIO;
	}

	sb = (journal_superblock_t *)sb_bh->b_data;
	journal->j_sb_buffer = sb_bh;
	journal->j_superblock = sb;

	/* On a validation failure the buffer is released and forgotten. */
	err = journal_check_superblock(journal);
	if (err) {
		journal_fail_superblock(journal);
		return err;
	}

	/* Copy the key on-disk fields into the in-memory journal. */
	maxlen = be32_to_cpu(sb->s_maxlen);
	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
	journal->j_tail = be32_to_cpu(sb->s_start);
	journal->j_first = be32_to_cpu(sb->s_first);
	journal->j_errno = be32_to_cpu(sb->s_errno);
	journal->j_last = maxlen;

	/* Never use more blocks than the superblock says the journal has. */
	if (maxlen < journal->j_total_len)
		journal->j_total_len = maxlen;

	/* Precompute checksum seed for all metadata */
	if (jbd2_journal_has_csum_v2or3(journal))
		journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
						   sizeof(sb->s_uuid));

	/* After journal features are set, we can compute transaction limits */
	jbd2_journal_init_transaction_limits(journal);

	/*
	 * With fast commit, the last blocks of the journal form the
	 * fast-commit area and the regular log ends just before it.
	 */
	if (jbd2_has_feature_fast_commit(journal)) {
		journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
		journal->j_last = journal->j_fc_last -
				  jbd2_journal_get_num_fc_blks(sb);
		journal->j_fc_first = journal->j_last + 1;
		journal->j_fc_off = 0;
	}

	return 0;
}

/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk.  */

/* The journal_init_common() function creates and fills a journal_t object
* in memory. It calls journal_load_superblock() to load the on-disk journal
* superblock and initialize the journal_t object.
*/

/*
 * Allocate a journal_t, load and validate the on-disk superblock, and set up
 * the in-memory state (wait queues, locks, revoke table, write buffer array,
 * checkpoint counter and shrinker).
 *
 * @bdev:      block device holding the journal
 * @fs_dev:    block device stored as j_fs_dev
 * @start:     block offset of the journal superblock on @bdev
 * @len:       journal length, stored as j_total_len
 * @blocksize: journal block size
 *
 * Returns the new journal, or an ERR_PTR() on failure.
 */
static journal_t *journal_init_common(struct block_device *bdev,
   struct block_device *fs_dev,
   unsigned long long start, int len, int blocksize)
{
static struct lock_class_key jbd2_trans_commit_key;
journal_t *journal;
int err;
int n;

/* Zero-filled allocation: every field not set below starts out as 0/NULL. */
journal = kzalloc(sizeof(*journal), GFP_KERNEL);
if (!journal)
  return ERR_PTR(-ENOMEM);

/*
 * The geometry must be recorded first: journal_load_superblock() reads
 * j_dev, j_blk_offset, j_blocksize and j_total_len.
 */
journal->j_blocksize = blocksize;
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_total_len = len;
jbd2_init_fs_dev_write_error(journal);

err = journal_load_superblock(journal);
if (err)
  goto err_cleanup;

init_waitqueue_head(&journal->j_wait_transaction_locked);
init_waitqueue_head(&journal->j_wait_done_commit);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
init_waitqueue_head(&journal->j_wait_reserved);
init_waitqueue_head(&journal->j_fc_wait);
mutex_init(&journal->j_abort_mutex);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
spin_lock_init(&journal->j_list_lock);
spin_lock_init(&journal->j_history_lock);
rwlock_init(&journal->j_state_lock);

/* Default commit interval and batching window. */
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
atomic_set(&journal->j_reserved_credits, 0);
lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
    &jbd2_trans_commit_key, 0);

/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;

/* Set up a default-sized revoke table for the new mount. */
err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
if (err)
  goto err_cleanup;

/*
* journal descriptor can store up to n blocks, we need enough
* buffers to write out full descriptor block.
*/
err = -ENOMEM;
n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
journal->j_fc_wbuf = NULL;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
     GFP_KERNEL);
if (!journal->j_wbuf)
  goto err_cleanup;

err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0,
      GFP_KERNEL);
if (err)
  goto err_cleanup;

journal->j_shrink_transaction = NULL;

journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
          MAJOR(bdev->bd_dev),
          MINOR(bdev->bd_dev));
if (!journal->j_shrinker) {
  err = -ENOMEM;
  goto err_cleanup;
}

journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
journal->j_shrinker->private_data = journal;

/* Registered last, once the callbacks and private data are in place. */
shrinker_register(journal->j_shrinker);

return journal;

/*
 * Single unwind path for every failure above, including those that happen
 * before the counter, j_wbuf or the revoke table have been set up.
 * NOTE(review): this relies on percpu_counter_destroy() and
 * jbd2_journal_destroy_revoke() tolerating zero-initialised state -- confirm.
 */
err_cleanup:
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
journal_fail_superblock(journal);
kfree(journal);
return ERR_PTR(err);
}

/* jbd2_journal_init_dev and jbd2_journal_init_inode:
*
* Create a journal structure assigned some fixed set of disk blocks to
* the journal.  We don't actually touch those disk blocks yet, but we
* need to set up all of the mapping information to tell the journaling
* system where the journal blocks are.
*
*/

/**
*  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
*  @bdev: Block device on which to create the journal
*  @fs_dev: Device which holds journalled filesystem for this journal.
*  @start: Block nr Start of journal.
*  @len:  Length of the journal in blocks.
*  @blocksize: blocksize of journalling device
*
*  Returns: a newly created journal_t *
*
*  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
*  range of blocks on an arbitrary block device.
*
*/
journal_t *jbd2_journal_init_dev(struct block_device *bdev,
			struct block_device *fs_dev,
			unsigned long long start, int len, int blocksize)
{
	journal_t *journal = journal_init_common(bdev, fs_dev, start, len,
						 blocksize);

	/* An error pointer from the common initialiser is passed through. */
	if (!IS_ERR(journal)) {
		/* Name the journal after its device; '/' is not usable in proc. */
		snprintf(journal->j_devname, sizeof(journal->j_devname),
			 "%pg", journal->j_dev);
		strreplace(journal->j_devname, '/', '!');
		jbd2_stats_proc_init(journal);
	}

	return journal;
}

/**
*  journal_t * jbd2_journal_init_inode () - creates a journal which maps to an inode.
*  @inode: An inode to create the journal in
*
* jbd2_journal_init_inode creates a journal which maps an on-disk inode as
* the journal.  The inode must exist already, must support bmap() and
* must have all data blocks preallocated.
*/
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
journal_t *journal;
sector_t blocknr;
int err = 0;

blocknr = 0;
err = bmap(inode, &blocknr);
if (err || !blocknr) {
  pr_err("%s: Cannot locate journal superblock\n", __func__);
  return err ? ERR_PTR(err) : ERR_PTR(-EINVAL);
}

jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
    inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
    inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);

journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
   blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
   inode->i_sb->s_blocksize);
if (IS_ERR(journal))
  return ERR_CAST(journal);

journal->j_inode = inode;
snprintf(journal->j_devname, sizeof(journal->j_devname),
   "%pg-%lu", journal->j_dev, journal->j_inode->i_ino);
strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);

return journal;
}

/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session.  We use this both when creating
* a journal, and after recovering an old journal to reset it for
* subsequent use.
*/

/*
 * Reset the in-memory log pointers and sequence numbers from the on-disk
 * superblock, then start the journal thread.
 *
 * Returns -EINVAL if the journal is too short, otherwise the result of
 * jbd2_journal_start_thread().
 */
static int journal_reset(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
unsigned long long first, last;

first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
/* The range first..last must hold at least JBD2_MIN_JOURNAL_BLOCKS blocks. */
if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
  printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
         first, last);
  journal_fail_superblock(journal);
  return -EINVAL;
}

journal->j_first = first;
journal->j_last = last;

/*
 * Keep a previously recorded head only in cycle-record mode and only if it
 * lies inside [first, last); in every other case start at j_first.
 */
if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) {
  /*
* Disable the cycled recording mode if the journal head block
* number is not correct.
*/
  if (journal->j_head < first || journal->j_head >= last) {
   printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, "
          "disable journal_cycle_record\n",
          journal->j_head);
   journal->j_head = journal->j_first;
  }
} else {
  journal->j_head = journal->j_first;
}
/* The log starts out empty: tail equals head and the whole range is free. */
journal->j_tail = journal->j_head;
journal->j_free = journal->j_last - journal->j_first;

/* Tail sequence is the next transaction; commit state is one behind it. */
journal->j_tail_sequence = journal->j_transaction_sequence;
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;

/*
* Now that journal recovery is done, turn fast commits off here. This
* way, if fast commit was enabled before the crash but if now FS has
* disabled it, we don't enable fast commits.
*/
jbd2_clear_feature_fast_commit(journal);

/*
* As a special case, if the on-disk copy is already marked as needing
* no recovery (s_start == 0), then we can safely defer the superblock
* update until the next commit by setting JBD2_FLUSHED.  This avoids
* attempting a write to a potential-readonly device.
*/
if (sb->s_start == 0) {
  jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
   "(start %ld, seq %u, errno %d)\n",
   journal->j_tail, journal->j_tail_sequence,
   journal->j_errno);
  journal->j_flags |= JBD2_FLUSHED;
} else {
  /* Lock here to make assertions happy... */
  mutex_lock_io(&journal->j_checkpoint_mutex);
  /*
* Update log tail information. We use REQ_FUA since new
* transaction will start reusing journal space and so we
* must make sure information about current log tail is on
* disk before that.
*/
  /* Note: the return value of this superblock update is not checked. */
  jbd2_journal_update_sb_log_tail(journal,
      journal->j_tail_sequence,
      journal->j_tail, REQ_FUA);
  mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
}

/*
* This function expects that the caller will have locked the journal
* buffer head, and will return with it unlocked
*/
static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
{
	journal_superblock_t *sb = journal->j_superblock;
	struct buffer_head *sb_bh = journal->j_sb_buffer;

	/* Buffer got discarded which means block device got invalidated */
	if (!buffer_mapped(sb_bh)) {
		unlock_buffer(sb_bh);
		return -EIO;
	}

	/*
	 * Always set high priority flags to exempt from block layer's
	 * QOS policies, e.g. writeback throttle. Without JBD2_BARRIER the
	 * FUA/PREFLUSH flags are stripped again.
	 */
	write_flags |= JBD2_JOURNAL_REQ_FLAGS;
	if (!(journal->j_flags & JBD2_BARRIER))
		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);

	trace_jbd2_write_superblock(journal, write_flags);

	if (buffer_write_io_error(sb_bh)) {
		/*
		 * An earlier superblock write failed (device removed, or a
		 * transient error after which the block may get remapped).
		 * All we can do is clear the error state, retry the write
		 * and hope for the best.
		 */
		printk(KERN_ERR "JBD2: previous I/O error detected "
		       "for journal superblock update for %s.\n",
		       journal->j_devname);
		clear_buffer_write_io_error(sb_bh);
		set_buffer_uptodate(sb_bh);
	}

	/* Refresh the checksum so it covers the fields the caller changed. */
	if (jbd2_journal_has_csum_v2or3(journal))
		sb->s_checksum = jbd2_superblock_csum(sb);

	/* Submit the write synchronously and wait for it to finish. */
	get_bh(sb_bh);
	sb_bh->b_end_io = end_buffer_write_sync;
	submit_bh(REQ_OP_WRITE | write_flags, sb_bh);
	wait_on_buffer(sb_bh);

	if (!buffer_write_io_error(sb_bh))
		return 0;

	/* The write failed: reset the buffer state, log it, abort the journal. */
	clear_buffer_write_io_error(sb_bh);
	set_buffer_uptodate(sb_bh);
	printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n",
	       journal->j_devname);
	if (!is_journal_aborted(journal))
		jbd2_journal_abort(journal, -EIO);

	return -EIO;
}

/**
* jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
* @journal: The journal to update.
* @tail_tid: TID of the new transaction at the tail of the log
* @tail_block: The first block of the transaction at the tail of the log
* @write_flags: Flags for the journal sb write operation
*
* Update a journal's superblock information about log tail and write it to
* disk, waiting for the IO to complete.
*/
int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
				    unsigned long tail_block,
				    blk_opf_t write_flags)
{
	journal_superblock_t *sb = journal->j_superblock;
	int err;

	/* Never touch the superblock of an aborted journal. */
	if (is_journal_aborted(journal))
		return -EIO;

	/* A writeback error on the filesystem device aborts the journal. */
	if (jbd2_check_fs_dev_write_error(journal)) {
		jbd2_journal_abort(journal, -EIO);
		return -EIO;
	}

	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
	jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
		   tail_block, tail_tid);

	/* The buffer lock taken here is dropped by jbd2_write_superblock(). */
	lock_buffer(journal->j_sb_buffer);
	sb->s_sequence = cpu_to_be32(tail_tid);
	sb->s_start    = cpu_to_be32(tail_block);

	err = jbd2_write_superblock(journal, write_flags);
	if (err)
		return err;

	/* Log is no longer empty */
	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FLUSHED;
	write_unlock(&journal->j_state_lock);

	return 0;
}

/**
* jbd2_mark_journal_empty() - Mark on disk journal as empty.
* @journal: The journal to update.
* @write_flags: Flags for the journal sb write operation
*
* Update a journal's dynamic superblock fields to show that journal is empty.
* Write updated superblock to disk waiting for IO to complete.
*/
static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
{
	journal_superblock_t *sb = journal->j_superblock;
	bool restore_fast_commit;

	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

	lock_buffer(journal->j_sb_buffer);
	/* s_start == 0 means the on-disk journal is already marked empty. */
	if (sb->s_start == 0) {
		unlock_buffer(journal->j_sb_buffer);
		return;
	}

	jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
		   journal->j_tail_sequence);

	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
	sb->s_start    = cpu_to_be32(0);
	sb->s_head     = cpu_to_be32(journal->j_head);

	/*
	 * A clean journal does not need the fast commit flag on disk, which
	 * would only make the file system incompatible with older kernels.
	 * Drop it for the duration of the write and put it back afterwards.
	 */
	restore_fast_commit = jbd2_has_feature_fast_commit(journal);
	if (restore_fast_commit)
		jbd2_clear_feature_fast_commit(journal);

	/* Writes the superblock and releases the buffer lock taken above. */
	jbd2_write_superblock(journal, write_flags);

	if (restore_fast_commit)
		jbd2_set_feature_fast_commit(journal);

	/* Log is empty */
	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FLUSHED;
	write_unlock(&journal->j_state_lock);
}

/**
* __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
* @journal: The journal to erase.
* @flags: A discard/zeroout request is sent for each physically contiguous
* region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
* JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
* to perform.
*
* Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
* will be explicitly written if no hardware offload is available, see
* blkdev_issue_zeroout for more details.
*/
static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
{
	int err = 0;
	unsigned long block, log_offset; /* logical */
	unsigned long long phys_block, block_start, block_stop; /* physical */
	loff_t byte_start, byte_stop, byte_count;

	/*
	 * flags must be set to either discard or zeroout: reject unknown
	 * bits, an empty mask, and both operations requested together.
	 */
	if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
	    ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
	     (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
		return -EINVAL;

	/* Discard was requested but the device reports no discard capacity. */
	if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
	    !bdev_max_discard_sectors(journal->j_dev))
		return -EOPNOTSUPP;

	/*
	 * lookup block mapping and issue discard/zeroout for each
	 * contiguous region
	 *
	 * block_start == ~0ULL means "no region open"; [block_start,
	 * block_stop) is the physical range collected so far.
	 */
	log_offset = be32_to_cpu(journal->j_superblock->s_first);
	block_start =  ~0ULL;
	for (block = log_offset; block < journal->j_total_len; block++) {
		err = jbd2_journal_bmap(journal, block, &phys_block);
		if (err) {
			pr_err("JBD2: bad block at offset %lu\n", block);
			return err;
		}

		/* Open a new region at the current physical block. */
		if (block_start == ~0ULL)
			block_stop = block_start = phys_block;

		/*
		 * last block not contiguous with current block,
		 * process last contiguous region and return to this block on
		 * next loop
		 */
		if (phys_block != block_stop) {
			block--;
		} else {
			block_stop++;
			/*
			 * if this isn't the last block of journal,
			 * no need to process now because next block may also
			 * be part of this contiguous region
			 */
			if (block != journal->j_total_len - 1)
				continue;
		}

		/*
		 * end of contiguous region or this is last block of journal,
		 * take care of the region
		 */
		byte_start = block_start * journal->j_blocksize;
		byte_stop = block_stop * journal->j_blocksize;
		byte_count = (block_stop - block_start) * journal->j_blocksize;

		/* Drop cached pages of the range before it is wiped on disk. */
		truncate_inode_pages_range(journal->j_dev->bd_mapping,
				byte_start, byte_stop - 1);

		if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
			err = blkdev_issue_discard(journal->j_dev,
					byte_start >> SECTOR_SHIFT,
					byte_count >> SECTOR_SHIFT,
					GFP_NOFS);
		} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
			err = blkdev_issue_zeroout(journal->j_dev,
					byte_start >> SECTOR_SHIFT,
					byte_count >> SECTOR_SHIFT,
					GFP_NOFS, 0);
		}

		if (unlikely(err != 0)) {
			pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)\n",
					err, block_start, block_stop);
			return err;
		}

		/* reset start and stop after processing a region */
		block_start = ~0ULL;
	}

	/* Make sure the discards/zeroouts have reached the device. */
	return blkdev_issue_flush(journal->j_dev);
}

/**
* jbd2_journal_update_sb_errno() - Update error in the journal.
* @journal: The journal to update.
*
* Update a journal's errno.  Write updated superblock to disk waiting for IO
* to complete.
*/
void jbd2_journal_update_sb_errno(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	int recorded;

	/* The buffer lock taken here is dropped by jbd2_write_superblock(). */
	lock_buffer(journal->j_sb_buffer);

	/* -ESHUTDOWN is never stored on disk; it is written out as 0. */
	recorded = journal->j_errno;
	if (recorded == -ESHUTDOWN)
		recorded = 0;

	jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", recorded);
	sb->s_errno    = cpu_to_be32(recorded);

	jbd2_write_superblock(journal, REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);

/**
* jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain
* a journal, read the journal from disk to initialise the in-memory
* structures.
*/
int jbd2_journal_load(journal_t *journal)
{
	journal_superblock_t *sb = journal->j_superblock;
	int rc;

	/* Create a slab for this blocksize */
	rc = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
	if (rc)
		return rc;

	/*
	 * Let the recovery code check whether it needs to recover any
	 * data from the journal.
	 */
	rc = jbd2_journal_recover(journal);
	if (rc) {
		pr_warn("JBD2: journal recovery failed\n");
		return rc;
	}

	/* Recovery itself succeeded but found a corrupt transaction. */
	if (journal->j_failed_commit) {
		printk(KERN_ERR "JBD2: journal transaction %u on %s "
		       "is corrupt.\n", journal->j_failed_commit,
		       journal->j_devname);
		return -EFSCORRUPTED;
	}

	/*
	 * clear JBD2_ABORT flag initialized in journal_init_common
	 * here to update log tail information with the newest seq.
	 */
	journal->j_flags &= ~JBD2_ABORT;

	/*
	 * OK, we've finished with the dynamic journal bits:
	 * reinitialise the dynamic contents of the superblock in memory
	 * and reset them on disk.
	 */
	rc = journal_reset(journal);
	if (rc) {
		pr_warn("JBD2: journal reset failed\n");
		return rc;
	}

	journal->j_flags |= JBD2_LOADED;

	return 0;
}

/**
* jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the
* journaled object.
* Return <0 if we couldn't clean up the journal.
*/
int jbd2_journal_destroy(journal_t *journal)
{
/* Stays 0 unless checkpointing fails or the journal is found aborted. */
int err = 0;

/* Wait for the commit thread to wake up and die. */
journal_kill_thread(journal);

/* Force a final log commit */
if (journal->j_running_transaction)
  jbd2_journal_commit_transaction(journal);

/* Force any old transactions to disk */

/*
 * j_list_lock protects the j_checkpoint_transactions test; it is dropped
 * around each checkpoint pass, which runs under j_checkpoint_mutex instead,
 * and re-taken before the list is tested again.
 */
spin_lock(&journal->j_list_lock);
while (journal->j_checkpoint_transactions != NULL) {
  spin_unlock(&journal->j_list_lock);
  mutex_lock_io(&journal->j_checkpoint_mutex);
  err = jbd2_log_do_checkpoint(journal);
  mutex_unlock(&journal->j_checkpoint_mutex);
  /*
* If checkpointing failed, just free the buffers to avoid
* looping forever
*/
  if (err) {
   jbd2_journal_destroy_checkpoint(journal);
   spin_lock(&journal->j_list_lock);
   break;
  }
  spin_lock(&journal->j_list_lock);
}

/* Both loop exits leave j_list_lock held for the assertions below. */
J_ASSERT(journal->j_running_transaction == NULL);
J_ASSERT(journal->j_committing_transaction == NULL);
J_ASSERT(journal->j_checkpoint_transactions == NULL);
spin_unlock(&journal->j_list_lock);

/*
* OK, all checkpoint transactions have been checked, now check the
* writeback errseq of fs dev and abort the journal if some buffer
* failed to write back to the original location, otherwise the
* filesystem may become inconsistent.
*/
if (!is_journal_aborted(journal) &&
     jbd2_check_fs_dev_write_error(journal))
  jbd2_journal_abort(journal, -EIO);

/* j_sb_buffer is NULL if journal_fail_superblock() was called earlier. */
if (journal->j_sb_buffer) {
  if (!is_journal_aborted(journal)) {
   mutex_lock_io(&journal->j_checkpoint_mutex);

   /* Advance the sequence so the empty journal gets a new tail tid. */
   write_lock(&journal->j_state_lock);
   journal->j_tail_sequence =
    ++journal->j_transaction_sequence;
   write_unlock(&journal->j_state_lock);

   jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA);
   mutex_unlock(&journal->j_checkpoint_mutex);
  } else
   /* An aborted journal is not marked empty; report -EIO instead. */
   err = -EIO;
  brelse(journal->j_sb_buffer);
}

/* Tear down the remaining resources; each is released only if present. */
if (journal->j_shrinker) {
  percpu_counter_destroy(&journal->j_checkpoint_jh_count);
  shrinker_free(journal->j_shrinker);
}
if (journal->j_proc_entry)
  jbd2_stats_proc_exit(journal);
iput(journal->j_inode);
if (journal->j_revoke)
  jbd2_journal_destroy_revoke(journal);
kfree(journal->j_fc_wbuf);
kfree(journal->j_wbuf);
kfree(journal);

return err;
}

/**
* jbd2_journal_check_used_features() - Check if features specified are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
* @incompat: bitmask of incompatible features
*
* Check whether the journal uses all of a given set of
* features.  Return true (non-zero) if it does.
**/

int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
				     unsigned long ro, unsigned long incompat)
{
	journal_superblock_t *sb;

	/* An empty request is trivially satisfied. */
	if (!compat && !ro && !incompat)
		return 1;

	/* A format without feature support cannot use any feature. */
	if (!jbd2_format_support_feature(journal))
		return 0;

	sb = journal->j_superblock;

	/* Every requested bit must be set in the matching feature word. */
	if ((be32_to_cpu(sb->s_feature_compat) & compat) != compat)
		return 0;
	if ((be32_to_cpu(sb->s_feature_ro_compat) & ro) != ro)
		return 0;
	if ((be32_to_cpu(sb->s_feature_incompat) & incompat) != incompat)
		return 0;

	return 1;
}

/**
* jbd2_journal_check_available_features() - Check feature set in journalling layer
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.46 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.