/*
 * Upper limit of the number of segments
 * appended in collection retry loop
 */
#define SC_MAX_SEGDELTA 64
/* Construction mode */
enum {
	SC_LSEG_SR = 1,	/* Make a logical segment having a super root */
	SC_LSEG_DSYNC,	/*
			 * Flush data blocks of a given file and make
			 * a logical segment without a super root.
			 */
	SC_FLUSH_FILE,	/*
			 * Flush data files, leads to segment writes without
			 * creating a checkpoint.
			 */
	SC_FLUSH_DAT,	/*
			 * Flush DAT file.  This also creates segments
			 * without a checkpoint.
			 */
};
/* Stage numbers of dirty block collection */
enum {
	NILFS_ST_INIT = 0,
	NILFS_ST_GC,		/* Collecting dirty blocks for GC */
	NILFS_ST_FILE,
	NILFS_ST_IFILE,
	NILFS_ST_CPFILE,
	NILFS_ST_SUFILE,
	NILFS_ST_DAT,
	NILFS_ST_SR,		/* Super root */
	NILFS_ST_DSYNC,		/* Data sync blocks */
	NILFS_ST_DONE,
};
/*
 * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are
 * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of
 * the variable must use them because transition of stage count must involve
 * trace events (trace_nilfs2_collection_stage_transition).
 *
 * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't
 * produce tracepoint events. It is provided just for making the intention
 * clear.
 */
static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
{
	sci->sc_stage.scnt++;
	trace_nilfs2_collection_stage_transition(sci);
}
/*
 * NOTE(review): fragment — the opening of the enclosing function (which
 * declares 'cur_ti', 'save', 'ti', and 'sb'; it appears to set up the
 * per-task transaction info hooked onto current->journal_info) is not
 * visible in this chunk.  Confirm against the full file.
 */
	if (cur_ti) {
		/* Nested call: just bump the reference count. */
		if (cur_ti->ti_magic == NILFS_TI_MAGIC)
			return ++cur_ti->ti_count;

		/*
		 * If journal_info field is occupied by other FS,
		 * it is saved and will be restored on
		 * nilfs_transaction_commit().
		 */
		nilfs_warn(sb, "journal info from a different FS");
		save = current->journal_info;
	}
	if (!ti) {
		/* No pre-allocated struct given: take one from the slab. */
		ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
		if (!ti)
			return -ENOMEM;
		ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
	} else {
		ti->ti_flags = 0;
	}
	ti->ti_count = 0;
	ti->ti_save = save;
	ti->ti_magic = NILFS_TI_MAGIC;
	current->journal_info = ti;
	return 0;
}
/**
 * nilfs_transaction_begin - start indivisible file operations.
 * @sb: super block
 * @ti: nilfs_transaction_info
 * @vacancy_check: flags for vacancy rate checks
 *
 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
 * the segment semaphore, to make a segment construction and write tasks
 * exclusive. The function is used with nilfs_transaction_commit() in pairs.
 * The region enclosed by these two functions can be nested. To avoid a
 * deadlock, the semaphore is only acquired or released in the outermost call.
 *
 * This function allocates a nilfs_transaction_info struct to keep context
 * information on it. It is initialized and hooked onto the current task in
 * the outermost call. If a pre-allocated struct is given to @ti, it is used
 * instead; otherwise a new struct is assigned from a slab.
 *
 * When @vacancy_check flag is set, this function will check the amount of
 * free space, and will wait for the GC to reclaim disk space if low capacity.
 *
 * Return: 0 on success, or one of the following negative error codes on
 * failure:
 * * %-ENOMEM	- Insufficient memory available.
 * * %-ENOSPC	- No space left on device (if checking free space).
 */
int nilfs_transaction_begin(struct super_block *sb,
			    struct nilfs_transaction_info *ti,
			    int vacancy_check)
{
	struct the_nilfs *nilfs;
	int ret = nilfs_prepare_segment_lock(sb, ti);
	struct nilfs_transaction_info *trace_ti;

	if (unlikely(ret < 0))
		return ret;
	if (ret > 0) {
		trace_ti = current->journal_info;
/*
 * NOTE(review): the middle of this function is missing from this chunk;
 * the text jumps directly from the nested-call branch to the 'failed'
 * error path.  Confirm against the full file.
 */
 failed:
	ti = current->journal_info;
	current->journal_info = ti->ti_save;
	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
		kmem_cache_free(nilfs_transaction_cachep, ti);
	sb_end_intwrite(sb);
	return ret;
}
/**
 * nilfs_transaction_commit - commit indivisible file operations.
 * @sb: super block
 *
 * nilfs_transaction_commit() releases the read semaphore which is
 * acquired by nilfs_transaction_begin(). This is only performed
 * in outermost call of this function. If a commit flag is set,
 * nilfs_transaction_commit() sets a timer to start the segment
 * constructor. If a sync flag is set, it starts construction
 * directly.
 *
 * Return: 0 on success, or a negative error code on failure.
 */
int nilfs_transaction_commit(struct super_block *sb)
{
	struct nilfs_transaction_info *ti = current->journal_info;
	struct the_nilfs *nilfs = sb->s_fs_info;
	int err = 0;
	/* NOTE(review): the rest of this function is missing from this chunk. */
/**
 * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area
 * @sci: segment constructor object
 *
 * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of
 * the current segment summary block.
 */
static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci)
{
	struct nilfs_segsum_pointer *ssp;
	/*
	 * NOTE(review): the statements below do not match this function's
	 * kernel-doc or its void return type — they use undeclared 'segbuf'
	 * and 'err', return an error value, and extend the payload with a
	 * super root block (NILFS_SS_SR).  This chunk appears to splice in
	 * the body of a different helper; confirm against the full file.
	 */
	if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
		err = nilfs_segctor_feed_segment(sci);
		if (err)
			return err;
		segbuf = sci->sc_curseg;
	}
	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
	if (likely(!err))
		segbuf->sb_sum.flags |= NILFS_SS_SR;
	return err;
}
/*
 * Functions for making segment summary and payloads
 */
static int nilfs_segctor_segsum_block_required(
		struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
		unsigned int binfo_size)
{
	unsigned int blocksize = sci->sc_super->s_blocksize;
	/* Size of finfo and binfo is enough small against blocksize */
	/* NOTE(review): the rest of this function is missing from this chunk. */
/*
 * NOTE(review): fragment — the opening of the enclosing function is not
 * visible in this chunk.  It appears to scan a mapping for dirty buffers
 * ('mapping', 'inode', 'listp', 'nlimit', 'ndirties' are declared above
 * this point); confirm against the full file.
 */
	if (unlikely(start != 0 || end != LLONG_MAX)) {
		/*
		 * A valid range is given for sync-ing data pages. The
		 * range is rounded to per-page; extra dirty buffers
		 * may be included if blocksize < pagesize.
		 */
		index = start >> PAGE_SHIFT;
		last = end >> PAGE_SHIFT;
	}
	folio_batch_init(&fbatch);
 repeat:
	if (unlikely(index > last) ||
	    !filemap_get_folios_tag(mapping, &index, last,
				    PAGECACHE_TAG_DIRTY, &fbatch))
		return ndirties;

	for (i = 0; i < folio_batch_count(&fbatch); i++) {
		struct buffer_head *bh, *head;
		struct folio *folio = fbatch.folios[i];

		folio_lock(folio);
		if (unlikely(folio->mapping != mapping)) {
			/* Exclude folios removed from the address space */
			folio_unlock(folio);
			continue;
		}
		head = folio_buffers(folio);
		if (!head)
			head = create_empty_buffers(folio,
						    i_blocksize(inode), 0);

		bh = head;
		do {
			/* Skip clean buffers and ones already queued for write. */
			if (!buffer_dirty(bh) || buffer_async_write(bh))
				continue;
			get_bh(bh);
			list_add_tail(&bh->b_assoc_buffers, listp);
			ndirties++;
			if (unlikely(ndirties >= nlimit)) {
				/* Collection limit reached: stop early. */
				folio_unlock(folio);
				folio_batch_release(&fbatch);
				cond_resched();
				return ndirties;
			}
		} while (bh = bh->b_this_page, bh != head);
/*
 * nilfs_test_metadata_dirty - count metadata files with dirty blocks
 * @nilfs: nilfs object
 * @root: nilfs root object (provides the ifile)
 *
 * Fetches the dirty state of the ifile, cpfile, and sufile; the DAT file
 * is additionally checked when any of those are dirty or GC is in progress.
 *
 * Return: the number of metadata files found dirty (0 if none).
 */
static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
				     struct nilfs_root *root)
{
	int ndirty = 0;

	if (nilfs_mdt_fetch_dirty(root->ifile))
		ndirty++;
	if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
		ndirty++;
	if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
		ndirty++;
	if ((ndirty || nilfs_doing_gc()) &&
	    nilfs_mdt_fetch_dirty(nilfs->ns_dat))
		ndirty++;
	return ndirty;
}
/**
 * nilfs_write_root_mdt_inode - export root metadata inode information to
 *                              the on-disk inode
 * @inode: inode object of the root metadata file
 * @raw_inode: on-disk inode
 *
 * nilfs_write_root_mdt_inode() writes inode information and bmap data of
 * @inode to the inode area of the metadata file allocated on the super root
 * block created to finalize the log. Since super root blocks are configured
 * each time, this function zero-fills the unused area of @raw_inode.
 */
static void nilfs_write_root_mdt_inode(struct inode *inode,
				       struct nilfs_inode *raw_inode)
{
	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
	/* NOTE(review): the rest of this function is missing from this chunk. */
/*
 * nilfs_segctor_scan_file_dsync - collect dirty data blocks of a file for dsync
 * @sci: segment constructor object
 * @inode: inode whose dirty data buffers are collected
 *
 * Looks up the dirty data buffers of @inode within the dsync range
 * (sc_dsync_start .. sc_dsync_end) and applies the file-data collection
 * callback to them.  The lookup limit is one more than the remaining buffer
 * budget so that exceeding the budget is detectable.
 *
 * Return: 0 on success, or a negative error code from the collection step.
 */
static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
					 struct inode *inode)
{
	LIST_HEAD(data_buffers);
	size_t rest = nilfs_segctor_buffer_rest(sci);
	size_t n;
	int ret;

	n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
					    sci->sc_dsync_start,
					    sci->sc_dsync_end);

	ret = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
					  nilfs_collect_file_data);
	if (!ret) {
		nilfs_segctor_end_finfo(sci, inode);
		BUG_ON(n > rest);  /* always receive -E2BIG or true error if n > rest */
	}
	return ret;
}
/**
 * nilfs_free_segments - free the segments given by an array of segment numbers
 * @nilfs: nilfs object
 * @segnumv: array of segment numbers to be freed
 * @nsegs: number of segments to be freed in @segnumv
 *
 * Wrapper around nilfs_sufile_freev() and nilfs_sufile_cancel_freev() that
 * edits the segment usage metadata file (sufile) to free all segments in
 * @segnumv at once.  If freeing fails midway, the changes are cancelled so
 * that none of the segments end up freed.  Does nothing when @nsegs is 0.
 *
 * The freeing of segments is not finalized until the writing of a log with
 * a super root block containing this sufile change is complete, and it can
 * be canceled with nilfs_sufile_cancel_freev() until then.
 *
 * Return: 0 on success, or one of the following negative error codes on
 * failure:
 * * %-EINVAL	- Invalid segment number.
 * * %-EIO	- I/O error (including metadata corruption).
 * * %-ENOMEM	- Insufficient memory available.
 */
static int nilfs_free_segments(struct the_nilfs *nilfs, __u64 *segnumv,
			       size_t nsegs)
{
	size_t ndone;
	int ret;

	if (!nsegs)
		return 0;

	ret = nilfs_sufile_freev(nilfs->ns_sufile, segnumv, nsegs, &ndone);
	if (likely(!ret))
		return 0;

	/* Roll back the free operations already applied. */
	nilfs_sufile_cancel_freev(nilfs->ns_sufile, segnumv, ndone, NULL);

	/*
	 * If a segment usage of the segments to be freed is in a
	 * hole block, nilfs_sufile_freev() will return -ENOENT.
	 * In this case, -EINVAL should be returned to the caller
	 * since there is something wrong with the given segment
	 * number array.  This error can only occur during GC, so
	 * there is no need to worry about it propagating to other
	 * callers (such as fsync).
	 */
	if (ret == -ENOENT) {
		nilfs_err(nilfs->ns_sb,
			  "The segment usage entry %llu to be freed is invalid (in a hole)",
			  (unsigned long long)segnumv[ndone]);
		ret = -EINVAL;
	}
	return ret;
}
/*
 * NOTE(review): spliced fragments — the code below appears to come from at
 * least three different segment-buffer management functions (segment
 * allocation/continuation, segment-list extension, and cleanup of
 * incomplete logs).  Their openings and several middles are missing from
 * this chunk; confirm boundaries against the full file.
 */
	if (nilfs->ns_segnum == nilfs->ns_nextnum)
		/* Start from the head of a new full segment */
		alloc++;
	} else {	/* Continue logs */
		prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
		nilfs_segbuf_map_cont(segbuf, prev);
		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
		nextnum = prev->sb_nextnum;

	prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
	/*
	 * Since the segment specified with nextnum might be allocated during
	 * the previous construction, the buffer including its segusage may
	 * not be dirty. The following call ensures that the buffer is dirty
	 * and will pin the buffer on memory until the sufile is written.
	 */
	err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
	if (unlikely(err))
		return err;

	for (i = 0; i < nadd; i++) {
		/* extend segment info */
		err = -ENOMEM;
		segbuf = nilfs_segbuf_new(sci->sc_super);
		if (unlikely(!segbuf))
			goto failed;

		/* map this buffer to region of segment on-disk */
		nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
		sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;

		/* allocate the next next full segment */
		err = nilfs_sufile_alloc(sufile, &nextnextnum);
		if (unlikely(err))
			goto failed_segbuf;

	segbuf = NILFS_FIRST_SEGBUF(logs);
	if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
		WARN_ON(ret); /* never fails */
	}
	if (atomic_read(&segbuf->sb_err)) {
		/* Case 1: The first segment failed */
		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
			/*
			 * Case 1a: Partial segment appended into an existing
			 * segment
			 */
			nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
						segbuf->sb_fseg_end);
		else /* Case 1b: New full segment */
			set_nilfs_discontinued(nilfs);
	}

	prev = segbuf;
	list_for_each_entry_continue(segbuf, logs, sb_list) {
		if (prev->sb_nextnum != segbuf->sb_nextnum) {
			ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
			WARN_ON(ret); /* never fails */
		}
		if (atomic_read(&segbuf->sb_err) &&
		    segbuf->sb_segnum != nilfs->ns_nextnum)
			/* Case 2: extended segment (!= next) failed */
			nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
		prev = segbuf;
	}
}
static void nilfs_begin_folio_io(struct folio *folio)
{
	if (!folio || folio_test_writeback(folio))
		/*
		 * For split b-tree node pages, this function may be called
		 * twice. We ignore the 2nd or later calls by this check.
		 */
		return;
	/* NOTE(review): the rest of this function is missing from this chunk. */
/**
 * nilfs_prepare_write_logs - prepare to write logs
 * @logs: logs to prepare for writing
 * @seed: checksum seed value
 *
 * nilfs_prepare_write_logs() adds checksums and prepares the block
 * buffers/folios for writing logs. In order to stabilize folios of
 * memory-mapped file blocks by putting them in writeback state before
 * calculating the checksums, first prepare to write payload blocks other
 * than segment summary and super root blocks in which the checksums will
 * be embedded.
 */
static void nilfs_prepare_write_logs(struct list_head *logs, u32 seed)
{
	struct nilfs_segment_buffer *segbuf;
	struct folio *bd_folio = NULL, *fs_folio = NULL;
	struct buffer_head *bh;
	/*
	 * NOTE(review): the payload/segsum preparation loops described by the
	 * kernel-doc appear to be missing from this chunk — the code jumps
	 * straight to the super-root step.  As shown, bd_folio is still NULL
	 * when folio_lock(bd_folio) is reached; confirm against the full file.
	 */

	/* Prepare to write super root block */
	bh = NILFS_LAST_SEGBUF(logs)->sb_super_root;
	if (bh) {
		mark_buffer_dirty(bh);
		if (bh->b_folio != bd_folio) {
			folio_lock(bd_folio);
			folio_wait_writeback(bd_folio);
			folio_clear_dirty_for_io(bd_folio);
			folio_start_writeback(bd_folio);
			folio_unlock(bd_folio);
			bd_folio = bh->b_folio;
		}
	}

	if (bd_folio) {
		folio_lock(bd_folio);
		folio_wait_writeback(bd_folio);
		folio_clear_dirty_for_io(bd_folio);
		folio_start_writeback(bd_folio);
		folio_unlock(bd_folio);
	}
}
/*
 * nilfs_segctor_write - write out the constructed logs
 * @sci: segment constructor object
 * @nilfs: nilfs object
 *
 * Calls nilfs_write_logs() on the sc_segbufs list and then moves all of
 * those segment buffers onto the sc_write_logs list.
 *
 * Return: the result of nilfs_write_logs().
 */
static int nilfs_segctor_write(struct nilfs_sc_info *sci,
			       struct the_nilfs *nilfs)
{
	int err;

	err = nilfs_write_logs(&sci->sc_segbufs, nilfs);
	/* Hand the buffers over to the write-log list regardless of result. */
	list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
	return err;
}
static void nilfs_end_folio_io(struct folio *folio, int err)
{
	if (!folio)
		return;

	if (buffer_nilfs_node(folio_buffers(folio)) &&
	    !folio_test_writeback(folio)) {
		/*
		 * For b-tree node pages, this function may be called twice
		 * or more because they might be split in a segment.
		 */
		if (folio_test_dirty(folio)) {
			/*
			 * For pages holding split b-tree node buffers, dirty
			 * flag on the buffers may be cleared discretely.
			 * In that case, the page is once redirtied for
			 * remaining buffers, and it must be cancelled if
			 * all the buffers get cleaned later.
			 */
			folio_lock(folio);
			if (nilfs_folio_buffers_clean(folio))
				__nilfs_clear_folio_dirty(folio);
			folio_unlock(folio);
		}
		return;
	}

	if (err || !nilfs_folio_buffers_clean(folio))
		filemap_dirty_folio(folio->mapping, folio);
	/* NOTE(review): the end of this function is missing from this chunk. */
/*
 * NOTE(review): spliced fragments — the code below mixes a sufile
 * free-cancellation branch with loops that finish writeback of segment
 * summary and payload buffers.  The openings of the enclosing functions
 * (which declare 'ret', 'bh', 'segbuf', 'bd_folio') are missing from this
 * chunk; confirm boundaries against the full file.
 */
	if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
		ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
						sci->sc_freesegs,
						sci->sc_nfreesegs,
						NULL);
		WARN_ON(ret); /* do not happen */
	}

		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
				    b_assoc_buffers) {
			set_buffer_uptodate(bh);
			clear_buffer_dirty(bh);
			if (bh->b_folio != bd_folio) {
				if (bd_folio)
					folio_end_writeback(bd_folio);
				bd_folio = bh->b_folio;
			}
		}
		/*
		 * We assume that the buffers which belong to the same folio
		 * continue over the buffer list.
		 * Under this assumption, the last BHs of folios is
		 * identifiable by the discontinuity of bh->b_folio
		 * (folio != fs_folio).
		 *
		 * For B-tree node blocks, however, this assumption is not
		 * guaranteed. The cleanup code of B-tree node folios needs
		 * special care.
		 */
		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
				    b_assoc_buffers) {
			const unsigned long set_bits = BIT(BH_Uptodate);
			const unsigned long clear_bits =
				(BIT(BH_Dirty) | BIT(BH_Async_Write) |
				 BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
				 BIT(BH_NILFS_Redirected));

		if (!nilfs_segbuf_simplex(segbuf)) {
			if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
				sci->sc_lseg_stime = jiffies;
			}
			if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
		}
	}
	/*
	 * Since folios may continue over multiple segment buffers,
	 * end of the last folio must be checked outside of the loop.
	 */
	if (bd_folio)
		folio_end_writeback(bd_folio);
/*
 * NOTE(review): spliced fragments — the code below mixes a dirty-file
 * collection loop with the write/wait loop and failure path of the main
 * construction routine.  The enclosing function openings and several
 * middles are missing from this chunk; confirm against the full file.
 */
	spin_lock(&nilfs->ns_inode_lock);
 retry:
	list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
		if (!ii->i_bh) {
			struct buffer_head *ibh;
			int err;

		err = nilfs_segctor_write(sci, nilfs);
		if (unlikely(err))
			goto failed_to_write;

		if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
		    nilfs->ns_blocksize_bits != PAGE_SHIFT) {
			/*
			 * At this point, we avoid double buffering
			 * for blocksize < pagesize because page dirty
			 * flag is turned off during write and dirty
			 * buffers are not properly collected for
			 * pages crossing over segments.
			 */
			err = nilfs_segctor_wait(sci);
			if (err)
				goto failed_to_write;
		}
	} while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);

 failed_to_write:
 failed:
	if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE)
		nilfs_redirty_inodes(&sci->sc_dirty_files);
	if (nilfs_doing_gc())
		nilfs_redirty_inodes(&sci->sc_gc_inodes);
	nilfs_segctor_abort_construction(sci, nilfs, err);
	goto out;
}
/**
 * nilfs_segctor_start_timer - set timer of background write
 * @sci: nilfs_sc_info
 *
 * If the timer has already been set, it ignores the new request.
 * This function MUST be called within a section locking the segment
 * semaphore.
 */
static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
{
	spin_lock(&sci->sc_state_lock);
	if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
		goto out_unlock;	/* a request is already pending */

	if (sci->sc_task) {
		sci->sc_timer.expires = jiffies + sci->sc_interval;
		add_timer(&sci->sc_timer);
	}
	sci->sc_state |= NILFS_SEGCTOR_COMMIT;
out_unlock:
	spin_unlock(&sci->sc_state_lock);
}
/*
 * nilfs_segctor_do_flush - request a flush of the type given by bit number
 * @sci: segment constructor object
 * @bn: flush-request bit number to set in sc_flush_request
 *
 * Sets BIT(@bn) in sc_flush_request under sc_state_lock and wakes up the
 * log writer daemon if no flush request was pending beforehand.
 */
static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
{
	unsigned long flag = BIT(bn);

	spin_lock(&sci->sc_state_lock);
	if (!(sci->sc_flush_request & flag)) {
		unsigned long prev_req = sci->sc_flush_request;

		sci->sc_flush_request |= flag;
		/* Wake the daemon only on the idle -> pending transition. */
		if (!prev_req)
			wake_up(&sci->sc_wait_daemon);
	}
	spin_unlock(&sci->sc_state_lock);
}
/*
 * NOTE(review): fragment — the opening of the enclosing function (which
 * declares 'wait_req' and 'sci'; it appears to synchronize the caller with
 * the log writer thread) is not visible in this chunk, and the loop body
 * is cut off after the liveness check.  Confirm against the full file.
 */
	/*
	 * To prevent a race issue where completion notifications from the
	 * log writer thread are missed, increment the request sequence count
	 * "sc_seq_request" and insert a wait queue entry using the current
	 * sequence number into the "sc_wait_request" queue at the same time
	 * within the lock section of "sc_state_lock".
	 */
	spin_lock(&sci->sc_state_lock);
	wait_req.seq = ++sci->sc_seq_request;
	add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
	spin_unlock(&sci->sc_state_lock);

	wake_up(&sci->sc_wait_daemon);

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Synchronize only while the log writer thread is alive.
		 * Leave flushing out after the log writer thread exits to
		 * the cleanup work in nilfs_segctor_destroy().
		 */
		if (!sci->sc_task)
			break;