// SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.c * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Hugh Dickins <hughd@google.com> * Zheng Liu <wenqing.lz@taobao.com> * * Ext4 extents status tree core functions.
*/ #include <linux/list_sort.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include"ext4.h"
#include <trace/events/ext4.h>
/* * According to previous discussion in Ext4 Developer Workshop, we * will introduce a new structure called io tree to track all extent * status in order to solve some problems that we have met * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called * delay extent tree at the first commit. But for better understand * what it does, it has been rename to extent status tree. * * Step1: * Currently the first step has been done. All delayed extents are * tracked in the tree. It maintains the delayed extent when a delayed * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implemenmtation of extent * status tree and future works. * * Step2: * In this step all extent status are tracked by extent status tree. * Thus, we can first try to lookup a block mapping in this tree before * finding it in extent tree. Hence, single extent cache can be removed * because extent status tree can do a better job. Extents in status * tree are loaded on-demand. Therefore, the extent status tree may not * contain all of the extents in a file. Meanwhile we define a shrinker * to reclaim memory from extent status tree because fragmented extent * tree will make status tree cost too much memory. written/unwritten/- * hole extents in the tree will be reclaimed by this shrinker when we * are under high memory pressure. Delayed extents will not be * reclimed because fiemap, bigalloc, and seek_data/hole need it.
*/
/* * Extent status tree implementation for ext4. * * * ========================================================================== * Extent status tree tracks all extent status. * * 1. Why we need to implement extent status tree? * * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a * block or a range of blocks are belonged to a delayed extent. * * Let us have a look at how they do without extent status tree. * -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * * -- SEEK_HOLE/DATA * SEEK_HOLE/DATA has the same problem as FIEMAP. * * -- bigalloc * bigalloc looks up page cache to figure out if a block is * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped, If there are not very many delayed buffers, then it is * time consuming. * * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation(belonged to a delayed extent) or * not by searching the extent tree. * * * ========================================================================== * 2. Ext4 extent status tree impelmentation * * -- extent * A extent is a range of blocks which are contiguous logically and * physically. Unlike extent in extent tree, this extent in ext4 is * a in-memory struct, there is no corresponding on-disk data. There * is no limit on length of extent, so an extent can contain as many * blocks as they are contiguous logically and physically. * * -- extent status tree * Every inode has an extent status tree and all allocation blocks * are added to the tree with different status. The extent in the * tree are ordered by logical block no. * * -- operations on a extent status tree * There are three important operations on a delayed extent tree: find * next extent, adding a extent(a range of blocks) and removing a extent. * * -- race on a extent status tree * Extent status tree is protected by inode->i_es_lock. * * -- memory consumption * Fragmented extent tree will make extent status tree cost too much * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * * ========================================================================== * 3. Assurance of Ext4 extent status tree consistency * * When mapping blocks, Ext4 queries the extent status tree first and should * always trusts that the extent status tree is consistent and up to date. * Therefore, it is important to adheres to the following rules when createing, * modifying and removing extents. * * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, * the extent information should always be processed through the extent * status tree instead of being organized manually through the on-disk * extent tree. * * 2. When updating the extent tree, Ext4 should acquire the i_data_sem * exclusively and update the extent status tree atomically. If the extents * to be modified are large enough to exceed the range that a single * i_data_sem can process (as ext4_datasem_ensure_credits() may drop * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() * does): * * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against page faults, as well as reads and writes that may * concurrently modify the extent status tree. * b) Evict all page cache in the affected range and recommend rebuilding * or dropping the extent status tree after modifying the on-disk * extent tree. This ensures exclusion against concurrent writebacks * that do not hold those locks but only holds a folio lock. * * 3. Based on the rules above, when querying block mappings, Ext4 should at * least hold the i_rwsem or invalidate_lock or folio lock(s) for the * specified querying range. * * ========================================================================== * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. * * -- gain * 2. Code is much simpler, more readable, more maintainable and * more efficient. * * * ========================================================================== * 5. TODO list * * -- Refactor delayed space reservation * * -- Extent-level locking
*/
/* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent.
*/ staticstruct extent_status *__es_tree_search(struct rb_root *root,
ext4_lblk_t lblk)
{ struct rb_node *node = root->rb_node; struct extent_status *es = NULL;
while (node) {
es = rb_entry(node, struct extent_status, rb_node); if (lblk < es->es_lblk)
node = node->rb_left; elseif (lblk > ext4_es_end(es))
node = node->rb_right; else return es;
}
/* * ext4_es_find_extent_range - find extent with specified status within block * range or next extent following block range in * extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * @es - extent found, if any * * Find the first extent within the block range specified by @lblk and @end * in the extents status tree that satisfies @matching_fn. If a match * is found, it's returned in @es. If not, and a matching extent is found * beyond the block range, it's returned in @es. If no match is found, an * extent is returned in @es whose es_lblk, es_len, and es_pblk components * are 0.
*/ staticvoid __es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es)
{ struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node;
WARN_ON(es == NULL);
WARN_ON(end < lblk);
tree = &EXT4_I(inode)->i_es_tree;
/* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
es_debug("%u cached by [%u/%u) %llu %x\n",
lblk, es1->es_lblk, es1->es_len,
ext4_es_pblock(es1), ext4_es_status(es1)); goto out;
}
es1 = __es_tree_search(&tree->root, lblk);
out: if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) {
es1 = NULL; break;
} if (matching_fn(es1)) break;
}
}
/* * __es_scan_range - search block range for block with specified status * in extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * * Returns true if at least one block in the specified block range satisfies * the criterion specified by @matching_fn, and false if not. If at least * one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock.
*/ staticbool __es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es),
ext4_lblk_t start, ext4_lblk_t end)
{ struct extent_status es;
__es_find_extent_range(inode, matching_fn, start, end, &es); if (es.es_len == 0) returnfalse; /* no matching extent in the tree */ elseif (es.es_lblk <= start &&
start < es.es_lblk + es.es_len) returntrue; elseif (start <= es.es_lblk && es.es_lblk <= end) returntrue; else returnfalse;
} /* * Locking for __es_scan_range() for external use
*/ bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end)
{ bool ret;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) returnfalse;
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
return ret;
}
/* * __es_scan_clu - search cluster for block with specified status in * extents status tree * * @inode - file containing the cluster * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block in cluster to be searched * * Returns true if at least one extent in the cluster containing @lblk * satisfies the criterion specified by @matching_fn, and false if not. If at * least one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock.
*/ staticbool __es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t lblk_start, lblk_end;
/* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes.
*/ staticinlinebool ext4_es_must_keep(struct extent_status *es)
{ /* fiemap, bigalloc, and seek_data/hole need to use it. */ if (ext4_es_is_delayed(es)) returntrue;
returnfalse;
}
staticinlinestruct extent_status *__es_alloc_extent(bool nofail)
{ if (!nofail) return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
/* We never try to reclaim a must kept extent, so we don't count it. */ if (!ext4_es_must_keep(es)) { if (!EXT4_I(inode)->i_es_shk_nr++)
ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_shk_cnt);
}
/* Decrease the shrink counter when we can reclaim the extent. */ if (!ext4_es_must_keep(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); if (!--EXT4_I(inode)->i_es_shk_nr)
ext4_es_list_del(inode);
percpu_counter_dec(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_shk_cnt);
}
__es_free_extent(es);
}
/* * Check whether or not two extents can be merged * Condition: * - logical block number is contiguous * - physical block number is contiguous * - status is equal
*/ staticint ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2)
{ if (ext4_es_type(es1) != ext4_es_type(es2)) return 0;
if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
pr_warn("ES assertion failed when merging extents. " "The sum of lengths of es1 (%d) and es2 (%d) " "is bigger than allowed file size (%d)\n",
es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
WARN_ON(1); return 0;
}
if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0;
/* * Make sure ex and es are not overlap when we try to insert * a delayed/hole extent.
*/ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) {
pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add a delayed/hole extent " "[%d/%d/%llu/%x]\n",
inode->i_ino, ee_block, ee_len,
ee_start, ee_status ? 'u' : 'w',
es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
} goto out;
}
/* * We don't check ee_block == es->es_lblk, etc. because es * might be a part of whole extent, vice versa.
*/ if (es->es_lblk < ee_block ||
ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
ext4_es_pblock(es), es_status ? 'u' : 'w'); goto out;
}
if (ee_status ^ es_status) {
pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
ext4_es_pblock(es), es_status ? 'u' : 'w');
}
} else { /* * We can't find an extent on disk. So we need to make sure * that we don't want to add an written/unwritten extent.
*/ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " "to add a written/unwritten extent " "[%d/%d/%llu/%x]\n", inode->i_ino,
es->es_lblk, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
}
}
out:
ext4_free_ext_path(path);
}
/* * Here we call ext4_ind_map_blocks to lookup a block mapping because * 'Indirect' structure is defined in indirect.c. So we couldn't * access direct/indirect tree from outside. It is too dirty to define * this function in indirect.c file.
*/
map.m_lblk = es->es_lblk;
map.m_len = es->es_len;
retval = ext4_ind_map_blocks(NULL, inode, &map, 0); if (retval > 0) { if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { /* * We want to add a delayed/hole extent but this * block has been allocated.
*/
pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%x]\n",
inode->i_ino, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es)); return;
} elseif (ext4_es_is_written(es)) { if (retval != es->es_len) {
pr_warn("ES insert assertion failed for " "inode: %lu retval %d != es_len %d\n",
inode->i_ino, retval, es->es_len); return;
} if (map.m_pblk != ext4_es_pblock(es)) {
pr_warn("ES insert assertion failed for " "inode: %lu m_pblk %llu != " "es_pblk %llu\n",
inode->i_ino, map.m_pblk,
ext4_es_pblock(es)); return;
}
} else { /* * We don't need to check unwritten extent because * indirect-based file doesn't have it.
*/
BUG();
}
} elseif (retval == 0) { if (ext4_es_is_written(es)) {
pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " "a written extent [%d/%d/%llu/%x]\n",
inode->i_ino, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es)); return;
}
}
}
staticinlinevoid ext4_es_insert_extent_check(struct inode *inode, struct extent_status *es)
{ /* * We don't need to worry about the race condition because * caller takes i_data_sem locking.
*/
BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ext4_es_insert_extent_ext_check(inode, es); else
ext4_es_insert_extent_ind_check(inode, es);
} #else staticinlinevoid ext4_es_insert_extent_check(struct inode *inode, struct extent_status *es)
{
} #endif
while (*p) {
parent = *p;
es = rb_entry(parent, struct extent_status, rb_node);
if (newes->es_lblk < es->es_lblk) { if (ext4_es_can_be_merged(newes, es)) { /* * Here we can modify es_lblk directly * because it isn't overlapped.
*/
es->es_lblk = newes->es_lblk;
es->es_len += newes->es_len; if (ext4_es_is_written(es) ||
ext4_es_is_unwritten(es))
ext4_es_store_pblock(es,
newes->es_pblk);
es = ext4_es_try_to_merge_left(inode, es); goto out;
}
p = &(*p)->rb_left;
} elseif (newes->es_lblk > ext4_es_end(es)) { if (ext4_es_can_be_merged(es, newes)) {
es->es_len += newes->es_len;
es = ext4_es_try_to_merge_right(inode, es); goto out;
}
p = &(*p)->rb_right;
} else {
BUG(); return -EINVAL;
}
}
if (prealloc)
es = prealloc; else
es = __es_alloc_extent(false); if (!es) return -ENOMEM;
ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
newes->es_pblk);
err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es1) { if (!es1->es_len)
__es_free_extent(es1);
es1 = NULL;
}
err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
err2 = 0; if (err2 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es2) { if (!es2->es_len)
__es_free_extent(es2);
es2 = NULL;
}
if (revise_pending) {
err3 = __revise_pending(inode, lblk, len, &pr); if (err3 < 0) goto error; if (pr) {
__free_pending(pr);
pr = NULL;
}
pending = err3;
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock); /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. * * When direct allocating (from fallocate, filemap, DIO, or clusters * allocated when delalloc has been disabled by ext4_nonda_switch()) * an extent either 1) contains delayed blocks but start with * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed * allocated blocks which belong to delayed allocated clusters when * bigalloc feature is enabled, quota has already been claimed by * ext4_mb_new_blocks(), so release the quota reservations made for * any previously delayed allocated clusters instead of claim them * again.
*/
resv_used += pending; if (resv_used)
ext4_da_update_reserve_space(inode, resv_used,
delalloc_reserve_used);
if (err1 || err2 || err3 < 0) goto retry;
ext4_es_print_tree(inode); return;
}
/* * ext4_es_cache_extent() inserts information into the extent status * tree if and only if there isn't information about the range in * question already.
*/ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk, unsignedint status)
{ struct extent_status *es; struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return;
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); if (!es || es->es_lblk > end)
__es_insert_extent(inode, &newes, NULL);
write_unlock(&EXT4_I(inode)->i_es_lock);
}
/* * ext4_es_lookup_extent() looks up an extent in extent status tree. * * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. * * Return: 1 on found, 0 on not
*/ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t *next_lblk, struct extent_status *es)
{ struct ext4_es_tree *tree; struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return 0;
trace_ext4_es_lookup_extent_enter(inode, lblk);
es_debug("lookup extent in block %u\n", lblk);
tree = &EXT4_I(inode)->i_es_tree;
read_lock(&EXT4_I(inode)->i_es_lock);
/* find extent in cache firstly */
es->es_lblk = es->es_len = es->es_pblk = 0;
es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
es_debug("%u cached by [%u/%u)\n",
lblk, es1->es_lblk, es1->es_len);
found = 1; goto out;
}
/* * init_rsvd - initialize reserved count data before removing block range * in file from extent status tree * * @inode - file containing range * @lblk - first block in range * @es - pointer to first extent in range * @rc - pointer to reserved count data * * Assumes es is not NULL
*/ staticvoid init_rsvd(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es, struct rsvd_count *rc)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *node;
rc->ndelayed = 0;
/* * for bigalloc, note the first delayed block in the range has not * been found, record the extent containing the block to the left of * the region to be removed, if any, and note that there's no partial * cluster to track
*/ if (sbi->s_cluster_ratio > 1) {
rc->first_do_lblk_found = false; if (lblk > es->es_lblk) {
rc->left_es = es;
} else {
node = rb_prev(&es->rb_node);
rc->left_es = node ? rb_entry(node, struct extent_status,
rb_node) : NULL;
}
rc->partial = false;
}
}
/* * count_rsvd - count the clusters containing delayed blocks in a range * within an extent and add to the running tally in rsvd_count * * @inode - file containing extent * @lblk - first block in range * @len - length of range in blocks * @es - pointer to extent containing clusters to be counted * @rc - pointer to reserved count data * * Tracks partial clusters found at the beginning and end of extents so * they aren't overcounted when they span adjacent extents
*/ staticvoid count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, struct extent_status *es, struct rsvd_count *rc)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu;
i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
/* record the first block of the first delayed extent seen */ if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
}
/* update the last lblk in the region seen so far */
rc->last_do_lblk = end;
/* * if we're tracking a partial cluster and the current extent * doesn't start with it, count it and stop tracking
*/ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
rc->ndelayed++;
rc->partial = false;
}
/* * if the first cluster doesn't start on a cluster boundary but * ends on one, count it
*/ if (EXT4_LBLK_COFF(sbi, i) != 0) { if (end >= EXT4_LBLK_CFILL(sbi, i)) {
rc->ndelayed++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
}
/* * if the current cluster starts on a cluster boundary, count the * number of whole delayed clusters in the extent
*/ if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
rc->ndelayed += nclu;
i += nclu << sbi->s_cluster_bits;
}
/* * start tracking a partial cluster if there's a partial at the end * of the current extent and we're not already tracking one
*/ if (!rc->partial && i <= end) {
rc->partial = true;
rc->lclu = EXT4_B2C(sbi, i);
}
}
/* * __pr_tree_search - search for a pending cluster reservation * * @root - root of pending reservation tree * @lclu - logical cluster to search for * * Returns the pending reservation for the cluster identified by @lclu * if found. If not, returns a reservation for the next cluster if any, * and if not, returns NULL.
*/ staticstruct pending_reservation *__pr_tree_search(struct rb_root *root,
ext4_lblk_t lclu)
{ struct rb_node *node = root->rb_node; struct pending_reservation *pr = NULL;
/* * get_rsvd - calculates and returns the number of cluster reservations to be * released when removing a block range from the extent status tree * and releases any pending reservations within the range * * @inode - file containing block range * @end - last block in range * @right_es - pointer to extent containing next block beyond end or NULL * @rc - pointer to reserved count data * * The number of reservations to be released is equal to the number of * clusters containing delayed blocks within the range, minus the number of * clusters still containing delayed blocks at the ends of the range, and * minus the number of pending reservations within the range.
*/ staticunsignedint get_rsvd(struct inode *inode, ext4_lblk_t end, struct extent_status *right_es, struct rsvd_count *rc)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu; bool left_delayed, right_delayed, count_pending; struct extent_status *es;
if (sbi->s_cluster_ratio > 1) { /* count any remaining partial cluster */ if (rc->partial)
rc->ndelayed++;
/* * decrease the delayed count by the number of clusters at the * ends of the range that still contain delayed blocks - * these clusters still need to be reserved
*/
left_delayed = right_delayed = false;
es = rc->left_es; while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { if (ext4_es_is_delayed(es)) {
rc->ndelayed--;
left_delayed = true; break;
}
node = rb_prev(&es->rb_node); if (!node) break;
es = rb_entry(node, struct extent_status, rb_node);
} if (right_es && (!left_delayed || first_lclu != last_lclu)) { if (end < ext4_es_end(right_es)) {
es = right_es;
} else {
node = rb_next(&right_es->rb_node);
es = node ? rb_entry(node, struct extent_status,
rb_node) : NULL;
} while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { if (ext4_es_is_delayed(es)) {
rc->ndelayed--;
right_delayed = true; break;
}
node = rb_next(&es->rb_node); if (!node) break;
es = rb_entry(node, struct extent_status,
rb_node);
}
}
/* * Determine the block range that should be searched for * pending reservations, if any. Clusters on the ends of the * original removed range containing delayed blocks are * excluded. They've already been accounted for and it's not * possible to determine if an associated pending reservation * should be released with the information available in the * extents status tree.
*/ if (first_lclu == last_lclu) { if (left_delayed | right_delayed)
count_pending = false; else
count_pending = true;
} else { if (left_delayed)
first_lclu++; if (right_delayed)
last_lclu--; if (first_lclu <= last_lclu)
count_pending = true; else
count_pending = false;
}
/* * a pending reservation found between first_lclu and last_lclu * represents an allocated cluster that contained at least one * delayed block, so the delayed total must be reduced by one * for each pending reservation found and released
*/ if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu); while (pr && pr->lclu <= last_lclu) {
rc->ndelayed--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr); if (!node) break;
pr = rb_entry(node, struct pending_reservation,
rb_node);
}
}
} return rc->ndelayed;
}
/* * __es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @end - last block in range * @reserved - number of cluster reservations released * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc * enabled cancels pending reservations as needed. Returns 0 on success, * error code on failure.
*/ staticint __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end, int *reserved, struct extent_status *prealloc)
{ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; struct extent_status *es; struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block; int err = 0; bool count_reserved = true; struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
es = __es_tree_search(&tree->root, lblk); if (!es) goto out; if (es->es_lblk > end) goto out;
/* * ext4_es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @len - number of blocks to remove * * Reduces block/cluster reservation count and for bigalloc cancels pending * reservations as needed.
*/ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
ext4_lblk_t end; int err = 0; int reserved = 0; struct extent_status *es = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
if (!len) return;
end = lblk + len - 1;
BUG_ON(end < lblk);
retry: if (err && !es)
es = __es_alloc_extent(true); /* * ext4_clear_inode() depends on us taking i_es_lock unconditionally * so that we are sure __es_shrink() is done with the inode before it * is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved, es); /* Free preallocated extent if it didn't get used. */ if (es) { if (!es->es_len)
__es_free_extent(es);
es = NULL;
}
write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry;
retry:
spin_lock(&sbi->s_es_lock);
nr_to_walk = sbi->s_es_nr_inode; while (nr_to_walk-- > 0) { if (list_empty(&sbi->s_es_list)) {
spin_unlock(&sbi->s_es_lock); goto out;
}
ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
i_es_list); /* Move the inode to the tail */
list_move_tail(&ei->i_es_list, &sbi->s_es_list);
/* * Normally we try hard to avoid shrinking precached inodes, * but we will as a last resort.
*/ if (!retried && ext4_test_inode_state(&ei->vfs_inode,
EXT4_STATE_EXT_PRECACHED)) {
nr_skipped++; continue;
}
if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
nr_skipped++; continue;
} /* * Now we hold i_es_lock which protects us from inode reclaim * freeing inode under us
*/
spin_unlock(&sbi->s_es_lock);
if (nr_to_scan <= 0) goto out;
spin_lock(&sbi->s_es_lock);
}
spin_unlock(&sbi->s_es_lock);
/* * If we skipped any inodes, and we weren't able to make any * forward progress, try again to scan precached inodes.
*/ if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++; goto retry;
}
if (locked_ei && nr_shrunk == 0)
nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
/* here we just find an inode that has the max nr. of objects */
spin_lock(&sbi->s_es_lock);
list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr)
max = ei; elseif (!max)
max = ei;
}
spin_unlock(&sbi->s_es_lock);
/* * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at * most *nr_to_scan extents, update *nr_to_scan accordingly. * * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. * Increment *nr_shrunk by the number of reclaimed extents. Also update * ei->i_es_shrink_lblk to where we should continue scanning.
*/ staticint es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, int *nr_to_scan, int *nr_shrunk)
{ struct inode *inode = &ei->vfs_inode; struct ext4_es_tree *tree = &ei->i_es_tree; struct extent_status *es; struct rb_node *node;
es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap;
while (*nr_to_scan > 0) { if (es->es_lblk > end) {
ei->i_es_shrink_lblk = end + 1; return 0;
}
(*nr_to_scan)--;
node = rb_next(&es->rb_node);
if (ext4_es_must_keep(es)) goto next; if (ext4_es_is_referenced(es)) {
ext4_es_clear_referenced(es); goto next;
}
/* * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove * discretionary entries from the extent status cache. (Some entries * must be present for proper operations.)
*/ void ext4_clear_inode_es(struct inode *inode)
{ struct ext4_inode_info *ei = EXT4_I(inode); struct extent_status *es; struct ext4_es_tree *tree; struct rb_node *node;
write_lock(&ei->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
tree->cache_es = NULL;
node = rb_first(&tree->root); while (node) {
es = rb_entry(node, struct extent_status, rb_node);
node = rb_next(node); if (!ext4_es_must_keep(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
}
}
ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
write_unlock(&ei->i_es_lock);
}
/* * __get_pending - retrieve a pointer to a pending reservation * * @inode - file containing the pending cluster reservation * @lclu - logical cluster of interest * * Returns a pointer to a pending reservation if it's a member of * the set, and NULL if not. Must be called holding i_es_lock.
*/ staticstruct pending_reservation *__get_pending(struct inode *inode,
ext4_lblk_t lclu)
{ struct ext4_pending_tree *tree; struct rb_node *node; struct pending_reservation *pr = NULL;
tree = &EXT4_I(inode)->i_pending_tree;
node = (&tree->root)->rb_node;
/* * __insert_pending - adds a pending cluster reservation to the set of * pending reservations * * @inode - file containing the cluster * @lblk - logical block in the cluster to be added * @prealloc - preallocated pending entry * * Returns 1 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully.
*/ staticint __insert_pending(struct inode *inode, ext4_lblk_t lblk, struct pending_reservation **prealloc)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct pending_reservation *pr;
ext4_lblk_t lclu; int ret = 0;
lclu = EXT4_B2C(sbi, lblk); /* search to find parent for insertion */ while (*p) {
parent = *p;
pr = rb_entry(parent, struct pending_reservation, rb_node);
if (lclu < pr->lclu) {
p = &(*p)->rb_left;
} elseif (lclu > pr->lclu) {
p = &(*p)->rb_right;
} else { /* pending reservation already inserted */ goto out;
}
}
if (likely(*prealloc == NULL)) {
pr = __alloc_pending(false); if (!pr) {
ret = -ENOMEM; goto out;
}
} else {
pr = *prealloc;
*prealloc = NULL;
}
pr->lclu = lclu;
rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root);
ret = 1;
out: return ret;
}
/* * __remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Returns successfully if pending reservation is not a member of the set.
*/ staticvoid __remove_pending(struct inode *inode, ext4_lblk_t lblk)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree;
pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); if (pr != NULL) {
tree = &EXT4_I(inode)->i_pending_tree;
rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr);
}
}
/* * ext4_remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Locking for external use of __remove_pending.
*/ void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
{ struct ext4_inode_info *ei = EXT4_I(inode);
/* * ext4_is_pending - determine whether a cluster has a pending reservation * on it * * @inode - file containing the cluster * @lblk - logical block in the cluster * * Returns true if there's a pending reservation for the cluster in the * set of pending reservations, and false if not.
*/ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); bool ret;
read_lock(&ei->i_es_lock);
ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
read_unlock(&ei->i_es_lock);
return ret;
}
/* * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents * status tree, adding a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * @lclu_allocated/end_allocated - indicates whether a physical cluster has * been allocated for the logical cluster * that contains the start/end block. Note that * end_allocated should always be set to false * if the start and the end block are in the * same cluster
*/ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, bool lclu_allocated, bool end_allocated)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status newes;
ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; struct pending_reservation *pr1 = NULL; struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return;
es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
lblk, len, inode->i_ino); if (!len) return;
/* * __revise_pending - makes, cancels, or leaves unchanged pending cluster * reservations for a specified block range depending * upon the presence or absence of delayed blocks * outside the range within clusters at the ends of the * range * * @inode - file containing the range * @lblk - logical block defining the start of range * @len - length of range in blocks * @prealloc - preallocated pending entry * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten * status. Must be called while holding i_es_lock. Returns number of new * inserts pending cluster on insert pendings, returns 0 on remove pendings, * return -ENOMEM on failure.
*/ staticint __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, struct pending_reservation **prealloc)
{ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last; bool f_del = false, l_del = false; int pendings = 0; int ret = 0;
if (len == 0) return 0;
/* * Two cases - block range within single cluster and block range * spanning two or more clusters. Note that a cluster belonging * to a range starting and/or ending on a cluster boundary is treated * as if it does not contain a delayed extent. The new range may * have allocated space for previously delayed blocks out to the * cluster boundary, requiring that any pre-existing pending * reservation be canceled. Because this code only looks at blocks * outside the range, it should revise pending reservations * correctly even if the extent represented by the range can't be * inserted in the extents status tree due to ENOSPC.
*/
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1); if (f_del) {
ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out;
pendings += ret;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1; if (last != end)
l_del = __es_scan_range(inode,
&ext4_es_is_delayed,
end + 1, last); if (l_del) {
ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out;
pendings += ret;
} else
__remove_pending(inode, last);
}
} else {
first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1); if (f_del) {
ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out;
pendings += ret;
} else
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.25 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.