// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com>
*/
/* * mballoc.c contains the multiblocks allocation routines
*/
/* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling
*/
/* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of xarrays. Group-infos are * placed in appropriate xarrays. * * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * * Locking: Writers use xa_lock, readers use rcu_read_lock. * * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special xarray for completely empty * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed * in appropriate xarrays. * * In xarray, the index is the block group number, the value is the block group * information, and a non-empty value indicates the block group is present in * the current xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request.
*/
/* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init *
*/
/* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations:
*/
/* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast
*/ staticstruct kmem_cache *ext4_pspace_cachep; staticstruct kmem_cache *ext4_ac_cachep; staticstruct kmem_cache *ext4_free_data_cachep;
/* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for
* each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 staticstruct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
/* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
*/ static DEFINE_PER_CPU(u64, discard_pa_seq); staticinline u64 ext4_get_discard_pa_seq_sum(void)
{ int __cpu;
u64 __seq = 0;
#ifdef DOUBLE_CHECK staticvoid mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count)
{ int i; struct super_block *sb = e4b->bd_sb;
if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return;
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
ext4_fsblk_t blocknr;
staticvoid mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{ int i;
if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) {
BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
}
}
staticvoid mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{ if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsignedchar *b1, *b2; int i;
b1 = (unsignedchar *) e4b->bd_info->bb_bitmap;
b2 = (unsignedchar *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) {
ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc",
e4b->bd_group, i, i * 8, b1[i], b2[i]);
BUG();
}
}
}
}
#define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \
printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \
function, file, line, # assert); \
BUG(); \
} \
} while (0)
staticvoid __mb_check_buddy(struct ext4_buddy *e4b, char *file, constchar *function, int line)
{ struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2;
if (e4b->bd_info->bb_check_counter++ % 10) return;
if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) {
MB_CHECK_ASSERT(
mb_test_bit((i<<1)+1, buddy2));
} continue;
}
/* both bits in buddy2 must be 1 */
MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
fstart = -1;
buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) {
MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) {
fragments++;
fstart = i;
} continue;
}
fstart = -1; /* check used bits only */ for (j = 0; j < e4b->bd_blkbits + 1; j++) {
buddy2 = mb_find_buddy(e4b, j, &max2);
k = i >> j;
MB_CHECK_ASSERT(k < max2);
MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
}
}
MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr; struct ext4_prealloc_space *pa;
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++)
MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
}
} #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
__FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif
/* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size.
*/ staticvoid ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp)
{ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t min;
ext4_grpblk_t max;
ext4_grpblk_t chunk; unsignedint border;
BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
while (len > 0) { /* find how many blocks can be covered since this position */
max = ffs(first | border) - 1;
/* find how many blocks of power 2 we need to mark */
min = fls(len) - 1;
if (max < min)
min = max;
chunk = 1 << min;
/* mark multiblock chunks only */
grp->bb_counters[min]++; if (min > 0)
mb_clear_bit(first >> min,
buddy + sbi->s_mb_offsets[min]);
len -= chunk;
first += chunk;
}
}
staticint mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
{ int order;
/* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups.
*/
order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb))
order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
order = MB_NUM_ORDERS(sb) - 1; return order;
}
/* Move group to appropriate avg_fragment_size list */ staticvoid
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{ struct ext4_sb_info *sbi = EXT4_SB(sb); intnew, old;
if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return;
old = grp->bb_avg_fragment_size_order; new = grp->bb_fragments == 0 ? -1 :
mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new == old) return;
if (old >= 0)
xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group);
grp->bb_avg_fragment_size_order = new; if (new >= 0) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on.
*/ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new],
grp->bb_group, grp, GFP_ATOMIC); if (err)
mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d",
grp->bb_group, new, err);
}
}
/* * Find a suitable group of given order from the largest free orders xarray.
*/ staticinlineint
ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start,
ext4_group_t end)
{ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
/* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update.
*/ staticint ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
ext4_group_t group)
{ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i; int ret = 0;
ext4_group_t start, end;
start = group;
end = ext4_get_groups_count(ac->ac_sb);
wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret;
} if (start) {
end = start;
start = 0; goto wrap_around;
}
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
/* Increment cr and search again if no group is found */
ac->ac_criteria = CR_GOAL_LEN_FAST; return ret;
}
/* * Find a suitable group of given order from the average fragments xarray.
*/ staticint
ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, int order, ext4_group_t start,
ext4_group_t end)
{ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
/* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update.
*/ staticint ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
ext4_group_t group)
{ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, ret = 0;
ext4_group_t start, end;
start = group;
end = ext4_get_groups_count(ac->ac_sb);
wrap_around:
i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret;
} if (start) {
end = start;
start = 0; goto wrap_around;
}
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
*/ if (ac->ac_flags & EXT4_MB_HINT_DATA)
ac->ac_criteria = CR_BEST_AVAIL_LEN; else
ac->ac_criteria = CR_GOAL_LEN_SLOW;
return ret;
}
/* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case.
*/ staticint ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
ext4_group_t group)
{ int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int i, order, min_order; unsignedlong num_stripe_clusters = 0;
ext4_group_t start, end;
/* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length.
*/
order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb)))
order = MB_NUM_ORDERS(ac->ac_sb);
min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0)
min_order = 0;
if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early.
*/
num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters
*/
min_order = fls(num_stripe_clusters) - 1;
}
if (1 << min_order < ac->ac_o_ex.fe_len)
min_order = fls(ac->ac_o_ex.fe_len);
start = group;
end = ext4_get_groups_count(ac->ac_sb);
wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations.
*/
ac->ac_g_ex.fe_len = 1 << i;
if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency.
*/
ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
num_stripe_clusters);
}
ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
start, end); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret;
} if (start) {
end = start;
start = 0; goto wrap_around;
}
/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
ac->ac_criteria = CR_GOAL_LEN_SLOW;
return ret;
}
staticinlineint should_optimize_scan(struct ext4_allocation_context *ac)
{ if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1;
}
/* * next linear group for allocation.
*/ staticvoid next_linear_group(ext4_group_t *group, ext4_group_t ngroups)
{ /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop.
*/
*group = *group + 1 >= ngroups ? 0 : *group + 1;
}
for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) {
ret = ext4_mb_scan_group(ac, group); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret;
cond_resched();
}
*start = group; if (count == ngroups)
ac->ac_criteria++;
/* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups)
atomic64_inc(&sbi->s_bal_cX_failed[cr]);
return 0;
}
staticint ext4_mb_scan_groups(struct ext4_allocation_context *ac)
{ int ret = 0;
ext4_group_t start; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
/* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;
/* searching for the right group start from the goal value specified */
start = ac->ac_g_ex.fe_group;
ac->ac_prefetch_grp = start;
ac->ac_prefetch_nr = 0;
if (!should_optimize_scan(ac)) return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups);
/* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan.
*/ if (sbi->s_mb_max_linear_groups)
ret = ext4_mb_scan_groups_linear(ac, ngroups, &start,
sbi->s_mb_max_linear_groups); if (ret || ac->ac_status != AC_STATUS_CONTINUE) return ret;
switch (ac->ac_criteria) { case CR_POWER2_ALIGNED: return ext4_mb_scan_groups_p2_aligned(ac, start); case CR_GOAL_LEN_FAST: return ext4_mb_scan_groups_goal_fast(ac, start); case CR_BEST_AVAIL_LEN: return ext4_mb_scan_groups_best_avail(ac, start); default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here.
*/
WARN_ON(1);
}
return 0;
}
/* * Cache the order of the largest free extent we have available in this block * group.
*/ staticvoid
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{ struct ext4_sb_info *sbi = EXT4_SB(sb); intnew, old = grp->bb_largest_free_order;
for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) if (grp->bb_counters[new] > 0) break;
/* No need to move between order lists? */ if (new == old) return;
if (old >= 0) { struct xarray *xa = &sbi->s_mb_largest_free_orders[old];
if (!xa_empty(xa) && xa_load(xa, grp->bb_group))
xa_erase(xa, grp->bb_group);
}
grp->bb_largest_free_order = new; if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { /* * Cannot use __GFP_NOFAIL because we hold the group lock. * Although allocation for insertion may fails, it's not fatal * as we have linear traversal to fall back on.
*/ int err = xa_insert(&sbi->s_mb_largest_free_orders[new],
grp->bb_group, grp, GFP_ATOMIC); if (err)
mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d",
grp->bb_group, new, err);
}
}
/* initialize buddy from bitmap which is aggregation
* of on-disk bitmap and preallocations */
i = mb_find_next_zero_bit(bitmap, max, 0);
grp->bb_first_free = i; while (i < max) {
fragments++;
first = i;
i = mb_find_next_bit(bitmap, max, i);
len = i - first;
free += len; if (len > 1)
ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else
grp->bb_counters[0]++; if (i < max)
i = mb_find_next_zero_bit(bitmap, max, i);
}
grp->bb_fragments = fragments;
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters",
free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
ext4_mark_group_bitmap_corrupted(sb, group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_set_largest_free_order(sb, grp);
mb_update_avg_fragment_size(sb, grp);
/* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * * Locking note: This routine takes the block group lock of all groups * for this page; do not hold this lock when calling this routine!
*/
staticint ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
ext4_group_t ngroups; unsignedint blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i;
ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo;
/* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) {
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM;
} else
bh = &bhs;
first_group = folio->index * blocks_per_page / 2;
/* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break;
grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task.
*/ if (folio_test_uptodate(folio) &&
!EXT4_MB_GRP_NEED_INIT(grinfo)) {
bh[i] = NULL; continue;
}
bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL; goto out;
}
mb_debug(sb, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { int err2;
if (!bh[i]) continue;
err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err)
err = err2;
}
first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) {
group = (first_block + i) >> 1; if (group >= ngroups) break;
/* * data carry information regarding this * particular group in the format specified * above *
*/
data = folio_address(folio) + (i * blocksize);
bitmap = bh[group - first_group]->b_data;
/* * We place the buddy block and bitmap block * close together
*/
grinfo = ext4_get_group_info(sb, group); if (!grinfo) {
err = -EFSCORRUPTED; goto out;
} if ((first_block + i) & 1) { /* this is block of buddy */
BUG_ON(incore == NULL);
mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
group, folio->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) *
(MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below
*/
ext4_lock_group(sb, group); /* init the buddy */
memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
ext4_unlock_group(sb, group);
incore = NULL;
} else { /* this is block of bitmap */
BUG_ON(incore != NULL);
mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
group, folio->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
ext4_lock_group(sb, group);
memcpy(data, bitmap, blocksize);
/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
ext4_unlock_group(sb, group);
/* set incore so that the buddy information can be * generated using this
*/
incore = data;
}
}
folio_mark_uptodate(folio);
out: if (bh) { for (i = 0; i < groups_per_page; i++)
brelse(bh[i]); if (bh != &bhs)
kfree(bh);
} return err;
}
/* * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap * are on the same page e4b->bd_buddy_folio is NULL and return value is 0.
*/ staticint ext4_mb_get_buddy_page_lock(struct super_block *sb,
ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{ struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; struct folio *folio;
blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks.
*/
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio);
BUG_ON(folio->mapping != inode->i_mapping);
e4b->bd_bitmap_folio = folio;
e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);
if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0;
}
/* blocks_per_page == 1, hence we need another page for the buddy */
folio = __filemap_get_folio(inode->i_mapping, block + 1,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio);
BUG_ON(folio->mapping != inode->i_mapping);
e4b->bd_buddy_folio = folio; return 0;
}
staticvoid ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{ if (e4b->bd_bitmap_folio) {
folio_unlock(e4b->bd_bitmap_folio);
folio_put(e4b->bd_bitmap_folio);
} if (e4b->bd_buddy_folio) {
folio_unlock(e4b->bd_buddy_folio);
folio_put(e4b->bd_buddy_folio);
}
}
/* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine!
*/ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0;
might_sleep();
mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED;
/* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed.
*/
ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything
*/ goto err;
}
folio = e4b.bd_bitmap_folio;
ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) {
ret = -EIO; goto err;
}
if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy
*/
ret = 0; goto err;
} /* init buddy cache */
folio = e4b.bd_buddy_folio;
ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) {
ret = -EIO; goto err;
}
err:
ext4_mb_put_buddy_page_lock(&e4b); return ret;
}
/* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine!
*/ static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{ int blocks_per_page; int block; int pnum; int poff; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache;
might_sleep();
mb_debug(sb, "load group %u\n", group);
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection
*/
ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret;
}
/* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks.
*/
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
/* Avoid locking the folio in the fast path ... */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) /* * drop the folio reference and try * to get the folio with lock. If we * are not uptodate that implies * somebody just created the folio but * is yet to initialize it. So * wait for it to initialize.
*/
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */
folio_unlock(folio);
ret = -EINVAL; goto err;
} if (!folio_test_uptodate(folio)) {
ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) {
folio_unlock(folio); goto err;
}
mb_cmp_bitmaps(e4b, folio_address(folio) +
(poff * sb->s_blocksize));
}
folio_unlock(folio);
}
} if (IS_ERR(folio)) {
ret = PTR_ERR(folio); goto err;
} if (!folio_test_uptodate(folio)) {
ret = -EIO; goto err;
}
while (order <= e4b->bd_blkbits + 1) {
bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order;
}
order++;
} return 0;
}
staticvoid mb_clear_bits(void *bm, int cur, int len)
{
__u32 *addr;
len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */
addr = bm + (cur >> 3);
*addr = 0;
cur += 32; continue;
}
mb_clear_bit(cur, bm);
cur++;
}
}
/* clear bits in given range * will return first found zero bit if any, -1 otherwise
*/ staticint mb_test_and_clear_bits(void *bm, int cur, int len)
{
__u32 *addr; int zero_bit = -1;
len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */
addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1)
zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
*addr = 0;
cur += 32; continue;
} if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
zero_bit = cur;
cur++;
}
return zero_bit;
}
void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */
addr = bm + (cur >> 3);
*addr = 0xffffffff;
cur += 32; continue;
}
mb_set_bit(cur, bm);
cur++;
}
}
staticvoid mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
{ int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max);
while (buddy) { void *buddy2;
/* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy.
*/
/* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same.
*/
if (first & 1)
e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1))
e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break;
order++;
buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) {
mb_clear_bits(buddy, first, last - first + 1);
e4b->bd_info->bb_counters[order - 1] += last - first + 1; break;
}
first >>= 1;
last >>= 1;
buddy = buddy2;
}
}
staticvoid mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count)
{ int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb;
if (WARN_ON(count == 0)) return;
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return;
/* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately.
*/ if (first & 1) {
first += !left_is_free;
e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
} if (!(last & 1)) {
last -= !right_is_free;
e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
}
if (first <= last)
mb_buddy_mark_free(e4b, first >> 1, last >> 1);
if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
block, order, needed, ex->fe_group, ex->fe_start,
ex->fe_len, ex->fe_logical);
ex->fe_len = 0;
ex->fe_start = 0;
ex->fe_group = 0;
} return ex->fe_len;
}
staticint mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
{ int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end;
/* * Must be called under group lock!
*/ staticvoid ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b)
{ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret;
/* preallocation can change ac_b_ex, thus we store actually
* allocated blocks for history */
ac->ac_f_ex = ac->ac_b_ex;
ac->ac_status = AC_STATUS_FOUND;
ac->ac_tail = ret & 0xffff;
ac->ac_buddy = ret >> 16;
/* * take the page reference. We want the page to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.26 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.