staticinlinebool has_unwritten_metadata(struct btrfs_block_group *block_group)
{ /* The meta_write_pointer is available only on the zoned setup. */ if (!btrfs_is_zoned(block_group->fs_info)) returnfalse;
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) returnfalse;
/* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress * * Should be called with balance_lock held
*/ static u64 get_restripe_target(conststruct btrfs_fs_info *fs_info, u64 flags)
{ conststruct btrfs_balance_control *bctl = fs_info->balance_ctl;
u64 target = 0;
/* * @flags: available profiles in extended format (see ctree.h) * * Return reduced profile in chunk format. If profile changing is in progress * (either running or paused) picks the target profile (if it's already * available), otherwise falls back to plain reducing.
*/ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 num_devices = fs_info->fs_devices->rw_devices;
u64 target;
u64 raid_type;
u64 allowed = 0;
/* * See if restripe for this chunk_type is in progress, if so try to * reduce to the target profile
*/
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags); if (target) {
spin_unlock(&fs_info->balance_lock); return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (num_devices >= btrfs_raid_array[raid_type].devs_min)
allowed |= btrfs_raid_array[raid_type].bg_flag;
}
allowed &= flags;
void btrfs_put_block_group(struct btrfs_block_group *cache)
{ if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0); /* * If there was a failure to cleanup a log tree, very likely due * to an IO failure on a writeback attempt of one or more of its * extent buffers, we could not do proper (and cheap) unaccounting * of their reserved space, so don't warn on reserved > 0 in that * case.
*/ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
!BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
WARN_ON(cache->reserved > 0);
/* * A block_group shouldn't be on the discard_list anymore. * Remove the block_group from the discard_list to prevent us * from causing a panic due to NULL pointer dereference.
*/ if (WARN_ON(!list_empty(&cache->discard_list)))
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
if (new_bg->start < exist_bg->start) return -1; if (new_bg->start > exist_bg->start) return 1; return 0;
}
/*
 * Insert a block group into the fs_info block group cache (an rbtree keyed
 * by the group's start offset).
 *
 * Returns 0 on success, or -EEXIST if a block group with the same start
 * offset is already present in the tree.
 */
static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct rb_node *duplicate;
	int ret = 0;

	ASSERT(block_group->length != 0);

	write_lock(&fs_info->block_group_cache_lock);
	duplicate = rb_find_add_cached(&block_group->cache_node,
				       &fs_info->block_group_cache_tree,
				       btrfs_bg_start_cmp);
	if (duplicate)
		ret = -EEXIST;
	write_unlock(&fs_info->block_group_cache_lock);

	return ret;
}
/* * This will return the block group at or after bytenr if contains is 0, else * it will return the block group that contains the bytenr
*/ staticstruct btrfs_block_group *block_group_cache_tree_search( struct btrfs_fs_info *info, u64 bytenr, int contains)
{ struct btrfs_block_group *cache, *ret = NULL; struct rb_node *n;
u64 end, start;
read_lock(&info->block_group_cache_lock);
n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
end = cache->start + cache->length - 1;
start = cache->start;
if (bytenr < start) { if (!contains && (!ret || start < ret->start))
ret = cache;
n = n->rb_left;
} elseif (bytenr > start) { if (contains && bytenr <= end) {
ret = cache; break;
}
n = n->rb_right;
} else {
ret = cache; break;
}
} if (ret)
btrfs_get_block_group(ret);
read_unlock(&info->block_group_cache_lock);
return ret;
}
/*
 * Return the first block group whose start offset is at or after @bytenr,
 * or NULL if there is none. The caller must drop the returned reference
 * with btrfs_put_block_group().
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}
/*
 * Return the block group whose range contains @bytenr, or NULL if there is
 * none. The caller must drop the returned reference with
 * btrfs_put_block_group().
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}
/* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { const u64 next_bytenr = cache->start + cache->length;
/*
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info: The filesystem information object.
 * @bytenr:  Logical start address of the extent.
 *
 * On success this bumps the NOCOW writer count of the block group that
 * contains @bytenr and returns that block group; the caller must balance it
 * with a later call to btrfs_dec_nocow_writers(). Returns NULL when no block
 * group contains @bytenr or when the group is read-only (in which case a
 * NOCOW write is not possible).
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool ro_seen;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return NULL;

	spin_lock(&bg->lock);
	ro_seen = bg->ro;
	if (!ro_seen)
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	if (ro_seen) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	/* No put on the block group; btrfs_dec_nocow_writers() does it. */
	return bg;
}
/* * Decrement the number of NOCOW writers in a block group. * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call. Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and * relocation. * * After this call, the caller should not use the block group anymore. It it wants * to use it, then it should get a reference on it before calling this function.
*/ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{ if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) return;
/* * Our block group is read only but before we set it to read only, * some task might have had allocated an extent from it already, but it * has not yet created a respective ordered extent (and added it to a * root's list of ordered extents). * Therefore wait for any task currently allocating extents, since the * block group's reservations counter is incremented while a read lock * on the groups' semaphore is held and decremented after releasing * the read access on that semaphore and creating the ordered extent.
*/
down_write(&space_info->groups_sem);
up_write(&space_info->groups_sem);
/* Drop a reference on a caching control, freeing it on the last put. */
static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (!refcount_dec_and_test(&ctl->count))
		return;

	kfree(ctl);
}
/* * When we wait for progress in the block group caching, its because our * allocation attempt failed at least once. So, we must sleep and let some * progress happen before we try again. * * This function will sleep at least once waiting for new free space to show * up, and then it will check the block group free space numbers for our min * num_bytes. Another option is to have it go ahead and look in the rbtree for * a free extent of a given size, but this is a good start. * * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using * any of the information in this block group.
*/ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{ struct btrfs_caching_control *caching_ctl; int progress;
caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return;
/* * We've already failed to allocate from this block group, so even if * there's enough space in the block group it isn't contiguous enough to * allow for an allocation, so wait for at least the next wakeup tick, * or for the thing to be done.
*/
progress = atomic_read(&caching_ctl->progress);
while (len > chunk) {
btrfs_remove_free_space(block_group, start, chunk);
start += step; if (len < step)
len = 0; else
len -= step;
}
} #endif
/* * Add a free space range to the in memory free space cache of a block group. * This checks if the range contains super block locations and any such * locations are not added to the free space cache. * * @block_group: The target block group. * @start: Start offset of the range. * @end: End offset of the range (exclusive). * @total_added_ret: Optional pointer to return the total amount of space * added to the block group's free space cache. * * Returns 0 on success or < 0 on error.
*/ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
u64 end, u64 *total_added_ret)
{ struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size; int ret;
if (total_added_ret)
*total_added_ret = 0;
while (start < end) { if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL)) break;
if (start < end) {
size = end - start;
ret = btrfs_add_free_space_async_trimmed(block_group, start,
size); if (ret) return ret; if (total_added_ret)
*total_added_ret += size;
}
return 0;
}
/* * Get an arbitrary extent item index / max_index through the block group * * @block_group the block group to sample from * @index: the integral step through the block group to grab from * @max_index: the granularity of the sampling * @key: return value parameter for the item we find * * Pre-conditions on indices: * 0 <= index <= max_index * 0 < max_index * * Returns: 0 on success, 1 if the search didn't yield a useful item, negative * error code on error.
*/ staticint sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group, int index, int max_index, struct btrfs_key *found_key)
{ struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root;
u64 search_offset;
u64 search_end = block_group->start + block_group->length;
BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0;
/* * Best effort attempt to compute a block group's size class while caching it. * * @block_group: the block group we are caching * * We cannot infer the size class while adding free space extents, because that * logic doesn't care about contiguous file extents (it doesn't differentiate * between a 100M extent and 100 contiguous 1M extents). So we need to read the * file extent items. Reading all of them is quite wasteful, because usually * only a handful are enough to give a good answer. Therefore, we just grab 5 of * them at even steps through the block group and pick the smallest size class * we see. Since size class is best effort, and not guaranteed in general, * inaccuracy is acceptable. * * To be more explicit about why this algorithm makes sense: * * If we are caching in a block group from disk, then there are three major cases * to consider: * 1. the block group is well behaved and all extents in it are the same size * class. * 2. the block group is mostly one size class with rare exceptions for last * ditch allocations * 3. the block group was populated before size classes and can have a totally * arbitrary mix of size classes. * * In case 1, looking at any extent in the block group will yield the correct * result. For the mixed cases, taking the minimum size class seems like a good * approximation, since gaps from frees will be usable to the size class. For * 2., a small handful of file extents is likely to yield the right answer. For * 3, we can either read every file extent, or admit that this is best effort * anyway and try to stay fast. * * Returns: 0 on success, negative error code on error.
*/ staticint load_block_group_size_class(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group)
{ struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i;
u64 min_size = block_group->length; enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; int ret;
if (!btrfs_block_group_should_use_size_class(block_group)) return 0;
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem); for (i = 0; i < 5; ++i) {
ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); if (ret < 0) goto out; if (ret > 0) continue;
min_size = min_t(u64, min_size, key.offset);
size_class = btrfs_calc_block_group_size_class(min_size);
} if (size_class != BTRFS_BG_SZ_NONE) {
spin_lock(&block_group->lock);
block_group->size_class = size_class;
spin_unlock(&block_group->lock);
}
out: return ret;
}
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
extent_root = btrfs_extent_root(fs_info, last);
#ifdef CONFIG_BTRFS_DEBUG /* * If we're fragmenting we don't want to make anybody think we can * allocate from this block group until we've had a chance to fragment * the free space.
*/ if (btrfs_should_fragment_free_space(block_group))
wakeup = false; #endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent * root to add free space. So we skip locking and search the commit * root, since its read-only
*/
path->skip_locking = 1;
path->search_commit_root = 1;
path->reada = READA_FORWARD;
ret = btrfs_add_new_free_space(block_group, last,
key.objectid, &space_added); if (ret) goto out;
total_found += space_added; if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
fs_info->nodesize; else
last = key.objectid + key.offset;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0; if (wakeup) {
atomic_inc(&caching_ctl->progress);
wake_up(&caching_ctl->wait);
}
}
}
path->slots[0]++;
}
ret = btrfs_add_new_free_space(block_group, last,
block_group->start + block_group->length,
NULL);
out: return ret;
}
load_block_group_size_class(caching_ctl, block_group); if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
ret = load_free_space_cache(block_group); if (ret == 1) {
ret = 0; goto done;
}
/* * We failed to load the space cache, set ourselves to * CACHE_STARTED and carry on.
*/
spin_lock(&block_group->lock);
block_group->cached = BTRFS_CACHE_STARTED;
spin_unlock(&block_group->lock);
wake_up(&caching_ctl->wait);
}
/* * If we are in the transaction that populated the free space tree we * can't actually cache from the free space tree as our commit root and * real root are the same, so we could change the contents of the blocks * while caching. Instead do the slow caching in this case, and after * the transaction has committed we will be safe.
*/ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = btrfs_load_free_space_tree(caching_ctl); else
ret = load_extent_tree_free(caching_ctl);
done:
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
#ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(block_group)) {
u64 bytes_used;
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out: if (wait && caching_ctl)
ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); if (caching_ctl)
btrfs_put_caching_control(caching_ctl);
write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits &= ~extra_flags;
write_sequnlock(&fs_info->profiles_lock);
}
/* * Clear incompat bits for the following feature(s): * * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group * in the whole filesystem * * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
*/ staticvoid clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{ bool found_raid56 = false; bool found_raid1c34 = false;
block_group = btrfs_lookup_block_group(fs_info, map->start); if (!block_group) return -ENOENT;
BUG_ON(!block_group->ro);
trace_btrfs_remove_block_group(block_group); /* * Free the reserved super bytes from this block group before * remove it.
*/
btrfs_free_excluded_extents(block_group);
btrfs_free_ref_tree_range(fs_info, block_group->start,
block_group->length);
index = btrfs_bg_flags_to_raid_index(block_group->flags);
factor = btrfs_bg_type_to_factor(block_group->flags);
/* make sure this block group isn't part of an allocation cluster */
cluster = &fs_info->data_alloc_cluster;
spin_lock(&cluster->refill_lock);
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
/* * make sure this block group isn't part of a metadata * allocation cluster
*/
cluster = &fs_info->meta_alloc_cluster;
spin_lock(&cluster->refill_lock);
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto out;
}
/* * get the inode first so any iput calls done for the io_list * aren't the final iput (no unlinks allowed now)
*/
inode = lookup_free_space_inode(block_group, path);
mutex_lock(&trans->transaction->cache_write_mutex); /* * Make sure our free space cache IO is done before removing the * free space inode
*/
spin_lock(&trans->transaction->dirty_bgs_lock); if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem); /* * we must use list_del_init so people can check to see if they * are still on the list after taking the semaphore
*/
list_del_init(&block_group->list); if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
clear_avail_alloc_bits(fs_info, block_group->flags);
}
up_write(&block_group->space_info->groups_sem);
clear_incompat_bg_bits(fs_info, block_group->flags); if (kobj) {
kobject_del(kobj);
kobject_put(kobj);
}
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
write_lock(&fs_info->block_group_cache_lock);
caching_ctl = btrfs_get_caching_control(block_group); if (!caching_ctl) { struct btrfs_caching_control *ctl;
if (caching_ctl) { /* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
btrfs_put_caching_control(caching_ctl);
}
/* * Remove the free space for the block group from the free space tree * and the block group's item from the extent tree before marking the * block group as removed. This is to prevent races with tasks that * freeze and unfreeze a block group, this task and another task * allocating a new block group - the unfreeze task ends up removing * the block group's extent map before the task calling this function * deletes the block group item from the extent tree, allowing for * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort).
*/
ret = btrfs_remove_block_group_free_space(trans, block_group); if (ret) goto out;
ret = remove_block_group_item(trans, path, block_group); if (ret < 0) goto out;
spin_lock(&block_group->lock); /* * Hitting this WARN means we removed a block group with an unwritten * region. It will cause "unable to find chunk map for logical" errors.
*/ if (WARN_ON(has_unwritten_metadata(block_group)))
btrfs_warn(fs_info, "block group %llu is removed before metadata write out",
block_group->start);
/* * At this point trimming or scrub can't start on this block group, * because we removed the block group from the rbtree * fs_info->block_group_cache_tree so no one can't find it anymore and * even if someone already got this block group before we removed it * from the rbtree, they have already incremented block_group->frozen - * if they didn't, for the trimming case they won't find any free space * entries because we already removed them all when we called * btrfs_remove_free_space_cache(). * * And we must not remove the chunk map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space * ranges from being reused for a new block group. This is needed to * avoid races with trimming and scrub. * * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is * completely transactionless, so while it is trimming a range the * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same * physical device locations unless we take this special care. * * There may also be an implicit trim operation if the file system * is mounted with -odiscard. The same protections must remain * in place until the extents have been discarded completely when * the transaction commit has completed.
*/
remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
if (remove_map)
btrfs_remove_chunk_map(fs_info, map);
out: /* Once for the lookup reference */
btrfs_put_block_group(block_group); if (remove_rsv)
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
btrfs_free_path(path); return ret;
}
/* * We need to reserve 3 + N units from the metadata space info in order * to remove a block group (done at btrfs_remove_chunk() and at * btrfs_remove_block_group()), which are used for: * * 1 unit for adding the free space inode's orphan (located in the tree * of tree roots). * 1 unit for deleting the block group item (located in the extent * tree). * 1 unit for deleting the free space item (located in tree of tree * roots). * N units for deleting N device extent items corresponding to each * stripe (located in the device tree). * * In order to remove a block group we also need to reserve units in the * system space info in order to update the chunk tree (update one or * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk().
*/
num_items = 3 + map->num_stripes;
btrfs_free_chunk_map(map);
/* * Mark block group @cache read-only, so later write won't happen to block * group @cache. * * If @force is not set, this function will only mark the block group readonly * if we have enough free space (1M) in other metadata/system block groups. * If @force is not set, this function will mark the block group readonly * without checking free space. * * NOTE: This function doesn't care if other block groups can contain all the * data in this block group. That check should be done by relocation routine, * not this function.
*/ staticint inc_block_group_ro(struct btrfs_block_group *cache, int force)
{ struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes; int ret = -ENOSPC;
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (cache->swap_extents) {
ret = -ETXTBSY; goto out;
}
if (cache->ro) {
cache->ro++;
ret = 0; goto out;
}
/* * Data never overcommits, even in mixed mode, so do just the straight * check of left over space in how much we have allocated.
*/ if (force) {
ret = 0;
} elseif (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
u64 sinfo_used = btrfs_space_info_used(sinfo, true);
/* * Here we make sure if we mark this bg RO, we still have enough * free space as buffer.
*/ if (sinfo_used + num_bytes <= sinfo->total_bytes)
ret = 0;
} else { /* * We overcommit metadata, so we need to do the * btrfs_can_overcommit check here, and we need to pass in * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of * leeway to allow us to mark this block group as read only.
*/ if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
BTRFS_RESERVE_NO_FLUSH))
ret = 0;
}
if (!ret) {
sinfo->bytes_readonly += num_bytes; if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
cache->zone_unusable = 0;
}
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock); if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false);
} return ret;
}
/* * Hold the unused_bg_unpin_mutex lock to avoid racing with * btrfs_finish_extent_commit(). If we are at transaction N, another * task might be running finish_extent_commit() for the previous * transaction N - 1, and have seen a range belonging to the block * group in pinned_extents before we were able to clear the whole block * group range from pinned_extents. This means that task can lookup for * the block group after we unpinned it from pinned_extents and removed * it, leading to an error at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) {
ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
EXTENT_DIRTY, NULL); if (ret) goto out;
}
ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
EXTENT_DIRTY, NULL);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans)
btrfs_put_transaction(prev_trans);
return ret == 0;
}
/* * Link the block_group to a list via bg_list. * * @bg: The block_group to link to the list. * @list: The list to link it to. * * Use this rather than list_add_tail() directly to ensure proper respect * to locking and refcounting. * * Returns: true if the bg was linked with a refcount bump and false otherwise.
*/ staticbool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
{ struct btrfs_fs_info *fs_info = bg->fs_info; bool added = false;
/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them.
*/ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
LIST_HEAD(retry_list); struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; constbool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); int ret = 0;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return;
if (btrfs_fs_closing(fs_info)) return;
/* * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex.
*/ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) return;
spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) {
u64 used; int trimming;
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
/* * Async discard moves the final block group discard to be prior * to the unused_bgs code path. Therefore, if it's not fully * trimmed, punt it back to the async discard lists.
*/ if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
!btrfs_is_free_space_trimmed(block_group)) {
trace_btrfs_skip_unused_block_group(block_group);
up_write(&space_info->groups_sem); /* Requeue if we failed because of async discard */
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group); goto next;
}
spin_lock(&space_info->lock);
spin_lock(&block_group->lock); if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group. * * Also bail out if this is the only block group for its * type, because otherwise we would lose profile * information from fs_info->avail_*_alloc_bits and the * next block group of this type would be created with a * "single" profile (even if we're in a raid fs) because * fs_info->avail_*_alloc_bits would be 0.
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem); goto next;
}
/* * The block group may be unused but there may be space reserved * accounting with the existence of that block group, that is, * space_info->bytes_may_use was incremented by a task but no * space was yet allocated from the block group by the task. * That space may or may not be allocated, as we are generally * pessimistic about space reservation for metadata as well as * for data when using compression (as we reserve space based on * the worst case, when data can't be compressed, and before * actually attempting compression, before starting writeback). * * So check if the total space of the space_info minus the size * of this block group is less than the used space of the * space_info - if that's the case, then it means we have tasks * that might be relying on the block group in order to allocate * extents, and add back the block group to the unused list when * we finish, so that we retry later in case no tasks ended up * needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true); if ((space_info->total_bytes - block_group->length < used &&
block_group->zone_unusable < block_group->length) ||
has_unwritten_metadata(block_group)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the * fs_info->unused_bgs list.
*/
btrfs_link_bg_list(block_group, &retry_list);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem); if (ret < 0) {
ret = 0; goto next;
}
ret = btrfs_zone_finish(block_group); if (ret < 0) {
btrfs_dec_block_group_ro(block_group); if (ret == -EAGAIN) {
btrfs_link_bg_list(block_group, &retry_list);
ret = 0;
} goto next;
}
/*
 * NOTE(review): this span is the interior of the unused-block-group
 * deletion loop; the enclosing function's head lies before this window.
 * Extraction has fused multiple statements onto shared lines — code is
 * preserved byte-for-byte below, only annotations are added.
 */
/* * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction.
*/
trans = btrfs_start_trans_remove_block_group(fs_info,
block_group->start); if (IS_ERR(trans)) {
/* Failed to join a transaction: undo the RO marking and move on. */
btrfs_dec_block_group_ro(block_group);
ret = PTR_ERR(trans); goto next;
}
/* * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore.
*/ if (!clean_pinned_extents(trans, block_group)) {
btrfs_dec_block_group_ro(block_group); goto end_trans;
}
/* * At this point, the block_group is read only and should fail * new allocations. However, btrfs_finish_extent_commit() can * cause this block_group to be placed back on the discard * lists because now the block_group isn't fully discarded. * Bail here and try again later after discarding everything.
*/
spin_lock(&fs_info->discard_ctl.lock); if (!list_empty(&block_group->discard_list)) {
spin_unlock(&fs_info->discard_ctl.lock);
btrfs_dec_block_group_ro(block_group);
/* Re-queue for async discard; deletion is retried later. */
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group); goto end_trans;
}
spin_unlock(&fs_info->discard_ctl.lock);
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
/*
 * NOTE(review): the statements that actually reset the pinned counters
 * under these two locks appear to have been lost in extraction — the
 * locks are taken here but no update between them is visible. Confirm
 * against the original file.
 */
/* * The normal path here is an unused block group is passed here, * then trimming is handled in the transaction commit path. * Async discard interposes before this to do the trimming * before coming down the unused block group path as trimming * will no longer be done later in the transaction commit path.
*/ if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async;
/* * DISCARD can flip during remount. On zoned filesystems, we * need to reset sequential-required zones.
*/
trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */ if (trimming)
btrfs_freeze_block_group(block_group);
/* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong.
*/
ret = btrfs_remove_chunk(trans, block_group->start);
if (ret) { if (trimming)
btrfs_unfreeze_block_group(block_group); goto end_trans;
}
/* * If we're not mounted with -odiscard, we can just forget * about this block group. Otherwise we'll need to wait * until transaction commit to do the actual discard.
*/ if (trimming) {
spin_lock(&fs_info->unused_bgs_lock); /* * A concurrent scrub might have added us to the list * fs_info->unused_bgs, so use a list_move operation * to add the block group to the deleted_bgs list.
*/
list_move(&block_group->bg_list,
&trans->transaction->deleted_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
/* Hold a ref until the deferred discard at commit time is done. */
btrfs_get_block_group(block_group);
}
end_trans:
btrfs_end_transaction(trans);
next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
/* Loop done: put back any groups we decided to retry later. */
list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock); return;
/*
 * NOTE(review): the enclosing function's signature was dropped by
 * extraction — presumably btrfs_mark_bg_unused(); confirm against the
 * original file. The body puts @bg on fs_info->unused_bgs so the
 * cleaner can delete it.
 */
spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) {
/* Not on any list yet: take a ref that the list now owns. */
btrfs_get_block_group(bg);
trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
} elseif (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. */
trace_btrfs_add_unused_block_group(bg);
list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
/* * We want block groups with a low number of used bytes to be in the beginning * of the list, so they will get reclaimed first.
*/ staticint reclaim_bgs_cmp(void *unused, conststruct list_head *a, conststruct list_head *b)
{ conststruct btrfs_block_group *bg1, *bg2;
/* * Some other task may be updating the ->used field concurrently, but it * is not serious if we get a stale value or load/store tearing issues, * as sorting the list of block groups to reclaim is not critical and an * occasional imperfect order is ok. So silence KCSAN and avoid the * overhead of locking or any other synchronization.
*/ return data_race(bg1->used > bg2->used);
}
/*
 * NOTE(review): tail of a boolean predicate whose head is not visible
 * here — it compares an old and a new usage value against a threshold
 * and only reports "reclaim" when usage just crossed below it. The
 * fused "returnfalse"/"returntrue" tokens look like extraction damage;
 * confirm against the original file.
 */
/* * If we were below the threshold before don't reclaim, we are likely a * brand new block group and we don't want to relocate new block groups.
*/ if (old_val < thresh_bytes) returnfalse; if (new_val >= thresh_bytes) returnfalse; returntrue;
}
/*
 * NOTE(review): this span is the body of the block-group reclaim worker
 * (presumably btrfs_reclaim_bgs_work — its signature and the local
 * declarations for retry_list, bg and space_info were lost in
 * extraction; confirm against the original file). Code is preserved
 * byte-for-byte; only annotations are added.
 */
/* Bail out early if the filesystem is not fully open or is going away. */
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return;
if (btrfs_fs_closing(fs_info)) return;
if (!btrfs_should_reclaim(fs_info)) return;
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
sb_end_write(fs_info->sb); return;
}
/* * Long running balances can keep us blocked here for eternity, so * simply skip reclaim if we're unable to get the mutex.
*/ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb); return;
}
spin_lock(&fs_info->unused_bgs_lock); /* * Sort happens under lock because we can't simply splice it and sort. * The block groups might still be in use and reachable via bg_list, * and their presence in the reclaim_bgs list must be preserved.
*/
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) {
u64 used;
u64 reserved; int ret = 0;
/*
 * NOTE(review): the statements that pop the first entry off
 * fs_info->reclaim_bgs into "bg", fetch its space_info and drop
 * unused_bgs_lock appear to be missing here — extraction damage;
 * confirm against the original file.
 */
/* Don't race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&space_info->lock);
spin_lock(&bg->lock); if (bg->reserved || bg->pinned || bg->ro) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do * the ro check in case balance is currently acting on * this block group.
*/
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem); goto next;
} if (bg->used == 0) { /* * It is possible that we trigger relocation on a block * group as its extents are deleted and it first goes * below the threshold, then shortly after goes empty. * * In this case, relocating it does delete it, but has * some overhead in relocation specific metadata, looking * for the non-existent extents and running some extra * transactions, which we can avoid by using one of the * other mechanisms for dealing with empty block groups.
*/ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_mark_bg_unused(bg);
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem); goto next;
} /* * The block group might no longer meet the reclaim condition by * the time we get around to reclaiming it, so to avoid * reclaiming overly full block_groups, skip reclaiming them. * * Since the decision making process also depends on the amount * being freed, pass in a fake giant value to skip that extra * check, which is more meaningful when adding to the list in * the first place.
*/ if (!should_reclaim_block_group(bg, bg->length)) {
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem); goto next;
}
/*
 * NOTE(review): bg->lock and space_info->lock are apparently not
 * released on this path before the cleaner-sleep check below —
 * likely more dropped lines; confirm against the original file.
 */
/* * Get out fast, in case we're read-only or unmounting the * filesystem. It is OK to drop block groups from the list even * for the read-only case. As we did sb_start_write(), * "mount -o remount,ro" won't happen and read-only filesystem * means it is forced read-only due to a fatal error. So, it * never gets back to read-write to let us reclaim again.
*/ if (btrfs_need_cleaner_sleep(fs_info)) {
up_write(&space_info->groups_sem); goto next;
}
/* Mark the group read-only so no new allocations land in it. */
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem); if (ret < 0) goto next;
/* * The amount of bytes reclaimed corresponds to the sum of the * "used" and "reserved" counters. We have set the block group * to RO above, which prevents reservations from happening but * we may have existing reservations for which allocation has * not yet been done - btrfs_update_block_group() was not yet * called, which is where we will transfer a reserved extent's * size from the "reserved" counter to the "used" counter - this * happens when running delayed references. When we relocate the * chunk below, relocation first flushes dellaloc, waits for * ordered extent completion (which is where we create delayed * references for data extents) and commits the current * transaction (which runs delayed references), and only after * it does the actual work to move extents out of the block * group. So the reported amount of reclaimed bytes is * effectively the sum of the 'used' and 'reserved' counters.
*/
spin_lock(&bg->lock);
used = bg->used;
reserved = bg->reserved;
spin_unlock(&bg->lock);
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start, false); if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
/* Relocation failed: nothing was reclaimed from this group. */
used = 0;
reserved = 0;
spin_lock(&space_info->lock);
space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim))
space_info->periodic_reclaim_ready = false;
spin_unlock(&space_info->lock);
}
/* Account this attempt in the per-space-info reclaim statistics. */
spin_lock(&space_info->lock);
space_info->reclaim_count++;
space_info->reclaim_bytes += used;
space_info->reclaim_bytes += reserved;
spin_unlock(&space_info->lock);
next: if (ret && !READ_ONCE(space_info->periodic_reclaim))
btrfs_link_bg_list(bg, &retry_list);
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock); /* * Reclaiming all the block groups in the list can take really * long. Prioritize cleaning up unused block groups.
*/
btrfs_delete_unused_bgs(fs_info); /* * If we are interrupted by a balance, we can just bail out. The * cleaner thread restart again if necessary.
*/ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) goto end;
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
end:
/* Groups that failed get another chance on a later worker run. */
spin_lock(&fs_info->unused_bgs_lock);
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
}
/*
 * NOTE(review): tail fragment of a function whose head is not visible —
 * it links @bg onto fs_info->reclaim_bgs and traces the addition when
 * the link succeeds; confirm the enclosing function in the original file.
 */
if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
trace_btrfs_add_reclaim_block_group(bg);
}
staticint read_bg_from_eb(struct btrfs_fs_info *fs_info, conststruct btrfs_key *key, conststruct btrfs_path *path)
{ struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; struct extent_buffer *leaf; int slot;
u64 flags; int ret = 0;
slot = path->slots[0];
leaf = path->nodes[0];
map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset); if (!map) {
btrfs_err(fs_info, "logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset); return -ENOENT;
}
if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN; goto out_free_map;
}
if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
(BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}
/*
 * NOTE(review): body of a helper recording which RAID profiles are in
 * use per block group type; its signature and the computation of
 * "extra_flags" from "flags" are not visible here — extraction damage;
 * confirm against the original file.
 */
/* Profile bitmaps are read under the seqlock, so publish under it too. */
write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits |= extra_flags;
write_sequnlock(&fs_info->profiles_lock);
}
/* * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group * @physical: physical address to map to logical addresses
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.9 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.