/* * If we didn't get into open_ctree our allocated_ebs will not be * initialized, so just skip this.
*/ if (!fs_info->allocated_ebs.next) return;
WARN_ON(!list_empty(&fs_info->allocated_ebs));
spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list);
btrfs_err(fs_info, "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
} #else #define btrfs_leak_debug_add_eb(eb) do {} while (0) #define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif
/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	/* The bio currently being assembled, NULL when there is none. */
	struct btrfs_bio *bbio;

	/* Last byte contained in bbio + 1 . */
	loff_t next_file_offset;

	/*
	 * Compression type of the ranges being added to @bbio.  A change of
	 * type forces the current bio to be submitted first.
	 */
	enum btrfs_compression_type compress_type;

	/*
	 * Bytes left before the ordered extent boundary.  U32_MAX means no
	 * ordered extent is being tracked and the value must be ignored.
	 */
	u32 len_to_oe_boundary;

	/*
	 * Block layer operation and flags.
	 * NOTE(review): the users of this field are not visible in this part
	 * of the file - confirm the exact semantics against alloc_new_bio().
	 */
	blk_opf_t opf;

	/* Completion handler, e.g. end_bbio_data_read() for data reads. */
	btrfs_bio_end_io_t end_io_func;

	/* Writeback control, may be NULL (no cgroup owner accounting then). */
	struct writeback_control *wbc;

	/*
	 * The sectors of the page which are going to be submitted by
	 * extent_writepage_io().
	 * This is to avoid touching ranges covered by compression/inline.
	 */
	unsigned long submit_bitmap;

	/*
	 * Readahead control, may be NULL.  When set, reading a compressed
	 * extent may expand the readahead window.
	 */
	struct readahead_control *ractl;

	/*
	 * The start offset of the last used extent map by a read operation.
	 *
	 * This is for proper compressed read merge.
	 * U64_MAX means we are starting the read and have made no progress yet.
	 *
	 * The current btrfs_bio_is_contig() only uses disk_bytenr as
	 * the condition to check if the read can be merged with previous
	 * bio, which is not correct. E.g. two file extents pointing to the
	 * same extent but with different offset.
	 *
	 * So here we need to do extra checks to only merge reads that are
	 * covered by the same extent map.
	 * Just extent_map::start will be enough, as they are unique
	 * inside the same inode.
	 */
	u64 last_em_start;
};
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
}
/*
 * Submit or fail the current bio in the bio_ctrl structure.
 *
 * @bio_ctrl: bio assembling state; nothing is done if it holds no bio
 * @ret:      0 to submit the bio, a negative errno to fail it instead
 *
 * On the failure path the bio is completed with the block status matching
 * @ret and @bio_ctrl->bbio is cleared, as the end_io handler owns it from
 * then on.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
	struct btrfs_bio *bbio = bio_ctrl->bbio;

	if (!bbio)
		return;

	if (!ret) {
		submit_one_bio(bio_ctrl);
		return;
	}

	/* Only negative errno values are valid failure codes. */
	ASSERT(ret < 0);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
	/* The bio is owned by the end_io handler now */
	bio_ctrl->bbio = NULL;
}
/*
 * Create the slab cache used for struct extent_buffer allocations.
 *
 * Return: 0 on success, -ENOMEM if the cache could not be created.
 */
int __init extent_buffer_init_cachep(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
						sizeof(struct extent_buffer),
						0, 0, NULL);

	return extent_buffer_cache ? 0 : -ENOMEM;
}
/*
 * Tear down the extent buffer slab cache.
 *
 * The RCU barrier comes first so that every delayed RCU free has been
 * flushed before the cache is destroyed.
 */
void __cold extent_buffer_free_cachep(void)
{
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
}
/* * Find and lock a contiguous range of bytes in the file marked as delalloc, no * more than @max_bytes. * * @start: The original start bytenr to search. * Will store the extent range start bytenr. * @end: The original end bytenr of the search range * Will store the extent range end bytenr. * * Return true if we find a delalloc range which starts inside the original * range, and @start/@end will store the delalloc range start/end. * * Return false if we can't find any delalloc range which starts inside the * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct folio *locked_folio,
u64 *start, u64 *end)
{ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; /* The sanity tests may not set a valid fs_info. */
u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end; bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0;
/* Caller should pass a valid @end to indicate the search range end */
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
ASSERT(!(orig_start >= folio_end(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again: /* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
/* * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can * return early without handling any dirty ranges.
*/
ASSERT(max_bytes >= fs_info->sectorsize);
/* @delalloc_end can be -1, never go beyond @orig_end */
*end = min(delalloc_end, orig_end);
btrfs_free_extent_state(cached_state); returnfalse;
}
/* * start comes from the offset of locked_folio. We have to lock * folios in order, so we can't process delalloc bytes before * locked_folio
*/ if (delalloc_start < *start)
delalloc_start = *start;
/* * make sure to limit the number of folios we try to lock down
*/ if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
/* step two, lock all the folios after the folio that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* * Some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching.
*/
btrfs_free_extent_state(cached_state);
cached_state = NULL; if (!loops) {
max_bytes = fs_info->sectorsize;
loops = 1; goto again;
} else {
found = false; goto out_failed;
}
}
/* step three, lock the state bits for the whole range */
btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio); else
btrfs_folio_end_lock(fs_info, folio, start, len);
}
/* * After a write IO is done, we need to: * * - clear the uptodate bits on error * - clear the writeback bits in the extent tree for the range * - folio_end_writeback() if there is no more pending io for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO.
*/ staticvoid end_bbio_data_write(struct btrfs_bio *bbio)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize;
/* * After a data read IO is done, we need to: * * - clear the uptodate bits on error * - set the uptodate bits if things worked * - set the folio up to date if all extents in the tree are uptodate * - clear the lock bit in the extent tree * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO.
*/ staticvoid end_bbio_data_read(struct btrfs_bio *bbio)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi;
if (likely(uptodate)) {
u64 end = start + fi.length - 1;
loff_t i_size = i_size_read(inode);
/* * Zero out the remaining part if this range straddles * i_size. * * Here we should only zero the range inside the folio, * not touch anything else. * * NOTE: i_size is exclusive while end is inclusive and * folio_contains() takes PAGE_SIZE units.
*/ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
zero_start;
/* Update page status and unlock. */
end_folio_read(folio, uptodate, start, fi.length);
}
bio_put(bio);
}
/* * Populate every free slot in a provided array with folios using GFP_NOFS. * * @nr_folios: number of folios to allocate * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * * Return: 0 if all folios were able to be allocated; * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed
*/ int btrfs_alloc_folio_array(unsignedint nr_folios, struct folio **folio_array)
{ for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue;
folio_array[i] = folio_alloc(GFP_NOFS, 0); if (!folio_array[i]) goto error;
} return 0;
error: for (int i = 0; i < nr_folios; i++) { if (folio_array[i])
folio_put(folio_array[i]);
} return -ENOMEM;
}
/* * Populate every free slot in a provided array with pages, using GFP_NOFS. * * @nr_pages: number of pages to allocate * @page_array: the array to fill with pages; any existing non-null entries in * the array will be skipped * @nofail: whether using __GFP_NOFAIL flag * * Return: 0 if all pages were able to be allocated; * -ENOMEM otherwise, the partially allocated pages would be freed and * the array slots zeroed
*/ int btrfs_alloc_page_array(unsignedint nr_pages, struct page **page_array, bool nofail)
{ const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; unsignedint allocated;
for (allocated = 0; allocated < nr_pages;) { unsignedint last = allocated;
allocated = alloc_pages_bulk(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) {
__free_page(page_array[i]);
page_array[i] = NULL;
} return -ENOMEM;
}
} return 0;
}
/*
 * Populate needed folios for the extent buffer.
 *
 * For now, the folios populated are always in order 0 (aka, single page).
 *
 * @eb:     extent buffer whose folios[] array is filled
 * @nofail: passed through to btrfs_alloc_page_array()
 *
 * Return: 0 on success, or the negative error from btrfs_alloc_page_array(),
 *         in which case @eb is left untouched.
 */
static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
{
	struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
	const int num_pages = num_extent_pages(eb);
	int ret;

	/* Allocate into a local array first so a failure never touches @eb. */
	ret = btrfs_alloc_page_array(num_pages, page_array, nofail);
	if (ret < 0)
		return ret;

	for (int i = 0; i < num_pages; i++)
		eb->folios[i] = page_folio(page_array[i]);

	/* Every folio is a single page, record that geometry in the eb. */
	eb->folio_size = PAGE_SIZE;
	eb->folio_shift = PAGE_SHIFT;
	return 0;
}
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* * For compression, all IO should have its logical bytenr set * to the starting bytenr of the compressed extent.
*/ return bio->bi_iter.bi_sector == sector;
}
/* * To merge into a bio both the disk sector and the logical offset in * the file need to be contiguous.
*/ return bio_ctrl->next_file_offset == file_offset &&
bio_end_sector(bio) == sector;
}
/* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have * to always be set on the last added/replaced device. * This is a bit odd but has been like that for a long time.
*/
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
}
}
/*
 * Add a range of a folio to the bio being assembled in @bio_ctrl.
 *
 * @bio_ctrl:    bio assembling state
 * @disk_bytenr: logical bytenr where the IO will be
 * @folio:       folio to add to the bio
 * @size:        portion of the folio that we want to add
 * @pg_offset:   offset of the range inside @folio, also used to check whether
 *               we are adding a range contiguous to the previous one
 *
 * This will either add the range into the existing @bio_ctrl->bbio, or
 * allocate a new one in @bio_ctrl->bbio.  The range may end up split over
 * several bios when a bio gets full or an ordered extent boundary is hit.
 */
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
				u64 disk_bytenr, struct folio *folio,
				size_t size, unsigned long pg_offset)
{
	struct btrfs_inode *inode = folio_to_inode(folio);
	loff_t file_offset = folio_pos(folio) + pg_offset;

	/* A non-contiguous range cannot be merged, flush the current bio. */
	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
		submit_one_bio(bio_ctrl);

	do {
		u32 len = size;

		/* Allocate new bio if needed */
		if (!bio_ctrl->bbio)
			alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);

		/* Cap to the current ordered extent boundary if there is one. */
		if (len > bio_ctrl->len_to_oe_boundary) {
			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
			ASSERT(is_data_inode(inode));
			len = bio_ctrl->len_to_oe_boundary;
		}

		if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
			/* bio full: move on to a new one */
			submit_one_bio(bio_ctrl);
			continue;
		}

		bio_ctrl->next_file_offset += len;

		if (bio_ctrl->wbc)
			wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);

		/*
		 * Advance past the part that was just queued.  Without this
		 * the loop would never terminate and would keep adding the
		 * same range of the folio over and over.
		 */
		size -= len;
		pg_offset += len;
		disk_bytenr += len;
		file_offset += len;

		/*
		 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
		 * sector aligned.  alloc_new_bio() then sets it to the end of
		 * our ordered extent for writes into zoned devices.
		 *
		 * When len_to_oe_boundary is tracking an ordered extent, we
		 * trust the ordered extent code to align things properly, and
		 * the check above to cap our write to the ordered extent
		 * boundary is correct.
		 *
		 * When len_to_oe_boundary is U32_MAX, the cap above would
		 * result in a 4095 byte IO for the last folio right before
		 * we hit the bio limit of UINT_MAX.  bio_add_folio() has all
		 * the checks required to make sure we don't overflow the bio,
		 * and we should just ignore len_to_oe_boundary completely
		 * unless we're using it to track an ordered extent.
		 *
		 * It's pretty hard to make a bio sized U32_MAX, but it can
		 * happen when the page cache is able to feed us contiguous
		 * folios for large extents.
		 */
		if (bio_ctrl->len_to_oe_boundary != U32_MAX)
			bio_ctrl->len_to_oe_boundary -= len;

		/* Ordered extent boundary: move on to a new bio. */
		if (bio_ctrl->len_to_oe_boundary == 0)
			submit_one_bio(bio_ctrl);
	} while (size);
}
/*
 * Attach the private data of an extent buffer folio.
 *
 * @eb:       extent buffer the folio belongs to
 * @folio:    folio to attach private data to
 * @prealloc: preallocated folio state for the subpage case; it is either
 *            attached to @folio or freed when @folio already has private
 *            data.  Not touched in the non-subpage case.
 *
 * Return: 0 on success, or the error returned by btrfs_attach_folio_state()
 *         when a new folio state has to be allocated.
 */
static int attach_extent_buffer_folio(struct extent_buffer *eb,
				      struct folio *folio,
				      struct btrfs_folio_state *prealloc)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	/*
	 * If the page is mapped to btree inode, we should hold the private
	 * lock to prevent race.
	 * For cloned or dummy extent buffers, their pages are not mapped and
	 * will not race with any other ebs.
	 */
	if (folio->mapping)
		lockdep_assert_held(&folio->mapping->i_private_lock);

	/* Non-subpage metadata: the folio private points straight to the eb. */
	if (!btrfs_meta_is_subpage(fs_info)) {
		if (folio_test_private(folio))
			WARN_ON(folio_get_private(folio) != eb);
		else
			folio_attach_private(folio, eb);
		return 0;
	}

	/* Already mapped, just free prealloc */
	if (folio_test_private(folio)) {
		btrfs_free_folio_state(prealloc);
		return 0;
	}

	/* Has preallocated memory for subpage */
	if (prealloc) {
		folio_attach_private(folio, prealloc);
		return 0;
	}

	/* Do new allocation to attach subpage */
	return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
}
int set_folio_extent_mapped(struct folio *folio)
{ struct btrfs_fs_info *fs_info;
ASSERT(folio->mapping);
if (folio_test_private(folio)) return 0;
fs_info = folio_to_fs_info(folio);
if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
if (zero_offset)
folio_zero_range(folio, zero_offset,
folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio); for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; unsignedlong pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) {
folio_zero_range(folio, pg_offset, end - cur + 1);
end_folio_read(folio, true, cur, end - cur + 1); break;
} if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
end_folio_read(folio, true, cur, blocksize); continue;
}
em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em);
}
extent_offset = cur - em->start;
BUG_ON(btrfs_extent_map_end(em) <= cur);
BUG_ON(end < cur);
compress_type = btrfs_extent_map_compression(em);
/* * Only expand readahead for extents which are already creating * the pages anyway in add_ra_bio_pages, which is compressed * extents in the non subpage case.
*/ if (bio_ctrl->ractl &&
!btrfs_is_subpage(fs_info, folio) &&
compress_type != BTRFS_COMPRESS_NONE)
btrfs_readahead_expand(bio_ctrl->ractl, em);
/* * If we have a file range that points to a compressed extent * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a * single bio to populate the folios for the 2 ranges because * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout * [0 - 8K] [8K - 24K] * | | * | | * points to extent X, points to extent X, * offset 4K, length of 8K offset 0, length 16K * * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the * uncompressed data of our extent and different lengths. This * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent).
*/ if (compress_type != BTRFS_COMPRESS_NONE &&
bio_ctrl->last_em_start != U64_MAX &&
bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
bio_ctrl->last_em_start = em->start;
btrfs_free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) {
folio_zero_range(folio, pg_offset, blocksize);
end_folio_read(folio, true, cur, blocksize); continue;
} /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) {
end_folio_read(folio, true, cur, blocksize); continue;
}
if (bio_ctrl->compress_type != compress_type) {
submit_one_bio(bio_ctrl);
bio_ctrl->compress_type = compress_type;
}
/*
 * Check if we can skip waiting for the @ordered extent covering the block at
 * @fileoff.
 *
 * @inode:   inode the ordered extent belongs to
 * @ordered: the ordered extent covering the block at @fileoff
 * @fileoff: Both input and output.
 *	     Input as the file offset where the check should start at.
 *	     Output as where the next check should start at,
 *	     if the function returns true.
 *
 * Return true if we can skip to @fileoff.  The caller needs to check the new
 * @fileoff value to make sure it covers the full range, before skipping the
 * full OE.
 *
 * Return false if we must wait for the ordered extent.
 */
static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
				       struct btrfs_ordered_extent *ordered,
				       u64 *fileoff)
{
	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 blocksize = fs_info->sectorsize;
	const u64 oe_end = ordered->file_offset + ordered->num_bytes;
	const u64 cur = *fileoff;
	struct folio *folio;
	bool ret = false;

	folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);

	/*
	 * We should have locked the folio(s) for range [start, end], thus
	 * there must be a folio and it must be locked.
	 */
	ASSERT(!IS_ERR(folio));
	ASSERT(folio_test_locked(folio));

	/*
	 * There are several cases for the folio and OE combination:
	 *
	 * 1) Folio has no private flag
	 *    The OE has all its IO done but not yet finished, and folio got
	 *    invalidated.
	 *
	 *    Here we have to wait for the OE to finish, as it may contain the
	 *    to-be-inserted data checksum.
	 *    Without the data checksum inserted into the csum tree, read will
	 *    just fail with missing csum.
	 */
	if (!folio_test_private(folio))
		goto out;

	/*
	 * 2) The first block is DIRTY.
	 *
	 *    This means the OE is created by some other folios whose file pos is
	 *    before this one.  And since we are holding the folio lock, the
	 *    writeback of this folio cannot start.
	 *
	 *    We must skip the whole OE, because it will never start until we
	 *    finished our folio read and unlocked the folio.
	 */
	if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
		u64 range_len = min(folio_end(folio), oe_end) - cur;

		/*
		 * At least inside the folio, all the remaining blocks should
		 * also be dirty.
		 */
		ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
		*fileoff = oe_end;
		ret = true;
		goto out;
	}

	/*
	 * 3) The first block is uptodate.
	 *
	 *    At least the first block can be skipped, but we are still not fully
	 *    sure.  E.g. if the OE has some other folios in the range that cannot
	 *    be skipped.
	 *    So we return true and update @fileoff to the OE/folio boundary.
	 */
	if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
		u64 range_len = min(folio_end(folio), oe_end) - cur;

		/*
		 * The whole range to the OE end or folio boundary should also
		 * be uptodate.
		 */
		ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
		*fileoff = cur + range_len;
		ret = true;
		goto out;
	}

	/*
	 * 4) The first block is not uptodate.
	 *
	 *    This means the folio is invalidated after the writeback was finished,
	 *    but by some other operations (e.g. block aligned buffered write) the
	 *    folio is inserted into filemap.
	 *    Very much the same as case 1), so @ret stays false.
	 */
out:
	/* Drop the reference taken by filemap_get_folio(). */
	folio_put(folio);
	return ret;
}
/* * Locking helper to make sure we get a stable view of extent maps for the * involved range. * * This is for folio read paths (read and readahead), thus the involved range * should have all the folios locked.
*/ staticvoid lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state)
{
u64 cur_pos;
/* Caller must provide a valid @cached_state. */
ASSERT(cached_state);
/* The range must at least be page aligned, as all read paths are folio based. */
ASSERT(IS_ALIGNED(start, PAGE_SIZE));
ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
ordered = btrfs_lookup_ordered_range(inode, cur_pos,
end - cur_pos + 1); /* * No ordered extents in the range, and we hold the extent lock, * no one can modify the extent maps in the range, we're safe to return.
*/ if (!ordered) break;
/* Check if we can skip waiting for the whole OE. */ if (can_skip_ordered_extent(inode, ordered, start, end)) {
cur_pos = min(ordered->file_offset + ordered->num_bytes,
end + 1);
btrfs_put_ordered_extent(ordered); continue;
}
/* Now wait for the OE to finish. */
btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
btrfs_put_ordered_extent(ordered); /* We have unlocked the whole range, restart from the beginning. */ goto again;
}
}
/* * Do all of the delayed allocation setup. * * Return >0 if all the dirty blocks are submitted async (compression) or inlined. * The @folio should no longer be touched (treat it as already unlocked). * * Return 0 if there is still dirty block that needs to be submitted through * extent_writepage_io(). * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be * submitted, and @folio is still kept locked. * * Return <0 if there is any error hit. * Any allocated ordered extent range covering this folio will be marked * finished (IOERR), and @folio is still kept locked.
*/ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{ struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; constbool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; constunsignedint blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsignedlong delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the * last delalloc end.
*/
u64 last_delalloc_end = 0; /* * The range end (exclusive) of the last successfully finished delalloc * range. * Any range covered by ordered extent must either be manually marked * finished (error handling), or has IO submitted (and finish the * ordered extent normally). * * This records the end of ordered extent cleanup if we hit an error.
*/
u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0; int ret = 0; int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, folio)) {
ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
/* Run the delalloc ranges for the above locked ranges. */ while (delalloc_start < page_end) {
u64 found_start;
u32 found_len; bool found;
if (!is_subpage) { /* * For non-subpage case, the found delalloc range must * cover this folio and there must be only one locked * delalloc range.
*/
found_start = page_start;
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len);
} if (!found) break; /* * The subpage range covers the last sector, the delalloc range may * end beyond the folio boundary, use the saved delalloc_end * instead.
*/ if (found_start + found_len >= page_end)
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) { /* * Some delalloc range may be created by previous folios. * Thus we still need to clean up this range during error * handling.
*/
last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc); if (ret >= 0)
last_finished_delalloc_end = found_start + found_len; if (unlikely(ret < 0))
btrfs_err_rl(fs_info, "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
btrfs_root_id(inode->root),
btrfs_ino(inode),
folio_pos(folio),
blocks_per_folio,
&bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else { /* * We've hit an error during previous delalloc range, * have to cleanup the remaining locked ranges.
*/
btrfs_unlock_extent(&inode->io_tree, found_start,
found_start + found_len - 1, NULL);
unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
}
/* * We have some ranges that's going to be submitted asynchronously * (compression or inline). These range have their own control * on when to unlock the pages. We should not touch them * anymore, so clear the range from the submission bitmap.
*/ if (ret > 0) { unsignedint start_bit = (found_start - page_start) >>
fs_info->sectorsize_bits; unsignedint end_bit = (min(page_end + 1, found_start + found_len) -
page_start) >> fs_info->sectorsize_bits;
bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
} /* * Above btrfs_run_delalloc_range() may have unlocked the folio, * thus for the last range, we cannot touch the folio anymore.
*/ if (found_start + found_len >= last_delalloc_end + 1) break;
delalloc_start = found_start + found_len;
} /* * It's possible we had some ordered extents created before we hit * an error, cleanup non-async successfully created delalloc ranges.
*/ if (unlikely(ret < 0)) { unsignedint bitmap_size = min(
(last_finished_delalloc_end - page_start) >>
fs_info->sectorsize_bits,
blocks_per_folio);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
page_start + (bit << fs_info->sectorsize_bits),
fs_info->sectorsize, false); return ret;
}
out: if (last_delalloc_end)
delalloc_end = last_delalloc_end; else
delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so * we don't subtract one from PAGE_SIZE.
*/
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
/* * If all ranges are submitted asynchronously, we just need to account * for them here.
*/ if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write; return 1;
}
if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192;
/* * Return 0 if we have submitted or queued the sector for submission. * Return <0 for critical errors, and the sector will have its dirty flag cleared. * * Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/ staticint submit_one_sector(struct btrfs_inode *inode, struct folio *folio,
u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em;
u64 block_start;
u64 disk_bytenr;
u64 extent_offset;
u64 em_end; const u32 sectorsize = fs_info->sectorsize;
ASSERT(IS_ALIGNED(filepos, sectorsize));
/* @filepos >= i_size case should be handled by the caller. */
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) { /* * When submission failed, we should still clear the folio dirty. * Or the folio will be written back again but without any * ordered extent.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); return PTR_ERR(em);
}
/* * Although the PageDirty bit is cleared before entering this * function, subpage dirty bit is not cleared. * So clear subpage dirty bit here so next time we won't submit * a folio for a range already written to disk.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. * As long as the folio is properly locked and the range is correct, * we should always get the folio with writeback flag.
*/
ASSERT(folio_test_writeback(folio));
/* * Helper for extent_writepage(). This calls the writepage start hooks, * and does the loop to map the page into extents and bios. * * We return 1 if the IO is started and the page is unlocked, * 0 if all went well (page still locked) * < 0 if there were errors (page still locked)
*/ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct folio *folio,
u64 start, u32 len, struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; unsignedlong range_bitmap = 0; bool submitted_io = false; int found_error = 0; const u64 folio_start = folio_pos(folio); constunsignedint blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur; int bit; int ret = 0;
if (cur >= i_size) {
btrfs_mark_ordered_io_finished(inode, folio, cur,
start + len - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * But we still need to clear the dirty subpage bit, or * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent.
*/
btrfs_folio_clear_dirty(fs_info, folio, cur,
start + len - cur); break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (unlikely(ret < 0)) { /* * bio_ctrl may contain a bio crossing several folios. * Submit it immediately so that the bio has a chance * to finish normally, other than marked as error.
*/
submit_one_bio(bio_ctrl); /* * Failed to grab the extent map which should be very rare. * Since there is no bio submitted to finish the ordered * extent, we have to manually finish this sector.
*/
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false); if (!found_error)
found_error = ret; continue;
}
submitted_io = true;
}
/* * If we didn't submit any sector (>= i_size), the folio dirty flag gets * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared * by folio_start_writeback() if the folio is not dirty). * * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. * * If we hit any error, the corresponding sector will have its dirty * flag cleared and writeback finished, thus no need to handle the error case.
*/ if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
} return found_error;
}
/*
 * Write back a single dirty data folio.
 *
 * The writepage semantics are similar to regular writepage: extent records
 * are inserted to lock ranges in the tree, and as dirty areas are found, they
 * are marked writeback.  Then the lock bits are removed and the end_io
 * handler clears the writeback ranges.
 *
 * @folio:    the folio to write back
 * @bio_ctrl: bio assembly context; ->submit_bitmap is reset here and
 *            ->wbc->nr_to_write is decremented once extent_writepage_io()
 *            has been run on the folio
 *
 * Return 0 if everything goes well.
 * Return <0 for error (also recorded on the mapping via mapping_set_error()).
 */
static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;
	loff_t i_size = i_size_read(&inode->vfs_inode);
	const pgoff_t end_index = i_size >> PAGE_SHIFT;
	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
	/*
	 * Offset of i_size inside this folio.  It must be computed before it
	 * is handed to folio_zero_range() below; previously it was used
	 * without ever being assigned.
	 */
	const size_t pg_offset = offset_in_folio(folio, i_size);

	/* The folio straddles EOF: zero the tail that lies beyond i_size. */
	if (folio_contains(folio, end_index))
		folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);

	/*
	 * Default to unlock the whole folio.
	 * The proper bitmap can only be initialized until writepage_delalloc().
	 */
	bio_ctrl->submit_bitmap = (unsigned long)-1;

	/*
	 * If the page is dirty but without private set, it's marked dirty
	 * without informing the fs.
	 * Nowadays that is a bug, since the introduction of
	 * pin_user_pages*().
	 *
	 * So here we check if the page has private set to rule out such
	 * case.
	 * But we also have a long history of relying on the COW fixup,
	 * so here we only enable this check for experimental builds until
	 * we're sure it's safe.
	 */
	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
	    unlikely(!folio_test_private(folio))) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
			     btrfs_root_id(inode->root),
			     btrfs_ino(inode), folio_pos(folio));
		ret = -EUCLEAN;
		goto done;
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0)
		goto done;

	/* A return of 1 means the helper already dealt with the folio. */
	ret = writepage_delalloc(inode, folio, bio_ctrl);
	if (ret == 1)
		return 0;
	if (ret)
		goto done;

	ret = extent_writepage_io(inode, folio, folio_pos(folio),
				  folio_size(folio), bio_ctrl, i_size);
	if (ret == 1)
		return 0;
	if (ret < 0)
		btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
			     btrfs_root_id(inode->root), btrfs_ino(inode),
			     folio_pos(folio), blocks_per_folio,
			     &bio_ctrl->submit_bitmap, ret);

	bio_ctrl->wbc->nr_to_write--;

done:
	if (ret < 0)
		mapping_set_error(folio->mapping, ret);
	/*
	 * Only unlock ranges that are submitted. As there can be some async
	 * submitted ranges inside the folio.
	 */
	btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
	ASSERT(ret <= 0);
	return ret;
}
/*
 * Lock extent buffer status and pages for writeback.
 *
 * Return %false if the extent buffer doesn't need to be submitted (e.g. the
 * extent buffer is not dirty)
 * Return %true if the extent buffer is submitted to bio.
*/ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, struct writeback_control *wbc)
{ struct btrfs_fs_info *fs_info = eb->fs_info; bool ret = false;
btrfs_tree_lock(eb); while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb); if (wbc->sync_mode != WB_SYNC_ALL) returnfalse;
wait_on_extent_buffer_writeback(eb);
btrfs_tree_lock(eb);
}
/* * We need to do this to prevent races in people who check if the eb is * under IO since we can end up having no IO bits set for a short period * of time.
*/
spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsignedlong flags;
/* * A read may stumble upon this buffer later, make sure that it gets an * error and knows there was an error.
*/
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
/* * We need to set the mapping with the io error as well because a write * error will flip the file system readonly, and then syncfs() will * return a 0 because we are readonly if we don't modify the err seq for * the superblock.
*/
mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);
/* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's * committing (when we call filemap_fdata[write|wait]_range against * the btree inode), we might have * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it * returns an error or an error happens during writeback, when we're * committing the transaction we wouldn't know about it, since the pages * can be no longer dirty nor marked anymore for writeback (if a * subsequent modification to the extent buffer didn't happen before the * transaction commit), which makes filemap_fdata[write|wait]_range not * able to find the pages which contain errors at transaction * commit time. So if this happens we must abort the transaction, * otherwise we commit a super block with btree roots that point to * btree nodes/leafs whose content on disk is invalid - either garbage * or the content of some node/leaf from a past generation that got * cowed or deleted and is no longer valid. * * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would * not be enough - we need to distinguish between log tree extents vs * non-log tree extents, and the next filemap_fdatawait_range() call * will catch and clear such errors in the mapping - and that call might * be from a log sync and not from a transaction commit. Also, checking * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is * not done and would not be reliable - the eb might have been released * from memory and reading it back again means that flag would not be * set (since it's a runtime flag, not persisted on disk). 
* * Using the flags below in the btree inode also makes us achieve the * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started * writeback for all dirty pages and before filemap_fdatawait_range() * is called, the writeback for all dirty pages had already finished * with errors - because we were not using AS_EIO/AS_ENOSPC, * filemap_fdatawait_range() would return success, as it could not know * that writeback errors happened (the pages were no longer tagged for * writeback).
*/ switch (eb->log_index) { case -1:
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); break; case 0:
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); break; case 1:
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); break; default:
BUG(); /* unexpected, logic error */
}
}
rcu_read_lock(); while ((eb = find_get_eb(&xas, end, tag)) != NULL) { if (!eb_batch_add(batch, eb)) {
*start = ((eb->start + eb->len) >> fs_info->nodesize_bits); goto out;
}
} if (end == ULONG_MAX)
*start = ULONG_MAX; else
*start = end + 1;
out:
rcu_read_unlock();
return batch->nr;
}
/* * The endio specific version which won't touch any unsafe spinlock in endio * context.
*/ staticstruct extent_buffer *find_extent_buffer_nolock( struct btrfs_fs_info *fs_info, u64 start)
{ struct extent_buffer *eb; unsignedlong index = (start >> fs_info->nodesize_bits);
folio_lock(folio);
btrfs_meta_folio_clear_dirty(folio, eb);
btrfs_meta_folio_set_writeback(folio, eb); if (!folio_test_dirty(folio))
wbc->nr_to_write -= folio_nr_pages(folio);
bio_add_folio_nofail(&bbio->bio, folio, range_len,
offset_in_folio(folio, range_start));
wbc_account_cgroup_owner(wbc, folio, range_len);
folio_unlock(folio);
} /* * If the fs is already in error status, do not submit any writeback * but immediately finish it.
*/ if (unlikely(BTRFS_FS_ERROR(fs_info))) {
btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); return;
}
btrfs_submit_bbio(bbio, 0);
}
/*
 * Wait for all eb writeback in the given range to finish.
 *
 * @fs_info: The fs_info for this file system.
 * @start:   The offset of the range to start waiting on writeback.
 * @end:     The end of the range, inclusive. This is meant to be used in
 *           conjunction with wait_marked_extents, so this will usually be
 *           the_next_eb->start - 1.
*/ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
u64 end)
{ struct eb_batch batch; unsignedlong start_index = (start >> fs_info->nodesize_bits); unsignedlong end_index = (end >> fs_info->nodesize_bits);
int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc)
{ struct btrfs_eb_write_context ctx = { .wbc = wbc }; struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); int ret = 0; int done = 0; int nr_to_write_done = 0; struct eb_batch batch; unsignedint nr_ebs; unsignedlong index; unsignedlong end; int scanned = 0;
xa_mark_t tag;
eb_batch_init(&batch); if (wbc->range_cyclic) {
index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits);
end = -1;
/* * Start from the beginning does not need to cycle over the * range, mark it as scanned.
*/
scanned = (index == 0);
} else {
index = (wbc->range_start >> fs_info->nodesize_bits);
end = (wbc->range_end >> fs_info->nodesize_bits);
scanned = 1;
} if (wbc->sync_mode == WB_SYNC_ALL)
tag = PAGECACHE_TAG_TOWRITE; else
tag = PAGECACHE_TAG_DIRTY;
btrfs_zoned_meta_io_lock(fs_info);
retry: if (wbc->sync_mode == WB_SYNC_ALL)
buffer_tree_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) &&
(nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { struct extent_buffer *eb;
while ((eb = eb_batch_next(&batch)) != NULL) {
ctx.eb = eb;
ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); if (ret) { if (ret == -EBUSY)
ret = 0;
if (ret) {
done = 1; break;
} continue;
}
if (!lock_extent_buffer_for_io(eb, wbc)) continue;
/* Implies write in zoned mode. */ if (ctx.zoned_bg) { /* Mark the last eb in the block group. */
btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
ctx.zoned_bg->meta_write_pointer += eb->len;
}
write_one_eb(eb, wbc);
}
nr_to_write_done = (wbc->nr_to_write <= 0);
eb_batch_release(&batch);
cond_resched();
} if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file
*/
scanned = 1;
index = 0; goto retry;
} /* * If something went wrong, don't allow any metadata write bio to be * submitted. * * This would prevent use-after-free if we had dirty pages not * cleaned up, which can still happen by fuzzed images. * * - Bad extent tree * Allowing existing tree block to be allocated for other trees. * * - Log tree operations * Exiting tree blocks get allocated to log tree, bumps its * generation, then get cleaned in tree re-balance. * Such tree block will not be written back, since it's clean, * thus no WRITTEN flag set. * And after log writes back, this tree block is not traced by * any dirty extent_io_tree. * * - Offending tree block gets re-dirtied from its original owner * Since it has bumped generation, no WRITTEN flag, it can be * reused without COWing. This tree block will not be traced * by btrfs_transaction::dirty_pages. * * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller.
*/ if (ret > 0)
ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info))
ret = -EROFS;
if (ctx.zoned_bg)
btrfs_put_block_group(ctx.zoned_bg);
btrfs_zoned_meta_io_unlock(fs_info); return ret;
}
/* * Walk the list of dirty pages of the given address space and write all of them. * * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @bio_ctrl: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete.
*/ staticint extent_write_cache_pages(struct address_space *mapping, struct btrfs_bio_ctrl *bio_ctrl)
{ struct writeback_control *wbc = bio_ctrl->wbc; struct inode *inode = mapping->host; int ret = 0; int done = 0; int nr_to_write_done = 0; struct folio_batch fbatch; unsignedint nr_folios;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index; int range_whole = 0; int scanned = 0;
xa_mark_t tag;
/* * We have to hold onto the inode so that ordered extents can do their * work when the IO finishes. The alternative to this is failing to add * an ordered extent if the igrab() fails there and that is a huge pain * to deal with, so instead just hold onto the inode throughout the * writepages operation. If it fails here we are freeing up the inode * anyway and we'd rather not waste our time writing out stuff that is * going to be truncated anyway.
*/ if (!igrab(inode)) return 0;
folio_batch_init(&fbatch); if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1; /* * Start from the beginning does not need to cycle over the * range, mark it as scanned.
*/
scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
scanned = 1;
}
/* * We do the tagged writepage as long as the snapshot flush bit is set * and we are the first one who do the filemap_flush() on this inode. * * The nr_to_write == LONG_MAX is needed to make sure other flushers do * not race in and drop the bit.
*/ if (range_whole && wbc->nr_to_write == LONG_MAX &&
test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&BTRFS_I(inode)->runtime_flags))
wbc->tagged_writepages = 1;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE; else
tag = PAGECACHE_TAG_DIRTY;
retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index; while (!done && !nr_to_write_done && (index <= end) &&
(nr_folios = filemap_get_folios_tag(mapping, &index,
end, tag, &fbatch))) { unsigned i;
for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i];
done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor * the folio lock: the folio may be truncated or * invalidated (changing folio->mapping to NULL).
*/ if (!folio_trylock(folio)) {
submit_write_bio(bio_ctrl, 0);
folio_lock(folio);
}
if (unlikely(folio->mapping != mapping)) {
folio_unlock(folio); continue;
}
if (!folio_test_dirty(folio)) { /* Someone wrote it for us. */
folio_unlock(folio); continue;
}
/*
 * For subpage case, compression can lead to mixed
 * writeback and dirty flags, e.g:
 * 0     32K    64K    96K    128K
 * |     |//////||/////|   |//|
 *
 * In above case, [32K, 96K) is asynchronously submitted
 * for compression, and [124K, 128K) needs to be written back.
 *
 * If we didn't wait for writeback of page 64K, [124K, 128K)
 * won't be submitted as the page still has the writeback flag
 * and will be skipped in the next check.
 *
 * This mixed writeback and dirty case is only possible for
 * subpage case.
 *
 * TODO: Remove this check after migrating compression to
 * regular submission.
*/ if (wbc->sync_mode != WB_SYNC_NONE ||
btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
}
if (folio_test_writeback(folio) ||
!folio_clear_dirty_for_io(folio)) {
folio_unlock(folio); continue;
}
ret = extent_writepage(folio, bio_ctrl); if (ret < 0) {
done = 1; break;
}
/* * The filesystem may choose to bump up nr_to_write. * We have to make sure to honor the new nr_to_write * at any time.
*/
nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
wbc->nr_to_write <= 0);
}
folio_batch_release(&fbatch);
cond_resched();
} if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file
*/
scanned = 1;
index = 0;
/* * If we're looping we could run into a page that is locked by a * writer and that writer could be waiting on writeback for a * page in our current bio, and thus deadlock, so flush the * write bio here.
*/
submit_write_bio(bio_ctrl, 0); goto retry;
}
/*
 * Submit the pages in the range to bio for call sites where the delalloc
 * range has already been run (i.e. the ordered extent is inserted) and all
 * pages are still locked.
 *
 * @inode:        inode whose mapping holds the folios
 * @locked_folio: folio exempt from the "must be dirty" assertion below
 * @start, @end:  byte range to submit (the loop bounds are not visible in
 *                this copy; @end is used as an inclusive limit by min())
 * @wbc:          writeback control used to build the bio flags
 * @pages_dirty:  when true, every folio other than @locked_folio is asserted
 *                to be dirty
 *
 * NOTE(review): "folio", "cur_end" and "cur_len" are used without visible
 * declarations and "continue" appears without a visible loop header -- the
 * per-folio loop setup seems to be missing from this copy; confirm against
 * the complete file.
 */
void extent_write_locked_range(struct inode *inode, conststruct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty)
{ bool found_error = false; int ret = 0; struct address_space *mapping = inode->i_mapping; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u32 sectorsize = fs_info->sectorsize;
loff_t i_size = i_size_read(inode);
u64 cur = start; struct btrfs_bio_ctrl bio_ctrl = {
.wbc = wbc,
.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
};
if (wbc->no_cgroup_owner)
bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT;
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
/*
 * This shouldn't happen, the pages are pinned and locked, this
 * code is just in case, but shouldn't actually be run.
 */
if (IS_ERR(folio)) {
cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
cur_len = cur_end + 1 - cur;
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
cur, cur_len, false);
mapping_set_error(mapping, PTR_ERR(folio));
/*
 * NOTE(review): this path advances cur to cur_end, while the
 * normal path below advances it to cur_end + 1 -- confirm this
 * difference is intentional.
 */
cur = cur_end; continue;
}
ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio)
ASSERT(folio_test_dirty(folio));
/*
 * Set the submission bitmap to submit all sectors.
 * extent_writepage_io() will do the truncation correctly.
 */
bio_ctrl.submit_bitmap = (unsignedlong)-1;
ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len,
&bio_ctrl, i_size); if (ret == 1) goto next_page;
/* Record the failure on the mapping, then release the range lock. */
if (ret)
mapping_set_error(mapping, ret);
btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0)
found_error = true;
next_page:
/* Drop the reference taken by filemap_get_folio(). */
folio_put(folio);
cur = cur_end + 1;
}
/*
 * NOTE(review): "ret" here is whatever the last extent_writepage_io() call
 * returned, which is not necessarily the error that set found_error.
 */
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
}
/* * Allow only a single thread to do the reloc work in zoned mode to * protect the write pointer updates.
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, &bio_ctrl);
submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret;
}
if (em_cached)
btrfs_free_extent_map(em_cached);
submit_one_bio(&bio_ctrl);
}
/*
 * Basic invalidate_folio code: this waits on any locked or writeback ranges
 * corresponding to the folio, and then deletes any extent state records from
 * the tree.
 *
 * @tree:   io tree of the btree inode (asserted below)
 * @folio:  folio being invalidated
 * @offset: byte offset inside the folio where invalidation starts; it is
 *          rounded up to the filesystem sector size
 *
 * Always returns 0.
 */
int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset)
{
	struct extent_state *cached_state = NULL;
	u64 start = folio_pos(folio);
	u64 end = start + folio_size(folio) - 1;
	size_t blocksize = folio_to_fs_info(folio)->sectorsize;

	/* This function is only called for the btree inode */
	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);

	start += ALIGN(offset, blocksize);
	if (start > end)
		return 0;

	/*
	 * Take the range lock before releasing it below: unlocking a range we
	 * never locked could drop EXTENT_LOCKED held by someone else.  With
	 * the lock held, wait for any writeback on the folio to finish.
	 */
	btrfs_lock_extent(tree, start, end, &cached_state);
	folio_wait_writeback(folio);

	/*
	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
	 * so here we only need to unlock the extent range to free any
	 * existing extent state.
	 */
	btrfs_unlock_extent(tree, start, end, &cached_state);
	return 0;
}
/*
 * A helper for struct address_space_operations::release_folio, this tests for
 * areas of the folio that are locked or under IO and drops the related state
 * bits if it is safe to drop the folio.
 *
 * @tree:  io tree covering the folio's byte range
 * @folio: folio that is a candidate for release
 *
 * Return true if the state bits were cleared and the folio may be released,
 * false if the range is locked (other than for ordered extent completion) or
 * clearing the bits failed.
 */
static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio)
{
	struct extent_state *cached_state = NULL;
	u64 start = folio_pos(folio);
	u64 end = start + folio_size(folio) - 1;
	u32 range_bits = 0;
	u32 clear_bits;
	bool ret = false;
	int ret2;

	/*
	 * Fetch the bits currently set on the range.  Without this lookup the
	 * test below would read an uninitialized variable.
	 */
	btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state);

	/*
	 * We can release the folio if it's locked only for ordered extent
	 * completion, since that doesn't require using the folio.
	 */
	if ((range_bits & EXTENT_LOCKED) &&
	    !(range_bits & EXTENT_FINISHING_ORDERED))
		goto out;

	clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW |
		       EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED |
		       EXTENT_FINISHING_ORDERED);
	/*
	 * At this point we can safely clear everything except the locked,
	 * nodatasum, delalloc new and finishing ordered bits. The delalloc new
	 * bit will be cleared by ordered extent completion.
	 */
	ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state);
	/*
	 * If clear_extent_bit failed for enomem reasons, we can't allow the
	 * release to continue.
	 */
	if (ret2 == 0)
		ret = true;
out:
	btrfs_free_extent_state(cached_state);

	return ret;
}
/* * a helper for release_folio. As long as there are no locked extents * in the range corresponding to the page, both state records and extent * map records are removed
*/ bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
u64 start = folio_pos(folio);
u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree;
while (start <= end) { const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); const u64 len = end - start + 1; struct extent_map_tree *extent_tree = &inode->extent_tree; struct extent_map *em;
write_lock(&extent_tree->lock);
em = btrfs_lookup_extent_mapping(extent_tree, start, len); if (!em) {
write_unlock(&extent_tree->lock); break;
} if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
write_unlock(&extent_tree->lock);
btrfs_free_extent_map(em); break;
} if (btrfs_test_range_bit_exists(io_tree, em->start,
btrfs_extent_map_end(em) - 1,
EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used by a fast * fsync, we can remove it. If it's being logged we can safely * remove it since fsync took an extra reference on the em.
*/ if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) goto remove_em; /* * If it's in the list of modified extents, remove it only if * its generation is older then the current one, in which case * we don't need it for a fast fsync. Otherwise don't remove it, * we could be racing with an ongoing fast fsync that could miss * the new extent.
*/ if (em->generation >= cur_gen) goto next;
remove_em: /* * We only remove extent maps that are not in the list of * modified extents or that are in the list but with a * generation lower then the current generation, so there is no * need to set the full fsync flag on the inode (it hurts the * fsync performance for workloads with a data size that exceeds * or is close to the system's memory).
*/
btrfs_remove_extent_mapping(inode, em); /* Once for the inode's extent map tree. */
btrfs_free_extent_map(em);
next:
start = btrfs_extent_map_end(em);
write_unlock(&extent_tree->lock);
/* Once for us, for the lookup_extent_mapping() reference. */
btrfs_free_extent_map(em);
if (need_resched()) { /* * If we need to resched but we can't block just exit * and leave any remaining extent maps.
*/ if (!gfpflags_allow_blocking(mask)) break;
/* * For mapped eb, we're going to change the folio private, which should * be done under the i_private_lock.
*/ if (mapped)
spin_lock(&mapping->i_private_lock);
if (!folio_test_private(folio)) { if (mapped)
spin_unlock(&mapping->i_private_lock); return;
}
if (!btrfs_meta_is_subpage(fs_info)) { /* * We do this since we'll remove the pages after we've removed * the eb from the xarray, so we could race and have this page * now attached to the new eb. So only clear folio if it's * still connected to this eb.
*/ if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(folio_test_dirty(folio));
BUG_ON(folio_test_writeback(folio)); /* We need to make sure we haven't be attached to a new eb. */
folio_detach_private(folio);
} if (mapped)
spin_unlock(&mapping->i_private_lock); return;
}
/* * For subpage, we can have dummy eb with folio private attached. In * this case, we can directly detach the private as such folio is only * attached to one dummy eb, no sharing.
*/ if (!mapped) {
btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return;
}
btrfs_folio_dec_eb_refs(fs_info, folio);
/* * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO.
*/ if (!folio_range_has_eb(folio))
btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
spin_unlock(&mapping->i_private_lock);
}
/* Release all folios attached to the extent buffer */ staticvoid btrfs_release_extent_buffer_folios(conststruct extent_buffer *eb)
{
ASSERT(!extent_buffer_under_io(eb));
for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { struct folio *folio = eb->folios[i];
if (!folio) continue;
detach_extent_buffer_folio(eb, folio);
}
}
/*
 * Tear down an extent buffer: detach its folios, drop it from the leak
 * debugging list and return the structure to the slab cache.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	/* Folios go first, while the eb is still fully valid. */
	btrfs_release_extent_buffer_folios(eb);

	btrfs_leak_debug_del_eb(eb);

	kmem_cache_free(extent_buffer_cache, eb);
}
/*
 * For eb allocation error paths only.
 *
 * btrfs_release_extent_buffer() does not call folio_put(), so here each
 * folio is detached, its reference dropped, and its slot set to NULL so that
 * btrfs_release_extent_buffer() will not detach it a second time.
 */
static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
{
	/*
	 * Snapshot the count up front: num_extent_folios() cannot be used as
	 * the loop bound because eb->folios changes as we go.
	 */
	const int total = num_extent_folios(eb);
	int i = 0;

	while (i < total) {
		struct folio *victim;

		ASSERT(eb->folios[i]);
		victim = eb->folios[i];

		detach_extent_buffer_folio(eb, victim);
		folio_put(victim);
		eb->folios[i] = NULL;
		i++;
	}
}
struct extent_buffer *btrfs_clone_extent_buffer(conststruct extent_buffer *src)
{ struct extent_buffer *new; int num_folios; int ret;
new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL;
/* * Set UNMAPPED before calling btrfs_release_extent_buffer(), as * btrfs_release_extent_buffer() have different behavior for * UNMAPPED subpage extent buffer.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
ret = alloc_eb_folio_array(new, false); if (ret) goto release_eb;
ASSERT(num_extent_folios(src) == num_extent_folios(new), "%d != %d", num_extent_folios(src), num_extent_folios(new)); /* Explicitly use the cached num_extent value from now on. */
num_folios = num_extent_folios(src); for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL); if (ret < 0) goto cleanup_folios;
WARN_ON(folio_test_dirty(folio));
} for (int i = 0; i < num_folios; i++)
folio_put(new->folios[i]);
eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL;
ret = alloc_eb_folio_array(eb, false); if (ret) goto release_eb;
for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) goto cleanup_folios;
} for (int i = 0; i < num_extent_folios(eb); i++)
folio_put(eb->folios[i]);
/*
 * Make sure @eb carries its tree reference (EXTENT_BUFFER_TREE_REF plus the
 * refcount that goes with it).
 *
 * The TREE_REF bit is first set when the extent_buffer is added to the
 * xarray.  It is also set again, if found unset, when a new reference is
 * created by find_extent_buffer.
 *
 * It is only cleared in two cases: freeing the last non-tree reference to
 * the extent_buffer when its STALE bit is set, or calling release_folio when
 * the tree reference is the only reference.
 *
 * In both cases care is taken to ensure that the extent_buffer's pages are
 * not under io.  However, release_folio can run concurrently with the
 * creation of new references, which is prone to races between the calls to
 * check_buffer_tree_ref() in those code paths and the clearing of TREE_REF in
 * try_release_extent_buffer.
 *
 * The lifetime of the extent_buffer in the xarray is adequately protected by
 * the refcount, but the TREE_REF bit and its corresponding reference are
 * not.  To protect against this class of races, check_buffer_tree_ref() is
 * called from the code paths which trigger io.  Once io is initiated,
 * TREE_REF can no longer be cleared, so that is the moment at which any such
 * race is best fixed.
 */
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	/* Fast path: the tree reference is already there, no lock needed. */
	if (refcount_read(&eb->refs) >= 2 &&
	    test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	/* Slow path: set the bit and take its reference under refs_lock. */
	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		refcount_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}
eb = find_extent_buffer_nolock(fs_info, start); if (!eb) return NULL; /* * Lock our eb's refs_lock to avoid races with free_extent_buffer(). * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and * another task running free_extent_buffer() might have seen that flag * set, eb->refs == 2, that the buffer isn't under IO (dirty and * writeback flags not set) and it's still in the tree (flag * EXTENT_BUFFER_TREE_REF set), therefore being in the process of * decrementing the extent buffer's reference count twice. So here we * could race and increment the eb's reference count, clear its stale * flag, mark it as dirty and drop our reference before the other task * finishes executing free_extent_buffer, which would later result in * an attempt to free an extent buffer that is dirty.
*/ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
mark_extent_buffer_accessed(eb); return eb;
}
/* * For subpage case, we completely rely on xarray to ensure we don't try * to insert two ebs for the same bytenr. So here we always return NULL * and just continue.
*/ if (btrfs_meta_is_subpage(fs_info)) return NULL;
/* Page not yet attached to an extent buffer */ if (!folio_test_private(folio)) return NULL;
/* * We could have already allocated an eb for this folio and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can * just overwrite folio private.
*/
exists = folio_get_private(folio); if (refcount_inc_not_zero(&exists->refs)) return exists;
/* * Validate alignment constraints of eb at logical address @start.
*/ staticbool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
btrfs_err(fs_info, "bad tree block start %llu", start); returntrue;
}
if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) {
btrfs_err(fs_info, "tree block is not nodesize aligned, start %llu nodesize %u",
start, fs_info->nodesize); returntrue;
} if (fs_info->nodesize >= PAGE_SIZE &&
!PAGE_ALIGNED(start)) {
btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u",
start, fs_info->nodesize); returntrue;
} if (!IS_ALIGNED(start, fs_info->nodesize) &&
!test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
btrfs_warn(fs_info, "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
start, fs_info->nodesize);
} returnfalse;
}
/* * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, * and @found_eb_ret would be updated. * Return -EAGAIN if the filemap has an existing folio but with different size * than @eb. * The caller needs to free the existing folios and retry using the same order.
*/ staticint attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_folio_state *prealloc, struct extent_buffer **found_eb_ret)
{
/* Caller should ensure the folio exists. */
ASSERT(eb->folios[i]);
retry:
existing_folio = NULL;
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL); if (!ret) goto finish;
existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ if (IS_ERR(existing_folio)) goto retry;
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
if (folio_size(existing_folio) != eb->folio_size) {
folio_unlock(existing_folio);
folio_put(existing_folio); return -EAGAIN;
}
finish:
spin_lock(&mapping->i_private_lock); if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
} elseif (existing_folio) { struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info, existing_folio); if (existing_eb) { /* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio); return 1;
} /* The extent buffer no longer exists, we can reuse the folio. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
eb->folio_size = folio_size(eb->folios[i]);
eb->folio_shift = folio_shift(eb->folios[i]); /* Should not fail, as we have preallocated the memory. */
ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
ASSERT(!ret); /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the * error path.
*/
btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
spin_unlock(&mapping->i_private_lock); return 0;
}
if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL);
#if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info, "extent buffer %llu is beyond 32bit page cache limit", start);
btrfs_err_32bit_limit(fs_info); return ERR_PTR(-EOVERFLOW);
} if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
btrfs_warn_32bit_limit(fs_info); #endif
eb = find_extent_buffer(fs_info, start); if (eb) return eb;
eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM);
/* * The reloc trees are just snapshots, so we need them to appear to be * just like any other fs tree WRT lockdep.
*/ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
lockdep_owner = BTRFS_FS_TREE_OBJECTID;
/* * Preallocate folio private for subpage case, so that we won't * allocate memory with i_private_lock nor page lock hold. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier.
*/ if (btrfs_meta_is_subpage(fs_info)) {
prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc); goto out;
}
}
reallocate: /* Allocate all pages first. */
ret = alloc_eb_folio_array(eb, true); if (ret < 0) {
btrfs_free_folio_state(prealloc); goto out;
}
/* Attach all pages to the filemap. */ for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); if (ret > 0) {
ASSERT(existing_eb); goto out;
}
/* * TODO: Special handling for a corner case where the order of * folios mismatch between the new eb and filemap. * * This happens when: * * - the new eb is using higher order folio * * - the filemap is still using 0-order folios for the range * This can happen at the previous eb allocation, and we don't * have higher order folio for the call. * * - the existing eb has already been freed * * In this case, we have to free the existing folios first, and * re-allocate using the same order. * Thankfully this is not going to happen yet, as we're still * using 0-order folios.
*/ if (unlikely(ret == -EAGAIN)) {
DEBUG_WARN("folio order mismatch between new eb and filemap"); goto reallocate;
}
attached++;
/* * Only after attach_eb_folio_to_filemap(), eb->folios[] is * reliable, as we may choose to reuse the existing page cache * and free the allocated page.
*/
folio = eb->folios[i];
WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
/* * Check if the current page is physically contiguous with previous eb * page. * At this stage, either we allocated a large folio, thus @i * would only be 0, or we fall back to per-page allocation.
*/ if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
page_contig = false;
if (!btrfs_meta_folio_test_uptodate(folio, eb))
uptodate = 0;
/* * We can't unlock the pages just yet since the extent buffer * hasn't been properly inserted into the xarray, this opens a * race with btree_release_folio() which can free a page while we * are still filling in all pages for the buffer and we could crash.
*/
} if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); /* All pages are physically contiguous, can skip cross page handling. */ if (page_contig)
eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
xa_lock_irq(&fs_info->buffer_tree);
existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
start >> fs_info->nodesize_bits, NULL, eb,
GFP_NOFS); if (xa_is_err(existing_eb)) {
ret = xa_err(existing_eb);
xa_unlock_irq(&fs_info->buffer_tree); goto out;
} if (existing_eb) { if (!refcount_inc_not_zero(&existing_eb->refs)) {
xa_unlock_irq(&fs_info->buffer_tree); goto again;
}
xa_unlock_irq(&fs_info->buffer_tree); goto out;
}
xa_unlock_irq(&fs_info->buffer_tree);
/* add one reference for the tree */
check_buffer_tree_ref(eb);
/* * Now it's safe to unlock the pages because any calls to * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely.
*/ for (int i = 0; i < num_extent_folios(eb); i++) {
folio_unlock(eb->folios[i]); /* * A folio that has been added to an address_space mapping * should not continue holding the refcount from its original * allocation indefinitely.
*/
folio_put(eb->folios[i]);
} return eb;
out:
WARN_ON(!refcount_dec_and_test(&eb->refs));
/* * Any attached folios need to be detached before we unlock them. This * is because when we're inserting our new folios into the mapping, and * then attaching our eb to that folio. If we fail to insert our folio * we'll lookup the folio for that index, and grab that EB. We do not * want that to grab this eb, as we're getting ready to free it. So we * have to detach it first and then unlock it. * * Note: the bounds is num_extent_pages() as we need to go through all slots.
*/ for (int i = 0; i < num_extent_pages(eb); i++) { struct folio *folio = eb->folios[i];
if (i < attached) {
ASSERT(folio);
detach_extent_buffer_folio(eb, folio);
folio_unlock(folio);
} elseif (!folio) { continue;
}
if (refcount_dec_and_test(&eb->refs)) { struct btrfs_fs_info *fs_info = eb->fs_info;
spin_unlock(&eb->refs_lock);
/* * We're erasing, theoretically there will be no allocations, so * just use GFP_ATOMIC. * * We use cmpxchg instead of erase because we do not know if * this eb is actually in the tree or not, we could be cleaning * up an eb that we allocated but never inserted into the tree. * Thus use cmpxchg to remove it from the tree if it is there, * or leave the other entry if this isn't in the tree. * * The documentation says that putting a NULL value is the same * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't * in this case.
*/
xa_cmpxchg_irq(&fs_info->buffer_tree,
eb->start >> fs_info->nodesize_bits, eb, NULL,
GFP_ATOMIC);
btrfs_leak_debug_del_eb(eb); /* Should be safe to release folios at this point. */
btrfs_release_extent_buffer_folios(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
kmem_cache_free(extent_buffer_cache, eb); return 1;
} #endif
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1;
}
spin_unlock(&eb->refs_lock);
return 0;
}
/*
 * Drop one reference on an extent buffer.
 *
 * @eb: the extent buffer to put, may be NULL (nothing is done then)
 *
 * As long as the reference count is high enough the reference is dropped
 * with a lockless cmpxchg. Once the count reaches the threshold (1 for an
 * EXTENT_BUFFER_UNMAPPED buffer, 3 or less for any other buffer) the drop is
 * handed over to release_extent_buffer() under eb->refs_lock.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;

	if (!eb)
		return;

	refs = refcount_read(&eb->refs);
	while (1) {
		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
			if (refs == 1)
				break;
		} else if (refs <= 3) {
			break;
		}

		/*
		 * Optimization to avoid locking eb->refs_lock. A failed
		 * cmpxchg refreshes @refs, so the thresholds above are
		 * re-evaluated against the current count.
		 */
		if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
			return;
	}

	/*
	 * release_extent_buffer() expects eb->refs_lock to be held on entry
	 * and drops it on every return path, so take it here. Calling it
	 * unlocked would unlock a spinlock we never acquired.
	 */
	spin_lock(&eb->refs_lock);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}
void free_extent_buffer_stale(struct extent_buffer *eb)
{ if (!eb) return;
if (trans && btrfs_header_generation(eb) != trans->transid) return;
/* * Instead of clearing the dirty flag off of the buffer, mark it as * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve * write-ordering in zoned mode, without the need to later re-dirty * the extent_buffer. * * The actual zeroout of the buffer will happen later in * btree_csum_one_bio.
*/ if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); return;
}
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return;
if (!was_dirty) { bool subpage = btrfs_meta_is_subpage(eb->fs_info);
/* * For subpage case, we can have other extent buffers in the * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race.
*/ if (subpage)
folio_lock(eb->folios[0]); for (int i = 0; i < num_extent_folios(eb); i++)
btrfs_meta_folio_set_dirty(eb->folios[i], eb);
buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); if (subpage)
folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
} #ifdef CONFIG_BTRFS_DEBUG for (int i = 0; i < num_extent_folios(eb); i++)
ASSERT(folio_test_dirty(eb->folios[i])); #endif
}
/* * If the extent buffer is marked UPTODATE before the read operation * completes, other calls to read_extent_buffer_pages() will return * early without waiting for the read to finish, causing data races.
*/
WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
eb->read_mirror = bbio->mirror_num;
if (uptodate &&
btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
uptodate = false;
if (uptodate)
set_extent_buffer_uptodate(eb); else
clear_extent_buffer_uptodate(eb);
int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, conststruct btrfs_tree_parent_check *check)
{ struct btrfs_bio *bbio;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0;
/* * We could have had EXTENT_BUFFER_UPTODATE cleared by the write * operation, which could potentially still be in flight. In this case * we simply want to return an error.
*/ if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO;
/* Someone else is already reading the buffer, just wait for it. */ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) return 0;
/* * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have * started and finished reading the same eb. In this case, UPTODATE * will now be set, and we shouldn't read it in again.
*/ if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
clear_extent_buffer_reading(eb); return 0;
}
staticbool report_eb_range(conststruct extent_buffer *eb, unsignedlong start, unsignedlong len)
{
btrfs_warn(eb->fs_info, "access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
DEBUG_WARN();
returntrue;
}
/* * Check if the [start, start + len) range is valid before reading/writing * the eb. * NOTE: @start and @len are offset inside the eb, not logical address. * * Caller should not touch the dst/src memory if this function returns error.
*/ staticinlineint check_eb_range(conststruct extent_buffer *eb, unsignedlong start, unsignedlong len)
{ unsignedlong offset;
/* start, start + len should not go beyond eb->len nor overflow */ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) return report_eb_range(eb, start, len);
if (check_eb_range(eb, start, len)) { /* * Invalid range hit, reset the memory, so callers won't get * some random garbage for their uninitialized memory.
*/
memset(dstv, 0, len); return;
}
if (eb->addr) {
memcpy(dstv, eb->addr + start, len); return;
}
/* * Check that the extent buffer is uptodate. * * For regular sector size == PAGE_SIZE case, check if @page is uptodate. * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/ staticvoid assert_eb_folio_uptodate(conststruct extent_buffer *eb, int i)
{ struct btrfs_fs_info *fs_info = eb->fs_info; struct folio *folio = eb->folios[i];
ASSERT(folio);
/* * If we are using the commit root we could potentially clear a page * Uptodate while we're using the extent buffer that we've previously * looked up. We don't want to complain in this case, as the page was * valid before, we just didn't write it out. Instead we want to catch * the case where we didn't actually read the block properly, which * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR.
*/ if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return;
/* * Calculate the folio and offset of the byte containing the given bit number. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @nr: bit number * @folio_index: return index of the folio in the extent buffer that contains * the given bit number * @folio_offset: return offset into the folio given by folio_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit.
*/ staticinlinevoid eb_bitmap_offset(conststruct extent_buffer *eb, unsignedlong start, unsignedlong nr, unsignedlong *folio_index,
size_t *folio_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
/* * The byte we want is the offset of the extent buffer + the offset of * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item.
*/
offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset;
/* * Determine whether a bit in a bitmap item is set. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @nr: bit number to test
*/ bool extent_buffer_test_bit(conststruct extent_buffer *eb, unsignedlong start, unsignedlong nr)
{ unsignedlong i;
size_t offset;
u8 *kaddr;
/* * Set an area of a bitmap to 1. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @pos: bit number of the first bit * @len: number of bits to set
*/ void extent_buffer_bitmap_set(conststruct extent_buffer *eb, unsignedlong start, unsignedlong pos, unsignedlong len)
{ unsignedint first_byte = start + BIT_BYTE(pos); unsignedint last_byte = start + BIT_BYTE(pos + len - 1); constbool same_byte = (first_byte == last_byte);
u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
u8 *kaddr;
if (same_byte)
mask &= BITMAP_LAST_BYTE_MASK(pos + len);
/* Handle the first byte. */
kaddr = extent_buffer_get_byte(eb, first_byte);
*kaddr |= mask; if (same_byte) return;
/* Handle the last byte. */
kaddr = extent_buffer_get_byte(eb, last_byte);
*kaddr |= BITMAP_LAST_BYTE_MASK(pos + len);
}
/* * Clear an area of a bitmap. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @pos: bit number of the first bit * @len: number of bits to clear
*/ void extent_buffer_bitmap_clear(conststruct extent_buffer *eb, unsignedlong start, unsignedlong pos, unsignedlong len)
{ unsignedint first_byte = start + BIT_BYTE(pos); unsignedint last_byte = start + BIT_BYTE(pos + len - 1); constbool same_byte = (first_byte == last_byte);
u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
u8 *kaddr;
if (same_byte)
mask &= BITMAP_LAST_BYTE_MASK(pos + len);
/* Handle the first byte. */
kaddr = extent_buffer_get_byte(eb, first_byte);
*kaddr &= ~mask; if (same_byte) return;
cur = min_t(unsignedlong, len, src_off_in_folio + 1);
cur = min(cur, dst_off_in_folio + 1);
src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
cur + 1;
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
cur);
__write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur,
use_memmove);
dst_end -= cur;
src_end -= cur;
len -= cur;
}
}
/*
 * Try to release all extent buffers covered by @folio (subpage metadata case).
 *
 * @folio: the btree inode folio to release
 *
 * Walks the buffer tree slots that fall inside the folio and drops the tree
 * reference of every extent buffer that is otherwise unused.
 *
 * Return 1 if the folio has no private data attached afterwards, 0 otherwise.
 */
static int try_release_subpage_extent_buffer(struct folio *folio)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
	struct extent_buffer *eb;
	unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits);
	unsigned long index = start;
	unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
	int ret;

	rcu_read_lock();
	xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
		/*
		 * The same as try_release_extent_buffer(), to ensure the eb
		 * won't disappear out from under us.
		 */
		spin_lock(&eb->refs_lock);
		rcu_read_unlock();

		/*
		 * Same guard as try_release_extent_buffer(): leave the eb
		 * alone if anyone besides the tree holds a reference or if it
		 * is under IO. Re-enter the RCU read side before moving on to
		 * the next slot.
		 */
		if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
			spin_unlock(&eb->refs_lock);
			rcu_read_lock();
			continue;
		}

		/*
		 * If tree ref isn't set then we know the ref on this eb is a
		 * real ref, so just return, this eb will likely be freed soon
		 * anyway.
		 */
		if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
			spin_unlock(&eb->refs_lock);
			/*
			 * The rcu_read_unlock() after the loop must stay
			 * balanced, so take the read lock again before
			 * leaving the loop.
			 */
			rcu_read_lock();
			break;
		}

		/*
		 * Here we don't care about the return value, we will always
		 * check the folio private at the end. And
		 * release_extent_buffer() will release the refs_lock.
		 */
		release_extent_buffer(eb);
		rcu_read_lock();
	}
	rcu_read_unlock();

	/*
	 * Finally to check if we have cleared folio private, as if we have
	 * released all ebs in the page, the folio private should be cleared now.
	 */
	spin_lock(&folio->mapping->i_private_lock);
	ret = folio_test_private(folio) ? 0 : 1;
	spin_unlock(&folio->mapping->i_private_lock);

	return ret;
}
/*
 * Try to release the extent buffer attached to a btree inode folio.
 *
 * @folio: the folio whose extent buffer should be released
 *
 * Return 1 if the folio has no private data, 0 if the extent buffer is still
 * referenced, under IO or has no tree reference to drop, and otherwise
 * whatever release_extent_buffer() returns. The subpage case is delegated to
 * try_release_subpage_extent_buffer().
 */
int try_release_extent_buffer(struct folio *folio)
{
	struct extent_buffer *eb;

	if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
		return try_release_subpage_extent_buffer(folio);

	/*
	 * Folio private is our pointer to the extent buffer, so nobody may
	 * change it while we look at it.
	 */
	spin_lock(&folio->mapping->i_private_lock);
	if (!folio_test_private(folio)) {
		spin_unlock(&folio->mapping->i_private_lock);
		return 1;
	}

	eb = folio_get_private(folio);
	BUG_ON(!eb);

	/*
	 * Not pretty, but taking refs_lock here keeps the eb from going away
	 * while we are still inspecting this folio.
	 */
	spin_lock(&eb->refs_lock);
	if (refcount_read(&eb->refs) == 1 && !extent_buffer_under_io(eb)) {
		spin_unlock(&folio->mapping->i_private_lock);

		/*
		 * Only the tree reference is ours to drop. On success
		 * release_extent_buffer() also releases refs_lock.
		 */
		if (test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
			return release_extent_buffer(eb);

		/*
		 * No tree ref means the remaining ref is a real one, the
		 * page will likely be freed soon anyway.
		 */
		spin_unlock(&eb->refs_lock);
		return 0;
	}

	/* Still referenced elsewhere or under IO, leave it alone. */
	spin_unlock(&eb->refs_lock);
	spin_unlock(&folio->mapping->i_private_lock);
	return 0;
}
/*
 * Attempt to readahead a child block.
 *
 * @fs_info:    the fs_info
 * @bytenr:     bytenr to read
 * @owner_root: objectid of the root that owns this eb
 * @gen:        generation for the uptodate check, can be 0
 * @level:      level for the eb
 *
 * Look up (or create) the tree block at @bytenr and, unless it is already
 * uptodate, kick off a non-waiting read for it. With @gen == 0 the uptodate
 * check is done without looking at the generation. Failures are silently
 * ignored; the reference taken on the eb is always dropped before returning.
 */
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
				u64 bytenr, u64 owner_root, u64 gen, int level)
{
	struct btrfs_tree_parent_check parent_check = {
		.level = level,
		.transid = gen
	};
	struct extent_buffer *eb;

	eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
	if (IS_ERR(eb))
		return;

	/*
	 * Only submit a read when the buffer is not uptodate yet. If that
	 * submission fails, drop our reference through the stale variant.
	 */
	if (!btrfs_buffer_uptodate(eb, gen, 1) &&
	    read_extent_buffer_pages_nowait(eb, 0, &parent_check) < 0) {
		free_extent_buffer_stale(eb);
		return;
	}

	free_extent_buffer(eb);
}
/*
 * Readahead a node's child block.
 *
 * @node: parent node we're reading from
 * @slot: slot in the parent node for the child we want to read
 *
 * Convenience wrapper around btrfs_readahead_tree_block(): the bytenr and
 * generation are taken from the key pointer at @slot, the owner from the
 * parent's header, and the child level is one below the parent's level.
 */
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{
	struct btrfs_fs_info *fs_info = node->fs_info;
	int child_level = btrfs_header_level(node) - 1;

	btrfs_readahead_tree_block(fs_info, btrfs_node_blockptr(node, slot),
				   btrfs_header_owner(node),
				   btrfs_node_ptr_generation(node, slot),
				   child_level);
}
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.73 Sekunden
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.