/* * If we didn't get into open_ctree our allocated_ebs will not be * initialized, so just skip this.
*/ if (!fs_info->allocated_ebs.next) return;
WARN_ON(!list_empty(&fs_info->allocated_ebs));
spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list);
btrfs_err(fs_info, "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
} #else #define btrfs_leak_debug_add_eb(eb) do {} while (0) #define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif
/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	/* The bio currently being assembled; NULL when none is in flight. */
	struct btrfs_bio *bbio;
	/* Last byte contained in bbio + 1 . */
	loff_t next_file_offset;
	enum btrfs_compression_type compress_type;
	/* How many bytes are left before the stripe/ordered extent boundary. */
	u32 len_to_oe_boundary;
	/* Block layer operation flags used for newly allocated bios. */
	blk_opf_t opf;
	btrfs_bio_end_io_t end_io_func;
	/* Non-NULL only for the writeback path. */
	struct writeback_control *wbc;

	/*
	 * The sectors of the page which are going to be submitted by
	 * extent_writepage_io().
	 * This is to avoid touching ranges covered by compression/inline.
	 */
	unsigned long submit_bitmap;
	struct readahead_control *ractl;

	/*
	 * The start offset of the last used extent map by a read operation.
	 *
	 * This is for proper compressed read merge.
	 * U64_MAX means we are starting the read and have made no progress yet.
	 *
	 * The current btrfs_bio_is_contig() only uses disk_bytenr as
	 * the condition to check if the read can be merged with previous
	 * bio, which is not correct. E.g. two file extents pointing to the
	 * same extent but with different offset.
	 *
	 * So here we need to do extra checks to only merge reads that are
	 * covered by the same extent map.
	 * Just extent_map::start will be enough, as they are unique
	 * inside the same inode.
	 */
	u64 last_em_start;
};
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
}
/*
 * Submit or fail the current bio in the bio_ctrl structure.
 *
 * @ret: 0 to submit the pending bio normally, a negative errno to fail it.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
	struct btrfs_bio *bbio = bio_ctrl->bbio;

	/* Nothing pending, nothing to do. */
	if (!bbio)
		return;

	if (!ret) {
		submit_one_bio(bio_ctrl);
		return;
	}

	ASSERT(ret < 0);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
	/* The bio is owned by the end_io handler now */
	bio_ctrl->bbio = NULL;
}
/*
 * Create the slab cache backing all struct extent_buffer allocations.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init extent_buffer_init_cachep(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
						sizeof(struct extent_buffer),
						0, 0, NULL);

	return extent_buffer_cache ? 0 : -ENOMEM;
}
/* Tear down the extent buffer slab cache. */
void __cold extent_buffer_free_cachep(void)
{
	/*
	 * Make sure all delayed RCU frees have been flushed before we
	 * destroy the cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
}
/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes.
 *
 * @start:	The original start bytenr to search.
 *		Will store the extent range start bytenr.
 * @end:	The original end bytenr of the search range
 *		Will store the extent range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct folio *locked_folio,
u64 *start, u64 *end)
{ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; /* The sanity tests may not set a valid fs_info. */
u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end; bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0;
/* Caller should pass a valid @end to indicate the search range end */
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
ASSERT(!(orig_start >= folio_end(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again: /* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
/*
 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
 * return early without handling any dirty ranges.
 */
ASSERT(max_bytes >= fs_info->sectorsize);
/*
 * NOTE(review): the btrfs_find_delalloc_range() call and its "not found"
 * check appear to be missing here -- the early-return block below dangles.
 * Also note the mangled token "returnfalse" below. TODO: restore from the
 * full source file.
 */
/* @delalloc_end can be -1, never go beyond @orig_end */
*end = min(delalloc_end, orig_end);
btrfs_free_extent_state(cached_state); returnfalse;
}
/*
 * start comes from the offset of locked_folio. We have to lock
 * folios in order, so we can't process delalloc bytes before
 * locked_folio
 */
if (delalloc_start < *start)
delalloc_start = *start;
/*
 * make sure to limit the number of folios we try to lock down
 */
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
/* step two, lock all the folios after the folio that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /*
 * Some of the folios are gone, lets avoid looping by
 * shortening the size of the delalloc range we're searching.
 */
btrfs_free_extent_state(cached_state);
cached_state = NULL; if (!loops) {
max_bytes = fs_info->sectorsize;
loops = 1; goto again;
} else {
found = false; goto out_failed;
}
}
/* step three, lock the state bits for the whole range */
btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
/*
 * NOTE(review): the tail of this function (handling of a failed re-check,
 * storing *start/*end, the out_failed label and the final return) is not
 * visible in this chunk -- confirm against the full file.
 */
if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio); else
btrfs_folio_end_lock(fs_info, folio, start, len);
}
/* * After a write IO is done, we need to: * * - clear the uptodate bits on error * - clear the writeback bits in the extent tree for the range * - filio_end_writeback() if there is no more pending io for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO.
*/ staticvoid end_bbio_data_write(struct btrfs_bio *bbio)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize;
/* * After a data read IO is done, we need to: * * - clear the uptodate bits on error * - set the uptodate bits if things worked * - set the folio up to date if all extents in the tree are uptodate * - clear the lock bit in the extent tree * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO.
*/ staticvoid end_bbio_data_read(struct btrfs_bio *bbio)
{ struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi;
if (likely(uptodate)) {
u64 end = start + fi.length - 1;
loff_t i_size = i_size_read(inode);
/* * Zero out the remaining part if this range straddles * i_size. * * Here we should only zero the range inside the folio, * not touch anything else. * * NOTE: i_size is exclusive while end is inclusive and * folio_contains() takes PAGE_SIZE units.
*/ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
zero_start;
/* Update page status and unlock. */
end_folio_read(folio, uptodate, start, fi.length);
}
bio_put(bio);
}
/* * Populate every free slot in a provided array with folios using GFP_NOFS. * * @nr_folios: number of folios to allocate * @folio_array: the array to fill with folios; any existing non-NULL entries in * the array will be skipped * * Return: 0 if all folios were able to be allocated; * -ENOMEM otherwise, the partially allocated folios would be freed and * the array slots zeroed
*/ int btrfs_alloc_folio_array(unsignedint nr_folios, struct folio **folio_array)
{ for (int i = 0; i < nr_folios; i++) { if (folio_array[i]) continue;
folio_array[i] = folio_alloc(GFP_NOFS, 0); if (!folio_array[i]) goto error;
} return 0;
error: for (int i = 0; i < nr_folios; i++) { if (folio_array[i])
folio_put(folio_array[i]);
} return -ENOMEM;
}
/*
 * Populate every free slot in a provided array with pages, using GFP_NOFS.
 *
 * @nr_pages:   number of pages to allocate
 * @page_array: the array to fill with pages; any existing non-null entries in
 *              the array will be skipped
 * @nofail:     whether using __GFP_NOFAIL flag
 *
 * Return: 0 if all pages were able to be allocated;
 *         -ENOMEM otherwise, the partially allocated pages would be freed and
 *         the array slots zeroed
 */
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, bool nofail)
{
	const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
	unsigned int filled = 0;

	while (filled < nr_pages) {
		const unsigned int before = filled;

		filled = alloc_pages_bulk(gfp, nr_pages, page_array);
		/* No forward progress means allocation failed: clean up. */
		if (unlikely(filled == before)) {
			for (unsigned int i = 0; i < filled; i++) {
				__free_page(page_array[i]);
				page_array[i] = NULL;
			}
			return -ENOMEM;
		}
	}
	return 0;
}
/*
 * Populate needed folios for the extent buffer.
 *
 * For now, the folios populated are always in order 0 (aka, single page).
 */
static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
{
	struct page *pages[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
	const int nr_pages = num_extent_pages(eb);
	int ret = btrfs_alloc_page_array(nr_pages, pages, nofail);

	if (ret < 0)
		return ret;

	/* Each backing page becomes one order-0 folio of the eb. */
	for (int i = 0; i < nr_pages; i++)
		eb->folios[i] = page_folio(pages[i]);
	eb->folio_size = PAGE_SIZE;
	eb->folio_shift = PAGE_SHIFT;
	return 0;
}
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* * For compression, all IO should have its logical bytenr set * to the starting bytenr of the compressed extent.
*/ return bio->bi_iter.bi_sector == sector;
}
/* * To merge into a bio both the disk sector and the logical offset in * the file need to be contiguous.
*/ return bio_ctrl->next_file_offset == file_offset &&
bio_end_sector(bio) == sector;
}
/* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have * to always be set on the last added/replaced device. * This is a bit odd but has been like that for a long time.
*/
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
}
}
/*
 * Add a range of a folio to the bio being assembled.
 *
 * @bio_ctrl:    bio build context (holds the bio being assembled)
 * @disk_bytenr: logical bytenr where the write will be
 * @folio:       folio to add to the bio
 * @size:        portion of the folio that we want to submit
 * @pg_offset:   offset inside the folio where the range starts
 *
 * This will either add the range into the existing @bio_ctrl->bbio, or
 * allocate a new one in @bio_ctrl->bbio.
 * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
				u64 disk_bytenr, struct folio *folio,
				size_t size, unsigned long pg_offset)
{
	struct btrfs_inode *inode = folio_to_inode(folio);
	loff_t file_offset = folio_pos(folio) + pg_offset;

	/* A non-contiguous range must go into a fresh bio. */
	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
		submit_one_bio(bio_ctrl);

	do {
		u32 len = size;

		/* Allocate new bio if needed */
		if (!bio_ctrl->bbio)
			alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);

		/* Cap to the current ordered extent boundary if there is one. */
		if (len > bio_ctrl->len_to_oe_boundary) {
			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
			ASSERT(is_data_inode(inode));
			len = bio_ctrl->len_to_oe_boundary;
		}

		if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
			/* bio full: move on to a new one */
			submit_one_bio(bio_ctrl);
			continue;
		}
		bio_ctrl->next_file_offset += len;

		if (bio_ctrl->wbc)
			wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);

		/*
		 * FIX: advance all the cursors by the amount just added.
		 * Without this the do-while below can never terminate
		 * (@size would stay unchanged) and the same range would be
		 * added to the bio over and over again.
		 */
		size -= len;
		pg_offset += len;
		disk_bytenr += len;
		file_offset += len;

		/*
		 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
		 * sector aligned. alloc_new_bio() then sets it to the end of
		 * our ordered extent for writes into zoned devices.
		 *
		 * When len_to_oe_boundary is tracking an ordered extent, we
		 * trust the ordered extent code to align things properly, and
		 * the check above to cap our write to the ordered extent
		 * boundary is correct.
		 *
		 * When len_to_oe_boundary is U32_MAX, the cap above would
		 * result in a 4095 byte IO for the last folio right before
		 * we hit the bio limit of UINT_MAX. bio_add_folio() has all
		 * the checks required to make sure we don't overflow the bio,
		 * and we should just ignore len_to_oe_boundary completely
		 * unless we're using it to track an ordered extent.
		 *
		 * It's pretty hard to make a bio sized U32_MAX, but it can
		 * happen when the page cache is able to feed us contiguous
		 * folios for large extents.
		 */
		if (bio_ctrl->len_to_oe_boundary != U32_MAX)
			bio_ctrl->len_to_oe_boundary -= len;

		/* Ordered extent boundary: move on to a new bio. */
		if (bio_ctrl->len_to_oe_boundary == 0)
			submit_one_bio(bio_ctrl);
	} while (size);
}
/*
 * Attach @eb (or its preallocated subpage state) as the private data of
 * @folio so the folio can be tracked as holding extent buffer metadata.
 *
 * Returns 0 on success, or a negative errno if the subpage state allocation
 * fails.
 */
static int attach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio,
				      struct btrfs_folio_state *prealloc)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	/*
	 * If the folio is mapped to the btree inode, we should hold the
	 * private lock to prevent races.
	 * For cloned or dummy extent buffers, their folios are not mapped and
	 * will not race with any other ebs.
	 */
	if (folio->mapping)
		lockdep_assert_held(&folio->mapping->i_private_lock);

	if (!btrfs_meta_is_subpage(fs_info)) {
		if (folio_test_private(folio))
			WARN_ON(folio_get_private(folio) != eb);
		else
			folio_attach_private(folio, eb);
		return 0;
	}

	/* Already mapped, just free prealloc */
	if (folio_test_private(folio)) {
		btrfs_free_folio_state(prealloc);
		return 0;
	}

	/* Has preallocated memory for subpage */
	if (prealloc) {
		folio_attach_private(folio, prealloc);
		return 0;
	}

	/* Do new allocation to attach subpage */
	return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
}
/*
 * Ensure @folio has btrfs private state attached so per-block status can be
 * tracked. The folio must already be inserted into an address space.
 */
int set_folio_extent_mapped(struct folio *folio)
{ struct btrfs_fs_info *fs_info;
/* Folio must belong to a mapping before we can attach state to it. */
ASSERT(folio->mapping);
/* Private state already attached, nothing to do. */
if (folio_test_private(folio)) return 0;
fs_info = folio_to_fs_info(folio);
/* Subpage case: attach a btrfs_folio_state to track per-block status. */
if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
/*
 * NOTE(review): the non-subpage tail of this function (attaching the plain
 * private marker and the final return) is not visible in this chunk -- as
 * shown, control falls off the end. Confirm against the full file.
 */
if (zero_offset)
folio_zero_range(folio, zero_offset,
folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio); for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; unsignedlong pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) {
folio_zero_range(folio, pg_offset, end - cur + 1);
end_folio_read(folio, true, cur, end - cur + 1); break;
} if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
end_folio_read(folio, true, cur, blocksize); continue;
}
em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em);
}
extent_offset = cur - em->start;
BUG_ON(btrfs_extent_map_end(em) <= cur);
BUG_ON(end < cur);
compress_type = btrfs_extent_map_compression(em);
/* * Only expand readahead for extents which are already creating * the pages anyway in add_ra_bio_pages, which is compressed * extents in the non subpage case.
*/ if (bio_ctrl->ractl &&
!btrfs_is_subpage(fs_info, folio) &&
compress_type != BTRFS_COMPRESS_NONE)
btrfs_readahead_expand(bio_ctrl->ractl, em);
/* * If we have a file range that points to a compressed extent * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a * single bio to populate the folios for the 2 ranges because * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout * [0 - 8K] [8K - 24K] * | | * | | * points to extent X, points to extent X, * offset 4K, length of 8K offset 0, length 16K * * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the * uncompressed data of our extent and different lengths. This * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent).
*/ if (compress_type != BTRFS_COMPRESS_NONE &&
bio_ctrl->last_em_start != U64_MAX &&
bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
bio_ctrl->last_em_start = em->start;
btrfs_free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) {
folio_zero_range(folio, pg_offset, blocksize);
end_folio_read(folio, true, cur, blocksize); continue;
} /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) {
end_folio_read(folio, true, cur, blocksize); continue;
}
if (bio_ctrl->compress_type != compress_type) {
submit_one_bio(bio_ctrl);
bio_ctrl->compress_type = compress_type;
}
/* * Check if we can skip waiting the @ordered extent covering the block at @fileoff. * * @fileoff: Both input and output. * Input as the file offset where the check should start at. * Output as where the next check should start at, * if the function returns true. * * Return true if we can skip to @fileoff. The caller needs to check the new * @fileoff value to make sure it covers the full range, before skipping the * full OE. * * Return false if we must wait for the ordered extent.
*/ staticbool can_skip_one_ordered_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered,
u64 *fileoff)
{ conststruct btrfs_fs_info *fs_info = inode->root->fs_info; struct folio *folio; const u32 blocksize = fs_info->sectorsize;
u64 cur = *fileoff; bool ret;
folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
/* * We should have locked the folio(s) for range [start, end], thus * there must be a folio and it must be locked.
*/
ASSERT(!IS_ERR(folio));
ASSERT(folio_test_locked(folio));
/* * There are several cases for the folio and OE combination: * * 1) Folio has no private flag * The OE has all its IO done but not yet finished, and folio got * invalidated. * * Have we have to wait for the OE to finish, as it may contain the * to-be-inserted data checksum. * Without the data checksum inserted into the csum tree, read will * just fail with missing csum.
*/ if (!folio_test_private(folio)) {
ret = false; goto out;
}
/* * 2) The first block is DIRTY. * * This means the OE is created by some other folios whose file pos is * before this one. And since we are holding the folio lock, the writeback * of this folio cannot start. * * We must skip the whole OE, because it will never start until we * finished our folio read and unlocked the folio.
*/ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
u64 range_len = min(folio_end(folio),
ordered->file_offset + ordered->num_bytes) - cur;
ret = true; /* * At least inside the folio, all the remaining blocks should * also be dirty.
*/
ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
*fileoff = ordered->file_offset + ordered->num_bytes; goto out;
}
/* * 3) The first block is uptodate. * * At least the first block can be skipped, but we are still not fully * sure. E.g. if the OE has some other folios in the range that cannot * be skipped. * So we return true and update @next_ret to the OE/folio boundary.
*/ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
u64 range_len = min(folio_end(folio),
ordered->file_offset + ordered->num_bytes) - cur;
/* * The whole range to the OE end or folio boundary should also * be uptodate.
*/
ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
ret = true;
*fileoff = cur + range_len; goto out;
}
/* * 4) The first block is not uptodate. * * This means the folio is invalidated after the writeback was finished, * but by some other operations (e.g. block aligned buffered write) the * folio is inserted into filemap. * Very much the same as case 1).
*/
ret = false;
out:
folio_put(folio); return ret;
}
/*
 * Locking helper to make sure we get a stable view of extent maps for the
 * involved range.
 *
 * This is for folio read paths (read and readahead), thus the involved range
 * should have all the folios locked.
 */
staticvoid lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state)
{
u64 cur_pos;
/* Caller must provide a valid @cached_state. */
ASSERT(cached_state);
/* The range must at least be page aligned, as all read paths are folio based. */
ASSERT(IS_ALIGNED(start, PAGE_SIZE));
ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
/*
 * NOTE(review): the retry label, the btrfs_lock_extent() call, the
 * initialization of @cur_pos, and the while-loop head (including the
 * declaration of @ordered used below) appear to be missing here. Also note
 * the mangled "staticvoid" token above. TODO: restore from the full file.
 */
ordered = btrfs_lookup_ordered_range(inode, cur_pos,
end - cur_pos + 1); /*
 * No ordered extents in the range, and we hold the extent lock,
 * no one can modify the extent maps in the range, we're safe to return.
 */
if (!ordered) break;
/* Check if we can skip waiting for the whole OE. */ if (can_skip_ordered_extent(inode, ordered, start, end)) {
cur_pos = min(ordered->file_offset + ordered->num_bytes,
end + 1);
btrfs_put_ordered_extent(ordered); continue;
}
/* Now wait for the OE to finish. */
btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
btrfs_put_ordered_extent(ordered); /* We have unlocked the whole range, restart from the beginning. */ goto again;
}
}
/*
 * Do all of the delayed allocation setup.
 *
 * Return >0 if all the dirty blocks are submitted async (compression) or inlined.
 * The @folio should no longer be touched (treat it as already unlocked).
 *
 * Return 0 if there is still dirty block that needs to be submitted through
 * extent_writepage_io().
 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
 * submitted, and @folio is still kept locked.
 *
 * Return <0 if there is any error hit.
 * Any allocated ordered extent range covering this folio will be marked
 * finished (IOERR), and @folio is still kept locked.
 */
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{ struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; constbool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; constunsignedint blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsignedlong delalloc_bitmap = 0; /*
 * Save the last found delalloc end. As the delalloc end can go beyond
 * page boundary, thus we cannot rely on subpage bitmap to locate the
 * last delalloc end.
 */
u64 last_delalloc_end = 0; /*
 * The range end (exclusive) of the last successfully finished delalloc
 * range.
 * Any range covered by ordered extent must either be manually marked
 * finished (error handling), or has IO submitted (and finish the
 * ordered extent normally).
 *
 * This records the end of ordered extent cleanup if we hit an error.
 */
u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0; int ret = 0; int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, folio)) {
ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
/*
 * NOTE(review): the loop that finds and locks the delalloc ranges of this
 * folio (populating @delalloc_bitmap and @last_delalloc_end, and resetting
 * @delalloc_start) appears to be missing here. Mangled tokens such as
 * "constbool"/"unsignedlong" above also suggest extraction damage.
 * TODO: restore from the full file.
 */
/* Run the delalloc ranges for the above locked ranges. */ while (delalloc_start < page_end) {
u64 found_start;
u32 found_len; bool found;
if (!is_subpage) { /*
 * For non-subpage case, the found delalloc range must
 * cover this folio and there must be only one locked
 * delalloc range.
 */
found_start = page_start;
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len);
} if (!found) break; /*
 * The subpage range covers the last sector, the delalloc range may
 * end beyond the folio boundary, use the saved delalloc_end
 * instead.
 */
if (found_start + found_len >= page_end)
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) { /*
 * Some delalloc range may be created by previous folios.
 * Thus we still need to clean up this range during error
 * handling.
 */
last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc); if (ret >= 0)
last_finished_delalloc_end = found_start + found_len; if (unlikely(ret < 0))
btrfs_err_rl(fs_info, "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
btrfs_root_id(inode->root),
btrfs_ino(inode),
folio_pos(folio),
blocks_per_folio,
&bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else { /*
 * We've hit an error during previous delalloc range,
 * have to cleanup the remaining locked ranges.
 */
btrfs_unlock_extent(&inode->io_tree, found_start,
found_start + found_len - 1, NULL);
unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
}
/*
 * We have some ranges that's going to be submitted asynchronously
 * (compression or inline). These range have their own control
 * on when to unlock the pages. We should not touch them
 * anymore, so clear the range from the submission bitmap.
 */
if (ret > 0) { unsignedint start_bit = (found_start - page_start) >>
fs_info->sectorsize_bits; unsignedint end_bit = (min(page_end + 1, found_start + found_len) -
page_start) >> fs_info->sectorsize_bits;
bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
} /*
 * Above btrfs_run_delalloc_range() may have unlocked the folio,
 * thus for the last range, we cannot touch the folio anymore.
 */
if (found_start + found_len >= last_delalloc_end + 1) break;
delalloc_start = found_start + found_len;
} /*
 * It's possible we had some ordered extents created before we hit
 * an error, cleanup non-async successfully created delalloc ranges.
 */
if (unlikely(ret < 0)) { unsignedint bitmap_size = min(
(last_finished_delalloc_end - page_start) >>
fs_info->sectorsize_bits,
blocks_per_folio);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
page_start + (bit << fs_info->sectorsize_bits),
fs_info->sectorsize, false); return ret;
}
/* NOTE(review): no goto targeting this label is visible in this chunk. */
out: if (last_delalloc_end)
delalloc_end = last_delalloc_end; else
delalloc_end = page_end; /*
 * delalloc_end is already one less than the total length, so
 * we don't subtract one from PAGE_SIZE.
 */
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
/*
 * If all ranges are submitted asynchronously, we just need to account
 * for them here.
 */
if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write; return 1;
}
if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192;
/*
 * NOTE(review): the function is truncated here mid-statement; the rest of
 * the nr_to_write clamping and the final return are not visible in this
 * chunk.
 */
/*
 * Submit one sector (block) of the folio for writeback.
 *
 * Return 0 if we have submitted or queued the sector for submission.
 * Return <0 for critical errors, and the sector will have its dirty flag cleared.
 *
 * Caller should make sure filepos < i_size and handle filepos >= i_size case.
 */
staticint submit_one_sector(struct btrfs_inode *inode, struct folio *folio,
u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em;
u64 block_start;
u64 disk_bytenr;
u64 extent_offset;
u64 em_end; const u32 sectorsize = fs_info->sectorsize;
/* Submission is always block aligned. */
ASSERT(IS_ALIGNED(filepos, sectorsize));
/* @filepos >= i_size case should be handled by the caller. */
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) { /*
 * When submission failed, we should still clear the folio dirty.
 * Or the folio will be written back again but without any
 * ordered extent.
 */
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); return PTR_ERR(em);
}
/*
 * Although the PageDirty bit is cleared before entering this
 * function, subpage dirty bit is not cleared.
 * So clear subpage dirty bit here so next time we won't submit
 * a folio for a range already written to disk.
 */
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /*
 * Above call should set the whole folio with writeback flag, even
 * just for a single subpage sector.
 * As long as the folio is properly locked and the range is correct,
 * we should always get the folio with writeback flag.
 */
ASSERT(folio_test_writeback(folio));
/*
 * NOTE(review): the tail of this function (reading the extent map fields
 * into @block_start/@disk_bytenr/@extent_offset/@em_end, freeing the em,
 * the submit_extent_folio() call and the final return) is not visible in
 * this chunk. Also note the mangled "staticint" token above. Confirm
 * against the full file.
 */
/*
 * Helper for extent_writepage(). This calls the writepage start hooks,
 * and does the loop to map the page into extents and bios.
 *
 * We return 1 if the IO is started and the page is unlocked,
 * 0 if all went well (page still locked)
 * < 0 if there were errors (page still locked)
 */
static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct folio *folio,
u64 start, u32 len, struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; unsignedlong range_bitmap = 0; bool submitted_io = false; int found_error = 0; const u64 folio_start = folio_pos(folio); constunsignedint blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur; int bit; int ret = 0;
/*
 * NOTE(review): the setup between the declarations above and the loop body
 * below (building @range_bitmap, intersecting it with the submit bitmap,
 * setting bio_ctrl->end_io_func, and the for_each_set_bit() loop head that
 * derives @cur from @bit) appears to be missing from this chunk. Mangled
 * tokens such as "unsignedlong" also indicate extraction damage. TODO:
 * restore from the full file.
 */
if (cur >= i_size) {
btrfs_mark_ordered_io_finished(inode, folio, cur,
start + len - cur, true); /*
 * This range is beyond i_size, thus we don't need to
 * bother writing back.
 * But we still need to clear the dirty subpage bit, or
 * the next time the folio gets dirtied, we will try to
 * writeback the sectors with subpage dirty bits,
 * causing writeback without ordered extent.
 */
btrfs_folio_clear_dirty(fs_info, folio, cur,
start + len - cur); break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (unlikely(ret < 0)) { /*
 * bio_ctrl may contain a bio crossing several folios.
 * Submit it immediately so that the bio has a chance
 * to finish normally, other than marked as error.
 */
submit_one_bio(bio_ctrl); /*
 * Failed to grab the extent map which should be very rare.
 * Since there is no bio submitted to finish the ordered
 * extent, we have to manually finish this sector.
 */
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false); if (!found_error)
found_error = ret; continue;
}
submitted_io = true;
}
/*
 * If we didn't submitted any sector (>= i_size), folio dirty get
 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
 * by folio_start_writeback() if the folio is not dirty).
 *
 * Here we set writeback and clear for the range. If the full folio
 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
 *
 * If we hit any error, the corresponding sector will have its dirty
 * flag cleared and writeback finished, thus no need to handle the error case.
 */
if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
} return found_error;
}
/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 *
 * Return 0 if everything goes well.
 * Return <0 for error.
 */
static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;
	size_t pg_offset;
	loff_t i_size = i_size_read(&inode->vfs_inode);
	const pgoff_t end_index = i_size >> PAGE_SHIFT;
	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);

	/*
	 * Zero the part of the folio beyond i_size.  That range is never
	 * written back and must not keep stale data around.
	 */
	pg_offset = offset_in_folio(folio, i_size);
	if (folio_contains(folio, end_index))
		folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);

	/*
	 * Default to unlock the whole folio.
	 * The proper bitmap can only be initialized until writepage_delalloc().
	 */
	bio_ctrl->submit_bitmap = (unsigned long)-1;

	/*
	 * If the page is dirty but without private set, it's marked dirty
	 * without informing the fs.
	 * Nowadays that is a bug, since the introduction of
	 * pin_user_pages*().
	 *
	 * So here we check if the page has private set to rule out such
	 * case.
	 * But we also have a long history of relying on the COW fixup,
	 * so here we only enable this check for experimental builds until
	 * we're sure it's safe.
	 */
	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
	    unlikely(!folio_test_private(folio))) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		btrfs_err_rl(fs_info,
	"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
			     btrfs_root_id(inode->root),
			     btrfs_ino(inode), folio_pos(folio));
		ret = -EUCLEAN;
		goto done;
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0)
		goto done;

	/* Returns 1 when async submission took ownership of the folio. */
	ret = writepage_delalloc(inode, folio, bio_ctrl);
	if (ret == 1)
		return 0;
	if (ret)
		goto done;

	ret = extent_writepage_io(inode, folio, folio_pos(folio),
				  folio_size(folio), bio_ctrl, i_size);
	if (ret == 1)
		return 0;
	if (ret < 0)
		btrfs_err_rl(fs_info,
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
			     btrfs_root_id(inode->root), btrfs_ino(inode),
			     folio_pos(folio), blocks_per_folio,
			     &bio_ctrl->submit_bitmap, ret);

	bio_ctrl->wbc->nr_to_write--;

done:
	if (ret < 0)
		mapping_set_error(folio->mapping, ret);
	/*
	 * Only unlock ranges that are submitted. As there can be some async
	 * submitted ranges inside the folio.
	 */
	btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
	ASSERT(ret <= 0);
	return ret;
}
/*
 * NOTE(review): this span appears damaged by extraction: the code below
 * fuses lock_extent_buffer_for_io() with what looks like the body of a
 * separate btree write-error helper (clearing EXTENT_BUFFER_UPTODATE and
 * setting BTRFS_FS_*_ERR flags), and the function's unlock/return tail
 * seems to be missing.  Restore from upstream before relying on it;
 * only comments were added here.
 */
/* * Lock extent buffer status and pages for writeback. * * Return %false if the extent buffer doesn't need to be submitted (e.g. the * extent buffer is not dirty) * Return %true is the extent buffer is submitted to bio.
*/ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, struct writeback_control *wbc)
{ struct btrfs_fs_info *fs_info = eb->fs_info; bool ret = false;
btrfs_tree_lock(eb); while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb); if (wbc->sync_mode != WB_SYNC_ALL) returnfalse;
wait_on_extent_buffer_writeback(eb);
btrfs_tree_lock(eb);
}
/* Past this point the eb is locked and not under writeback. */
/* * We need to do this to prevent races in people who check if the eb is * under IO since we can end up having no IO bits set for a short period * of time.
*/
spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
/*
 * NOTE(review): upstream at this point sets EXTENT_BUFFER_WRITEBACK and
 * re-tags the buffer_tree xarray entry; those statements appear to be
 * missing from this extracted copy — TODO confirm against upstream.
 */
XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsignedlong flags;
/* * A read may stumble upon this buffer later, make sure that it gets an * error and knows there was an error.
*/
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
/* * We need to set the mapping with the io error as well because a write * error will flip the file system readonly, and then syncfs() will * return a 0 because we are readonly if we don't modify the err seq for * the superblock.
*/
mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);
/* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's * committing (when we call filemap_fdata[write|wait]_range against * the btree inode), we might have * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it * returns an error or an error happens during writeback, when we're * committing the transaction we wouldn't know about it, since the pages * can be no longer dirty nor marked anymore for writeback (if a * subsequent modification to the extent buffer didn't happen before the * transaction commit), which makes filemap_fdata[write|wait]_range not * able to find the pages which contain errors at transaction * commit time. So if this happens we must abort the transaction, * otherwise we commit a super block with btree roots that point to * btree nodes/leafs whose content on disk is invalid - either garbage * or the content of some node/leaf from a past generation that got * cowed or deleted and is no longer valid. * * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would * not be enough - we need to distinguish between log tree extents vs * non-log tree extents, and the next filemap_fdatawait_range() call * will catch and clear such errors in the mapping - and that call might * be from a log sync and not from a transaction commit. Also, checking * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is * not done and would not be reliable - the eb might have been released * from memory and reading it back again means that flag would not be * set (since it's a runtime flag, not persisted on disk). 
* * Using the flags below in the btree inode also makes us achieve the * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started * writeback for all dirty pages and before filemap_fdatawait_range() * is called, the writeback for all dirty pages had already finished * with errors - because we were not using AS_EIO/AS_ENOSPC, * filemap_fdatawait_range() would return success, as it could not know * that writeback errors happened (the pages were no longer tagged for * writeback).
*/ switch (eb->log_index) { case -1:
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); break; case 0:
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); break; case 1:
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); break; default:
BUG(); /* unexpected, logic error */
}
}
/*
 * NOTE(review): upstream ends the function with btrfs_tree_unlock(eb)
 * and "return ret;" — not visible in this copy; verify before use.
 */
rcu_read_lock(); while ((eb = find_get_eb(&xas, end, tag)) != NULL) { if (!eb_batch_add(batch, eb)) {
*start = ((eb->start + eb->len) >> fs_info->nodesize_bits); goto out;
}
} if (end == ULONG_MAX)
*start = ULONG_MAX; else
*start = end + 1;
out:
rcu_read_unlock();
return batch->nr;
}
/* * The endio specific version which won't touch any unsafe spinlock in endio * context.
*/ staticstruct extent_buffer *find_extent_buffer_nolock( struct btrfs_fs_info *fs_info, u64 start)
{ struct extent_buffer *eb; unsignedlong index = (start >> fs_info->nodesize_bits);
/*
 * NOTE(review): extraneous trailing text (a German website disclaimer)
 * that does not belong in this source file.  English translation kept
 * for reference:
 * "The information on this website has been compiled carefully to the
 * best of our knowledge.  However, neither completeness, correctness,
 * nor quality of the provided information is guaranteed.
 * Remark: the colored syntax rendering and the measurement are still
 * experimental."
 */