/*
 * Context carried through a rename operation so the callee can report
 * information about the old directory entry back to the caller.
 */
struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */
u64 index;
};
/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */ struct data_reloc_warn { struct btrfs_path path; struct btrfs_fs_info *fs_info;
/* Size of the extent item being inspected — not read in the visible code. */
u64 extent_item_size;
/* Logical address and mirror number reported in the warning message. */
u64 logical; int mirror_num;
};
/* * For the file_extent_tree, we want to hold the inode lock when we lookup and * update the disk_i_size, but lockdep will complain because our io_tree we hold * the tree lock and get the inode lock when setting delalloc. These two things * are unrelated, so make a class for the file_extent_tree so we don't get the * two locking patterns mixed up.
*/ staticstruct lock_class_key file_extent_tree_class;
nofs_flag = memalloc_nofs_save();
ipath = init_ipath(4096, local_root, &warn->path);
memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) {
btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL; /* * -ENOMEM, not a critical error, just output an generic error * without filename.
*/
btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
warn->logical, warn->mirror_num, root, inum, offset); return ret;
}
ret = paths_from_inode(inum, ipath); if (ret < 0) {
btrfs_put_root(local_root); goto err;
}
/* * We deliberately ignore the bit ipath might have been too small to * hold all of the paths here
*/ for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
warn->logical, warn->mirror_num, root, inum, offset,
fs_info->sectorsize, nlink,
(char *)(unsignedlong)ipath->fspath->val[i]);
}
ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); if (ret < 0) {
btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
logical, ret); return;
}
eb = path.nodes[0];
ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size(eb, path.slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsignedlong ptr = 0;
u64 ref_root;
u8 ref_level;
while (true) {
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level); if (ret < 0) {
btrfs_warn_rl(fs_info, "failed to resolve tree backref for logical %llu: %d",
logical, ret); break;
} if (ret > 0) break;
/* For data reloc tree, it's better to do a backref lookup instead. */ if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/ int btrfs_inode_lock(struct btrfs_inode *inode, unsignedint ilock_flags)
{ if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0;
}
inode_lock_shared(&inode->vfs_inode);
} else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0;
}
inode_lock(&inode->vfs_inode);
} if (ilock_flags & BTRFS_ILOCK_MMAP)
down_write(&inode->i_mmap_lock); return 0;
}
/* * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive.
*/ void btrfs_inode_unlock(struct btrfs_inode *inode, unsignedint ilock_flags)
{ if (ilock_flags & BTRFS_ILOCK_MMAP)
up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(&inode->vfs_inode); else
inode_unlock(&inode->vfs_inode);
}
/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 u64 offset, u64 bytes)
{
	pgoff_t index = offset >> PAGE_SHIFT;
	const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct folio *folio;

	while (index <= end_index) {
		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
		if (IS_ERR(folio)) {
			/* Folio not present in the page cache, skip it. */
			index++;
			continue;
		}
		/* Advance past this (possibly large) folio. */
		index = folio_end(folio) >> PAGE_SHIFT;
		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then btrfs_mark_ordered_io_finished() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
						offset, bytes);
		folio_put(folio);
	}
	/*
	 * NOTE(review): this function body ended without a closing brace in
	 * this file; restored it here. Per the comment above, upstream
	 * presumably also finishes the ordered extents for the range after
	 * this loop — TODO confirm against the original source.
	 */
}
/*
 * Initialize security attributes for a newly created inode: apply the
 * default and access ACLs (if present) and then the security xattrs.
 *
 * Returns 0 on success or a negative errno from ACL/xattr setup.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct btrfs_new_inode_args *args)
{
	int err;

	if (args->default_acl) {
		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
				      ACL_TYPE_DEFAULT);
		if (err)
			return err;
	}

	if (args->acl) {
		err = __btrfs_set_acl(trans, args->inode, args->acl,
				      ACL_TYPE_ACCESS);
		if (err)
			return err;
	}

	/* No ACLs at all: remember that so future lookups skip ACL xattrs. */
	if (!args->default_acl && !args->acl)
		cache_no_acl(args->inode);

	return btrfs_xattr_security_init(trans, args->inode, args->dir,
					 &args->dentry->d_name);
}
/*
 * This does all the hard work for inserting an inline extent into the btree.
 * The caller should have done a btrfs_drop_extents so that no overlapping
 * inline items exist in the btree.
 *
 * NOTE(review): the body below appears to be a garbled splice — several
 * declared locals (leaf, kaddr, ptr, ei, key, datasize) are never used in the
 * visible code, the "fail" label targeted by the goto is missing, and the
 * trailing checks (offset/fs_info/data_len, return true/false) reference
 * variables not declared here and look like they belong to a separate
 * boolean eligibility helper. TODO: compare against the original source.
 */ staticint insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size)
{ struct btrfs_root *root = inode->root; struct extent_buffer *leaf; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsignedlong ptr; struct btrfs_file_extent_item *ei; int ret;
size_t cur_size = size;
u64 i_size;
/* * The decompressed size must still be no larger than a sector. Under * heavy race, we can have size == 0 passed in, but that shouldn't be a * big deal and we can continue the insertion.
*/
ASSERT(size <= sectorsize);
/* * The compressed size also needs to be no larger than a sector. * That's also why we only need one page as the parameter.
*/ if (compressed_folio)
ASSERT(compressed_size <= sectorsize); else
ASSERT(compressed_size == 0);
/* Prefer the compressed length when the data is actually compressed. */
if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) { struct btrfs_key key;
size_t datasize;
/* * We align size to sectorsize for inline extents just for simplicity * sake.
*/
ret = btrfs_inode_set_file_extent_range(inode, 0,
ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail;
/* * We're an inline extent, so nobody can extend the file past i_size * without locking a page we already have locked. * * We must do any i_size and inode updates before we unlock the pages. * Otherwise we could end up racing with unlink.
*/
i_size = i_size_read(&inode->vfs_inode); if (update_i_size && size > i_size) {
i_size_write(&inode->vfs_inode, size);
i_size = size;
}
inode->disk_i_size = i_size;
/*
 * NOTE(review): from here on, the code references variables (offset,
 * fs_info, data_len) that are not declared in this function — this block
 * likely belongs to can_cow_file_range_inline().
 */
/* Inline extents must start at offset 0. */ if (offset != 0) returnfalse;
/* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) returnfalse;
/* We do not allow a non-compressed extent to be as large as block size. */ if (data_len >= fs_info->sectorsize) returnfalse;
/* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) returnfalse;
/* We cannot exceed the user specified max_inline size. */ if (data_len > fs_info->max_inline) returnfalse;
/* Inline extents must be the entirety of the file. */ if (size < i_size_read(&inode->vfs_inode)) returnfalse;
returntrue;
}
/*
 * conditionally insert an inline extent into the file. This does the checks
 * required to make sure the data is small enough to fit as an inline extent.
 *
 * If being used directly, you must have already checked we're allowed to cow
 * the range by getting true from can_cow_file_range_inline().
 *
 * Returns 0 on success, 1 when the insertion hit -ENOSPC (signal to fall
 * back to a regular extent), or a negative errno on other failures.
 *
 * NOTE(review): drop_args is zero-initialized but never passed to
 * btrfs_drop_extents() in the visible code, and data_len is computed but
 * unused — the dropping of overlapping extents appears to be missing here.
 * TODO: confirm against the original source.
 */ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size)
{ struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans;
u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
trans = btrfs_join_transaction(root); if (IS_ERR(trans)) {
btrfs_free_path(path); return PTR_ERR(trans);
}
trans->block_rsv = &inode->block_rsv;
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
compressed_folio, update_i_size); if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret); goto out;
} elseif (ret == -ENOSPC) {
ret = 1; goto out;
}
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret); goto out;
} elseif (ret == -ENOSPC) {
ret = 1; goto out;
}
/* NOTE(review): presumably flags the inode for a full sync on fsync — confirm. */
btrfs_set_inode_full_sync(inode);
out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. * And at reserve time, it's always aligned to page size, so * just free one page here.
*/
btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans); return ret;
}
/* * In the successful case (ret == 0 here), cow_file_range will return 1. * * Quite a bit further up the callstack in extent_writepage(), ret == 1 * is treated as a short circuited success and does not unlock the folio, * so we must do it here. * * In the failure case, the locked_folio does get unlocked by * btrfs_folio_end_all_writers, which asserts that it is still locked * at that point, so we must *not* unlock it here. * * The other two callsites in compress_file_range do not have a * locked_folio, so they are not relevant to this logic.
*/ if (ret == 0)
locked_folio = NULL;
/*
 * Decide whether the range [start, end] of @inode should be submitted to
 * compression, based on mount options, defragmentation requests, inode
 * flags/properties, or the compression heuristic.
 *
 * Returns 1 to compress, 0 to write out uncompressed.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!btrfs_inode_can_compress(inode)) {
		DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
		return 0;
	}

	/* Defrag ioctl takes precedence over mount options and properties. */
	if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
		return 0;
	if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
	    inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
		return 1;

	/* compress-force mount option wins over everything below. */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;

	/* The inode was flagged incompressible by earlier bad ratios. */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;

	/* Mount option, inode flag or property asks for it: run the heuristic. */
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}
/*
 * Queue the inode for auto-defrag when this is a small write inside EOF
 * (i.e. not a write starting at offset 0 that also reaches disk_i_size).
 */
static inline void inode_should_defrag(struct btrfs_inode *inode,
				       u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	if (num_bytes >= small_write)
		return;
	/* A write from offset 0 up to (or past) disk_i_size is not "inside eof". */
	if (start == 0 && end + 1 >= inode->disk_i_size)
		return;
	btrfs_add_inode_defrag(inode, small_write);
}
/*
 * Clear the dirty bits of every folio backing the range [start, end] so
 * mmap writers can't change the contents while we process them.
 *
 * Returns 0 on success, or the first folio-lookup error encountered
 * (remaining folios are still processed).
 */
static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
	const pgoff_t last = end >> PAGE_SHIFT;
	pgoff_t idx = start >> PAGE_SHIFT;
	int err = 0;

	while (idx <= last) {
		struct folio *f = filemap_get_folio(inode->vfs_inode.i_mapping, idx);

		idx++;
		if (IS_ERR(f)) {
			/* Remember only the first failure, keep going. */
			if (!err)
				err = PTR_ERR(f);
			continue;
		}
		btrfs_folio_clamp_clear_dirty(inode->root->fs_info, f, start,
					      end + 1 - start);
		folio_put(f);
	}
	return err;
}
/*
 * Work queue call back to started compression on a file and pages.
 *
 * This is done inside an ordered work queue, and the compression is spread
 * across many cpus. The actual IO submission is step two, and the ordered work
 * queue takes care of making sure that happens in the same order things were
 * put onto the queue by writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an entry onto the
 * work queue to write the uncompressed bytes. This makes sure that both
 * compressed inodes and uncompressed inodes are written in the same order that
 * the flusher thread sent them down.
 */ staticvoid compress_file_range(struct btrfs_work *work)
{ struct async_chunk *async_chunk =
container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping;
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size; int ret = 0; struct folio **folios; unsignedlong nr_folios; unsignedlong total_compressed = 0; unsignedlong total_in = 0; unsignedint poff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level;
/* Small writes inside EOF may be queued for auto-defrag. */
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/* * We need to call clear_page_dirty_for_io on each page in the range. * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them.
*/
ret = extent_range_clear_dirty_for_io(inode, start, end);
/* * All the folios should have been locked thus no failure. * * And even if some folios are missing, btrfs_compress_folios() * would handle them correctly, so here just do an ASSERT() check for * early logic errors.
*/
ASSERT(ret == 0);
/* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us.
*/
barrier();
i_size = i_size_read(&inode->vfs_inode);
barrier();
actual_end = min_t(u64, i_size, end + 1);
/* Restart point: each pass handles at most BTRFS_MAX_COMPRESSED_PAGES. */
again:
folios = NULL;
nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
nr_folios = min_t(unsignedlong, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner.
*/ if (actual_end <= start) goto cleanup_and_bail_uncompressed;
total_compressed = actual_end - start;
/* * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all.
*/ if (total_compressed <= blocksize &&
(start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed;
/* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios.
*/ if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed;
folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code.
*/ goto cleanup_and_bail_uncompressed;
}
/* Compression level is applied here. */
ret = btrfs_compress_folios(compress_type, compress_level,
mapping, start, folios, &nr_folios, &total_in,
&total_compressed); if (ret) goto mark_incompressible;
/* * Zero the tail end of the last page, as we might be sending it down * to disk.
*/
poff = offset_in_page(total_compressed); if (poff)
folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/* * Try to create an inline extent. * * If we didn't compress the entire range, try to create an uncompressed * inline extent, else a compressed one. * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case.
*/ if (total_in < actual_end)
ret = cow_file_range_inline(inode, NULL, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false); else
ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
compress_type, folios[0], false); if (ret <= 0) { if (ret < 0)
mapping_set_error(mapping, -EIO); goto free_pages;
}
/* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things.
*/
total_compressed = ALIGN(total_compressed, blocksize);
/* * One last check to make sure the compression is really a win, compare * the page count read with the blocks on disk, compression must free at * least one sector.
*/
total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) goto mark_incompressible;
/* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios.
*/
ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
nr_folios, compress_type);
BUG_ON(ret); if (start + total_in < end) {
start += total_in;
cond_resched(); goto again;
} return;
mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
/* Queue the whole remaining range as one uncompressed async extent. */
ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
BUG_ON(ret);
free_pages: if (folios) { for (i = 0; i < nr_folios; i++) {
WARN_ON(folios[i]->mapping);
btrfs_free_compr_folio(folios[i]);
}
kfree(folios);
}
}
/*
 * Release the compressed folios attached to @async_extent (if any) and
 * reset the folio bookkeeping so the extent no longer owns them.
 */
static void free_async_extent_pages(struct async_extent *async_extent)
{
	if (!async_extent->folios)
		return;

	for (int i = 0; i < async_extent->nr_folios; i++) {
		/* Compressed folios must not be attached to any mapping. */
		WARN_ON(async_extent->folios[i]->mapping);
		btrfs_free_compr_folio(async_extent->folios[i]);
	}

	kfree(async_extent->folios);
	async_extent->folios = NULL;
	async_extent->nr_folios = 0;
}
if (async_chunk->blkcg_css)
kthread_associate_blkcg(async_chunk->blkcg_css);
/* * If async_chunk->locked_folio is in the async_extent range, we need to * handle it.
*/ if (async_chunk->locked_folio) {
u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
u64 locked_folio_end = locked_folio_start +
folio_size(async_chunk->locked_folio) - 1;
if (!(start >= locked_folio_end || end <= locked_folio_start))
locked_folio = async_chunk->locked_folio;
}
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
0, *alloc_hint, &ins, 1, 1); if (ret) { /* * We can't reserve contiguous space for the compressed size. * Unlikely, but it's possible that we could have enough * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
free_pages = true; goto done;
}
btrfs_lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset;
file_extent.ram_bytes = async_extent->ram_size;
file_extent.num_bytes = async_extent->ram_size;
file_extent.offset = 0;
file_extent.compression = async_extent->compress_type;
em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) {
ret = PTR_ERR(em); goto out_free_reserve;
}
btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it.
*/ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
btrfs_free_extent_map(em);
em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
alloc_hint = btrfs_extent_map_block_start(em); if (em)
btrfs_free_extent_map(em);
} else {
alloc_hint = btrfs_extent_map_block_start(em);
btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
return alloc_hint;
}
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code. The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_folio is the folio that writepage had locked already. We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * When this function fails, it unlocks all pages except @locked_folio.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_folio and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_folio is
 * the only page handled anyway).
 *
 * When this function succeed and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_folio are unlocked.
 *
 * When a failure happens in the second or later iteration of the
 * while-loop, the ordered extents created in previous iterations are cleaned up.
 *
 * NOTE(review): the body below is visibly incomplete/garbled — num_bytes is
 * read before ever being assigned, there is a "continue" and a "break" with
 * no enclosing loop in sight, and file_extent/ordered are used without being
 * declared. The allocation loop that normally surrounds this code appears to
 * have been lost. TODO: restore from the original source before building.
 */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset, bool keep_locked, bool no_inline)
{ struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsignedlong page_ops; int ret = 0;
/* The free space inode must never take this COW path. */
if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL; goto out_unlock;
}
if (!no_inline) { /* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done * with this page and already handled the IO. * * If there was an error then cow_file_range_inline() has * already done the cleanup.
*/ if (ret == 0)
ret = 1; goto done;
}
}
/* * We're not doing compressed IO, don't unlock the first page (which * the caller expects to stay locked), don't clear any dirty bits and * don't set any writeback bits. * * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items.
*/ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes; else
min_alloc_size = fs_info->sectorsize;
ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned * file systems, which is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at * least one zone to finish before retrying the * allocation. Otherwise ask the caller to write out * the already allocated blocks before coming back to * us, or return -ENOSPC if it can't handle retries.
*/
ASSERT(btrfs_is_zoned(fs_info)); if (start == orig_start) {
wait_on_bit_io(&inode->root->fs_info->flags,
BTRFS_FS_NEED_ZONE_FINISH,
TASK_UNINTERRUPTIBLE); continue;
} if (done_offset) { /* * Move @end to the end of the processed range, * and exit the loop to unlock the processed extents.
*/
end = start - 1;
ret = 0; break;
}
ret = -ENOSPC;
} if (ret < 0) goto out_unlock;
cur_alloc_size = ins.offset;
/* * Locked range will be released either during error clean up or * after the whole range is finished.
*/
btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) {
btrfs_unlock_extent(&inode->io_tree, start,
start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em); goto out_reserve;
}
btrfs_free_extent_map(em);
if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(ordered);
/* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent.
*/ if (ret)
btrfs_drop_extent_map_range(inode, start,
start + cur_alloc_size - 1, false);
}
btrfs_put_ordered_extent(ordered);
/* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit.
*/ if (ret) goto out_unlock;
}
extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done: if (done_offset)
*done_offset = end; return ret;
out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below.
*/
/* * For the range (1). We have already instantiated the ordered extents * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV * are also handled by the ordered extents cleanup. * * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted.
*/ if (orig_start < start) {
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
/* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
}
/* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space().
*/ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
end, locked_folio,
&cached, clear_bits, page_ops);
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
btrfs_err_rl(fs_info, "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret;
}
/* * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. * * If called with @do_free == true then it'll try to finish the work and free * the work struct eventually.
*/ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); struct async_extent *async_extent; unsignedlong nr_pages;
u64 alloc_hint = 0;
if (do_free) { struct async_cow *async_cow;
btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
async_cow = async_chunk->async_cow; if (atomic_dec_and_test(&async_cow->num_chunks))
kvfree(async_cow); return;
}
for (i = 0; i < num_chunks; i++) {
u64 cur_end = min(end, start + SZ_512K - 1);
/* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
async_chunk[i].async_cow = ctx;
async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
/* * The locked_folio comes all the way from writepage and its * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk * structs, only the first struct needs a pointer to * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it.
*/ if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page.
*/
wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
} else {
async_chunk[i].locked_folio = NULL;
}
/* * Run the delalloc range from start to end, and write back any dirty pages * covered by the range.
*/ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc, bool pages_dirty)
{
u64 done_offset = end; int ret;
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
&done_offset, true, false); if (ret) return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
/* * If EXTENT_NORESERVE is set it means that when the buffered write was * made we had not enough available data space and therefore we did not * reserve data space for it, since we though we could do NOCOW for the * respective file range (either there is prealloc extent or the inode * has the NOCOW bit set). * * However when we need to fallback to COW mode (because for example the * block group for the corresponding extent was turned to RO mode by a * scrub or relocation) we need to do the following: * * 1) We increment the bytes_may_use counter of the data space info. * If COW succeeds, it allocates a new data extent and after doing * that it decrements the space info's bytes_may_use counter and * increments its bytes_reserved counter by the same amount (we do * this at btrfs_add_reserved_bytes()). So we need to increment the * bytes_may_use counter to compensate (when space is reserved at * buffered write time, the bytes_may_use counter is incremented); * * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so * that if the COW path fails for any reason, it decrements (through * extent_clear_unlock_delalloc()) the bytes_may_use counter of the * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free * space cache inode or an inode of the data relocation tree, we must * also increment bytes_may_use of the data space_info for the same * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block * group that contains that extent to RO mode and therefore force COW * when starting writeback.
*/
btrfs_lock_extent(io_tree, start, end, &cached_state);
count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo;
if (is_space_ino || is_reloc_ino)
bytes = range_bytes;
/* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work.
*/
ret = cow_file_range(inode, locked_folio, start, end, NULL, false, true);
ASSERT(ret != 1); return ret;
}
/* Start file offset of the range we want to NOCOW. */
u64 start; /* End file offset (inclusive) of the range we want to NOCOW. */
u64 end; bool writeback_path; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore.
*/ bool free_path;
/* * Output fields. Only set when can_nocow_file_extent() returns 1. * The expected file extent for the NOCOW write.
*/ struct btrfs_file_extent file_extent;
};
/*
 * Check if we can NOCOW the file extent that the path points to.
 * This function may return with the path released, so the caller should check
 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
 *
 * @path:  positioned at the file extent item to examine; may be released (and,
 *         when args->free_path is set, freed) before returning.
 * @key:   key of that file extent item; key->offset is the file offset.
 * @inode: inode the write targets.
 * @args:  control block (see struct can_nocow_file_extent_args); free_path and
 *         writeback_path are inputs, file_extent is consumed below.
 *
 * Returns: < 0 on error
 *            0 if we can not NOCOW
 *            1 if we can NOCOW
 *
 * NOTE(review): this extract looks incomplete — csum_root, io_start,
 * extent_end and nowait are declared but never used, and
 * args->file_extent.{offset,disk_bytenr} are read below without any visible
 * initialization. The section of the function that fills args->file_extent
 * from the file extent item appears to have been lost; confirm against the
 * full source before relying on this copy. "staticint"/"constbool" look like
 * whitespace lost in extraction ("static int"/"const bool"); the code is
 * otherwise kept byte-identical.
 */ staticint can_nocow_file_extent(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_inode *inode, struct can_nocow_file_extent_args *args)
{ constbool is_freespace_inode = btrfs_is_free_space_inode(inode); struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; struct btrfs_root *csum_root;
u64 io_start;
u64 extent_end;
u8 extent_type; int can_nocow = 0; int ret = 0; bool nowait = path->nowait;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
/* Inline extents store data in the leaf itself; NOCOW does not apply. */
if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out;
/*
 * A regular extent on an inode without the NODATACOW flag must be COWed.
 * (Prealloc extents fall through: they may be written in place.)
 */
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG) goto out;
/*
 * If the extent was created before the generation where the last snapshot
 * for its subvolume was created, then this implies the extent is shared,
 * hence we must COW.
 */ if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item)) goto out;
/* An explicit hole, must COW. */ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out;
/* Compressed/encrypted/encoded extents must be COWed. */ if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi)) goto out;
/*
 * The following checks can be expensive, as they need to take other
 * locks and do btree or rbtree searches, so release the path to avoid
 * blocking other tasks for too long.
 */
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
args->file_extent.disk_bytenr, path);
/*
 * A free space inode's extents should never be cross-referenced; a
 * positive result from the check above would indicate corruption.
 * Any non-zero result (shared extent or error) forbids NOCOW.
 */
WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out;
if (args->free_path) { /*
 * We don't need the path anymore, plus through the
 * btrfs_lookup_csums_list() call below we will end up allocating
 * another path. So free the path to avoid unnecessary extra
 * memory usage.
 */
btrfs_free_path(path);
path = NULL;
}
/* If there are pending snapshots for this root, we must COW. */ if (args->writeback_path && !is_freespace_inode &&
atomic_read(&root->snapshot_force_cow)) goto out;
can_nocow = 1;
out: if (args->free_path && path)
btrfs_free_path(path);
/* Negative ret wins over the NOCOW verdict. */
return ret < 0 ? ret : can_nocow;
}
/*
 * Cleanup the dirty folios which will never be submitted due to error.
 *
 * When running a delalloc range, we may need to split the ranges (due to
 * fragmentation or NOCOW). If we hit an error in the later part, we will error
 * out and previously successfully executed range will never be submitted, thus
 * we have to cleanup those folios by clearing their dirty flag, starting and
 * finishing the writeback.
 *
 * @inode:        inode owning the range.
 * @locked_folio: folio already locked by the caller; handled separately and
 *                left locked for the caller to unlock.
 * @start/@end:   inclusive byte range to clean up; must be sector aligned.
 * @error:        errno to record on the mapping.
 */
static void cleanup_dirty_folios(struct btrfs_inode *inode,
				 struct folio *locked_folio,
				 u64 start, u64 end, int error)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	u32 len;

	/*
	 * Fix: @len was previously used uninitialized.  The range must fit in
	 * a u32 and be sector aligned for the clamp helpers below.
	 */
	ASSERT(end + 1 - start < U32_MAX);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(end + 1, fs_info->sectorsize));
	len = end + 1 - start;

	/*
	 * Handle the locked folio first.
	 * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
	 */
	btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);

	for (pgoff_t index = start_index; index <= end_index; index++) {
		struct folio *folio;

		/* Already handled at the beginning. */
		if (index == locked_folio->index)
			continue;
		folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
		/* Cache already dropped, no need to do any cleanup. */
		if (IS_ERR(folio))
			continue;
		/*
		 * Fix: operate on the folio we just looked up, not on
		 * @locked_folio, which was already handled above.
		 */
		btrfs_folio_clamp_finish_io(fs_info, folio, start, len);
		folio_unlock(folio);
		folio_put(folio);
	}
	mapping_set_error(mapping, error);
}
/*
 * Run a NOCOW (or prealloc) write for one contiguous range described by
 * @nocow_args->file_extent: lock the extent range, create the I/O extent map
 * for prealloc writes, allocate the ordered extent, clone csums for data
 * relocation roots, then release and unlock the delalloc range.
 *
 * Fix: the previous version used @ordered without ever assigning it — the
 * extent locking, prealloc extent-map creation and ordered extent allocation
 * were missing and are restored here.
 *
 * @file_pos:    file offset the write starts at.
 * @is_prealloc: true when writing into a preallocated extent.
 *
 * Returns 0 on success or a negative errno.  On failure after the ordered
 * extent was created, the ordered extents for the range are cleaned up; the
 * folio Dirty flags are left for the caller to handle.
 */
static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
			   struct extent_state **cached,
			   struct can_nocow_file_extent_args *nocow_args,
			   u64 file_pos, bool is_prealloc)
{
	struct btrfs_ordered_extent *ordered;
	u64 len = nocow_args->file_extent.num_bytes;
	u64 end = file_pos + len - 1;
	int ret = 0;

	btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);

	if (is_prealloc) {
		struct extent_map *em;

		/* Pin the mapping so the write goes to the prealloc extent. */
		em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
					BTRFS_ORDERED_PREALLOC);
		if (IS_ERR(em)) {
			btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
			return PTR_ERR(em);
		}
		btrfs_free_extent_map(em);
	}

	ordered = btrfs_alloc_ordered_extent(inode, file_pos,
					     &nocow_args->file_extent,
					     is_prealloc
					     ? (1U << BTRFS_ORDERED_PREALLOC)
					     : (1U << BTRFS_ORDERED_NOCOW));
	if (IS_ERR(ordered)) {
		if (is_prealloc)
			btrfs_drop_extent_map_range(inode, file_pos, end, false);
		btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
		return PTR_ERR(ordered);
	}

	if (btrfs_is_data_reloc_root(inode->root))
		/*
		 * Errors are handled later, as we must prevent
		 * extent_clear_unlock_delalloc() in error handler from freeing
		 * metadata of the created ordered extent.
		 */
		ret = btrfs_reloc_clone_csums(ordered);
	btrfs_put_ordered_extent(ordered);

	extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_CLEAR_DATA_RESV,
				     PAGE_UNLOCK | PAGE_SET_ORDERED);
	/*
	 * On error, we need to cleanup the ordered extents we created.
	 *
	 * We do not clear the folio Dirty flags because they are set and
	 * cleared by the caller.
	 */
	if (ret < 0)
		btrfs_cleanup_ordered_extents(inode, file_pos, len);
	return ret;
}
/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk
*/ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct folio *locked_folio, const u64 start, const u64 end)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path;
u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling.
*/
u64 cow_end = 0;
u64 cur_offset = start; int ret; bool check_prev = true;
u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 };
/* * Normally on a zoned device we're only doing COW writes, but in case * of relocation on a zoned filesystem serializes I/O so that we're only * writing sequentially and can end up here as well.
*/
ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto error;
}
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0); if (ret < 0) goto error;
/* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset
*/ if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
path->slots[0] - 1); if (found_key.objectid == ino &&
found_key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
check_prev = false;
next_slot: /* Go to next leaf if we have exhausted the current one */
leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path); if (ret < 0) goto error; if (ret > 0) break;
leaf = path->nodes[0];
}
/* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode
*/ if (WARN_ON_ONCE(found_key.objectid < ino) ||
found_key.type < BTRFS_EXTENT_DATA_KEY) {
/*
 * NOTE(review): the source extract was cut off here ("maximum size reached").
 * The remainder of run_delalloc_nocow() and everything after it is missing;
 * the trailing lines were website boilerplate from the extraction tool, not
 * source code. Restore the rest of the file from the original source.
 */