struct btrfs_rename_ctx {
	/* Output only: index number of the old directory entry. */
	u64 index;
};
/*
 * Bundle of state handed to data_reloc_print_warning_inode() so that it can
 * resolve file names and emit the error message.
 */
struct data_reloc_warn {
	struct btrfs_path path;
	struct btrfs_fs_info *fs_info;
	u64 extent_item_size;
	u64 logical;
	int mirror_num;
};
/* * For the file_extent_tree, we want to hold the inode lock when we lookup and * update the disk_i_size, but lockdep will complain because our io_tree we hold * the tree lock and get the inode lock when setting delalloc. These two things * are unrelated, so make a class for the file_extent_tree so we don't get the * two locking patterns mixed up.
*/ staticstruct lock_class_key file_extent_tree_class;
nofs_flag = memalloc_nofs_save();
ipath = init_ipath(4096, local_root, &warn->path);
memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) {
btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL; /* * -ENOMEM, not a critical error, just output an generic error * without filename.
*/
btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
warn->logical, warn->mirror_num, root, inum, offset); return ret;
}
ret = paths_from_inode(inum, ipath); if (ret < 0) {
btrfs_put_root(local_root); goto err;
}
/* * We deliberately ignore the bit ipath might have been too small to * hold all of the paths here
*/ for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
warn->logical, warn->mirror_num, root, inum, offset,
fs_info->sectorsize, nlink,
(char *)(unsignedlong)ipath->fspath->val[i]);
}
ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); if (ret < 0) {
btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
logical, ret); return;
}
eb = path.nodes[0];
ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size(eb, path.slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsignedlong ptr = 0;
u64 ref_root;
u8 ref_level;
while (true) {
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level); if (ret < 0) {
btrfs_warn_rl(fs_info, "failed to resolve tree backref for logical %llu: %d",
logical, ret); break;
} if (ret > 0) break;
/* For data reloc tree, it's better to do a backref lookup instead. */ if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
/* * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/ int btrfs_inode_lock(struct btrfs_inode *inode, unsignedint ilock_flags)
{ if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0;
}
inode_lock_shared(&inode->vfs_inode);
} else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0;
}
inode_lock(&inode->vfs_inode);
} if (ilock_flags & BTRFS_ILOCK_MMAP)
down_write(&inode->i_mmap_lock); return 0;
}
/* * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive.
*/ void btrfs_inode_unlock(struct btrfs_inode *inode, unsignedint ilock_flags)
{ if (ilock_flags & BTRFS_ILOCK_MMAP)
up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(&inode->vfs_inode); else
inode_unlock(&inode->vfs_inode);
}
/* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()).
*/ staticinlinevoid btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
u64 offset, u64 bytes)
{
pgoff_t index = offset >> PAGE_SHIFT; const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; struct folio *folio;
while (index <= end_index) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) {
index++; continue;
}
index = folio_end(folio) >> PAGE_SHIFT; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range.
*/
btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
offset, bytes);
folio_put(folio);
}
/*
 * Apply the ACLs carried in @args to the new inode, then initialize its
 * security xattrs.
 *
 * The default ACL is set first, then the access ACL; the first failure is
 * returned as is.  When neither ACL is present the inode is marked via
 * cache_no_acl().  Otherwise the result of btrfs_xattr_security_init() is
 * returned.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct btrfs_new_inode_args *args)
{
	int err = 0;

	if (args->default_acl)
		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
				      ACL_TYPE_DEFAULT);

	if (!err && args->acl)
		err = __btrfs_set_acl(trans, args->inode, args->acl,
				      ACL_TYPE_ACCESS);

	if (err)
		return err;

	if (!(args->default_acl || args->acl))
		cache_no_acl(args->inode);

	return btrfs_xattr_security_init(trans, args->inode, args->dir,
					 &args->dentry->d_name);
}
/* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree
*/ staticint insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *inode, bool extent_inserted,
size_t size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size)
{ struct btrfs_root *root = inode->root; struct extent_buffer *leaf; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsignedlong ptr; struct btrfs_file_extent_item *ei; int ret;
size_t cur_size = size;
u64 i_size;
/* * The decompressed size must still be no larger than a sector. Under * heavy race, we can have size == 0 passed in, but that shouldn't be a * big deal and we can continue the insertion.
*/
ASSERT(size <= sectorsize);
/* * The compressed size also needs to be no larger than a sector. * That's also why we only need one page as the parameter.
*/ if (compressed_folio)
ASSERT(compressed_size <= sectorsize); else
ASSERT(compressed_size == 0);
if (compressed_size && compressed_folio)
cur_size = compressed_size;
if (!extent_inserted) { struct btrfs_key key;
size_t datasize;
/* * We align size to sectorsize for inline extents just for simplicity * sake.
*/
ret = btrfs_inode_set_file_extent_range(inode, 0,
ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail;
/* * We're an inline extent, so nobody can extend the file past i_size * without locking a page we already have locked. * * We must do any i_size and inode updates before we unlock the pages. * Otherwise we could end up racing with unlink.
*/
i_size = i_size_read(&inode->vfs_inode); if (update_i_size && size > i_size) {
i_size_write(&inode->vfs_inode, size);
i_size = size;
}
inode->disk_i_size = i_size;
/* Inline extents must start at offset 0. */ if (offset != 0) returnfalse;
/* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) returnfalse;
/* We do not allow a non-compressed extent to be as large as block size. */ if (data_len >= fs_info->sectorsize) returnfalse;
/* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) returnfalse;
/* We cannot exceed the user specified max_inline size. */ if (data_len > fs_info->max_inline) returnfalse;
/* Inline extents must be the entirety of the file. */ if (size < i_size_read(&inode->vfs_inode)) returnfalse;
returntrue;
}
/*
 * Conditionally insert an inline extent into the file.
 *
 * If being used directly, you must have already checked we're allowed to cow
 * the range by getting true from can_cow_file_range_inline().
 *
 * Returns 0 on success, 1 when the insertion or the inode update hit -ENOSPC,
 * and a negative errno otherwise.  Any failure other than -ENOSPC after the
 * transaction was joined aborts the transaction.
 */
static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
					    u64 size, size_t compressed_size,
					    int compress_type,
					    struct folio *compressed_folio,
					    bool update_i_size)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 data_len = (compressed_size ?: size);
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &inode->block_rsv;

	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
				   size, compressed_size, compress_type,
				   compressed_folio, update_i_size);
	if (ret == -ENOSPC) {
		/* Out of space is not fatal; report it to the caller as 1. */
		ret = 1;
		goto out;
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);

	ret = btrfs_update_inode(trans, inode);
	if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_set_inode_full_sync(inode);
out:
	/*
	 * Don't forget to free the reserved space: an inlined extent does not
	 * count as a data extent, so release the qgroup data reservation for
	 * the first sectorsize bytes of the file directly here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}
/* * In the successful case (ret == 0 here), cow_file_range will return 1. * * Quite a bit further up the callstack in extent_writepage(), ret == 1 * is treated as a short circuited success and does not unlock the folio, * so we must do it here. * * In the failure case, the locked_folio does get unlocked by * btrfs_folio_end_all_writers, which asserts that it is still locked * at that point, so we must *not* unlock it here. * * The other two callsites in compress_file_range do not have a * locked_folio, so they are not relevant to this logic.
*/ if (ret == 0)
locked_folio = NULL;
/*
 * Decide whether the range of this inode should be handed to compression,
 * based on mount options, defragmentation, properties or heuristics.
 *
 * Returns 0 when the range must not be compressed, 1 when compression is
 * requested by defrag or by the FORCE_COMPRESS mount option, and otherwise
 * whatever btrfs_compress_heuristic() returns for the range.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!btrfs_inode_can_compress(inode)) {
		DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
		return 0;
	}

	/* Defrag ioctl takes precedence over mount options and properties. */
	if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
		return 0;
	if (inode->defrag_compress > BTRFS_COMPRESS_NONE &&
	    inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
		return 1;

	/* Forced compression via mount option. */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;

	/* The inode was flagged because of bad compression ratios. */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;

	/* Nothing asks for compression on this inode. */
	if (!btrfs_test_opt(fs_info, COMPRESS) &&
	    !(inode->flags & BTRFS_INODE_COMPRESS) &&
	    !inode->prop_compress)
		return 0;

	return btrfs_compress_heuristic(inode, start, end);
}
/*
 * Queue the inode for defrag when the write is smaller than @small_write and
 * does not cover the whole file, i.e. it starts past offset 0 or ends before
 * disk_i_size.
 */
static inline void inode_should_defrag(struct btrfs_inode *inode,
				       u64 start, u64 end, u64 num_bytes,
				       u32 small_write)
{
	/* Not a small write, nothing to do. */
	if (num_bytes >= small_write)
		return;

	/* The write starts at 0 and reaches disk_i_size: not inside eof. */
	if (start == 0 && end + 1 >= inode->disk_i_size)
		return;

	btrfs_add_inode_defrag(inode, small_write);
}
/*
 * Clear the dirty state, clamped to [@start, @end], of every folio found in
 * the inode's mapping for that range.
 *
 * Folios that cannot be looked up are skipped.  Returns 0 if every lookup
 * succeeded, otherwise the error of the first failed lookup.
 */
static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start,
					   u64 end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	const pgoff_t last_index = end >> PAGE_SHIFT;
	const u64 len = end + 1 - start;
	int first_err = 0;

	for (pgoff_t idx = start >> PAGE_SHIFT; idx <= last_index; idx++) {
		struct folio *folio = filemap_get_folio(mapping, idx);

		if (!IS_ERR(folio)) {
			btrfs_folio_clamp_clear_dirty(inode->root->fs_info,
						      folio, start, len);
			folio_put(folio);
		} else if (first_err == 0) {
			/* Remember only the first failure, keep going. */
			first_err = PTR_ERR(folio);
		}
	}

	return first_err;
}
/* * Work queue call back to started compression on a file and pages. * * This is done inside an ordered work queue, and the compression is spread * across many cpus. The actual IO submission is step two, and the ordered work * queue takes care of making sure that happens in the same order things were * put onto the queue by writepages and friends. * * If this code finds it can't get good compression, it puts an entry onto the * work queue to write the uncompressed bytes. This makes sure that both * compressed inodes and uncompressed inodes are written in the same order that * the flusher thread sent them down.
*/ staticvoid compress_file_range(struct btrfs_work *work)
{ struct async_chunk *async_chunk =
container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping;
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size; int ret = 0; struct folio **folios; unsignedlong nr_folios; unsignedlong total_compressed = 0; unsignedlong total_in = 0; unsignedint poff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
/* * We need to call clear_page_dirty_for_io on each page in the range. * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them.
*/
ret = extent_range_clear_dirty_for_io(inode, start, end);
/* * All the folios should have been locked thus no failure. * * And even if some folios are missing, btrfs_compress_folios() * would handle them correctly, so here just do an ASSERT() check for * early logic errors.
*/
ASSERT(ret == 0);
/* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us.
*/
barrier();
i_size = i_size_read(&inode->vfs_inode);
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
folios = NULL;
nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
nr_folios = min_t(unsignedlong, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
/* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner.
*/ if (actual_end <= start) goto cleanup_and_bail_uncompressed;
total_compressed = actual_end - start;
/* * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all.
*/ if (total_compressed <= blocksize &&
(start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed;
/* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios.
*/ if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed;
folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code.
*/ goto cleanup_and_bail_uncompressed;
}
/* Compression level is applied here. */
ret = btrfs_compress_folios(compress_type, compress_level,
mapping, start, folios, &nr_folios, &total_in,
&total_compressed); if (ret) goto mark_incompressible;
/* * Zero the tail end of the last page, as we might be sending it down * to disk.
*/
poff = offset_in_page(total_compressed); if (poff)
folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
/* * Try to create an inline extent. * * If we didn't compress the entire range, try to create an uncompressed * inline extent, else a compressed one. * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case.
*/ if (total_in < actual_end)
ret = cow_file_range_inline(inode, NULL, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false); else
ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
compress_type, folios[0], false); if (ret <= 0) { if (ret < 0)
mapping_set_error(mapping, -EIO); goto free_pages;
}
/* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things.
*/
total_compressed = ALIGN(total_compressed, blocksize);
/* * One last check to make sure the compression is really a win, compare * the page count read with the blocks on disk, compression must free at * least one sector.
*/
total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) goto mark_incompressible;
/* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios.
*/
ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
nr_folios, compress_type);
BUG_ON(ret); if (start + total_in < end) {
start += total_in;
cond_resched(); goto again;
} return;
mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
BUG_ON(ret);
free_pages: if (folios) { for (i = 0; i < nr_folios; i++) {
WARN_ON(folios[i]->mapping);
btrfs_free_compr_folio(folios[i]);
}
kfree(folios);
}
}
/*
 * Release every compressed folio attached to @async_extent together with the
 * folio array itself, then reset the bookkeeping so the extent no longer
 * references them.  Does nothing when the extent has no folio array.
 */
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int idx = 0;

	if (async_extent->folios == NULL)
		return;

	while (idx < async_extent->nr_folios) {
		/* Folios being freed here must not belong to a mapping. */
		WARN_ON(async_extent->folios[idx]->mapping);
		btrfs_free_compr_folio(async_extent->folios[idx]);
		idx++;
	}

	kfree(async_extent->folios);
	async_extent->folios = NULL;
	async_extent->nr_folios = 0;
}
if (async_chunk->blkcg_css)
kthread_associate_blkcg(async_chunk->blkcg_css);
/* * If async_chunk->locked_folio is in the async_extent range, we need to * handle it.
*/ if (async_chunk->locked_folio) {
u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
u64 locked_folio_end = locked_folio_start +
folio_size(async_chunk->locked_folio) - 1;
if (!(start >= locked_folio_end || end <= locked_folio_start))
locked_folio = async_chunk->locked_folio;
}
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
0, *alloc_hint, &ins, 1, 1); if (ret) { /* * We can't reserve contiguous space for the compressed size. * Unlikely, but it's possible that we could have enough * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
free_pages = true; goto done;
}
btrfs_lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset;
file_extent.ram_bytes = async_extent->ram_size;
file_extent.num_bytes = async_extent->ram_size;
file_extent.offset = 0;
file_extent.compression = async_extent->compress_type;
em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) {
ret = PTR_ERR(em); goto out_free_reserve;
}
btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it.
*/ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
btrfs_free_extent_map(em);
em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
alloc_hint = btrfs_extent_map_block_start(em); if (em)
btrfs_free_extent_map(em);
} else {
alloc_hint = btrfs_extent_map_block_start(em);
btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
return alloc_hint;
}
/*
 * When extent_io.c finds a delayed allocation range in the file, the
 * callbacks end up in this code.  The basic idea is to allocate extents on
 * disk for the range, and create ordered data structs in ram to track those
 * extents.
 *
 * locked_folio is the folio that writepage had locked already.  We use it to
 * make sure we don't do extra locks or unlocks.
 *
 * When this function fails, it unlocks all pages except @locked_folio.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_folio and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_folio is
 * the only page handled anyway).
 *
 * When this function succeeds and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_folio are unlocked.
 *
 * When a failure happens in the second or later iteration of the while-loop,
 * the ordered extents created in previous iterations are cleaned up.
*/ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset, bool keep_locked, bool no_inline)
{ struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached = NULL;
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsignedlong page_ops; int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL; goto out_unlock;
}
if (!no_inline) { /* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done * with this page and already handled the IO. * * If there was an error then cow_file_range_inline() has * already done the cleanup.
*/ if (ret == 0)
ret = 1; goto done;
}
}
/* * We're not doing compressed IO, don't unlock the first page (which * the caller expects to stay locked), don't clear any dirty bits and * don't set any writeback bits. * * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items.
*/ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes; else
min_alloc_size = fs_info->sectorsize;
ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned * file systems, which is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at * least one zone to finish before retrying the * allocation. Otherwise ask the caller to write out * the already allocated blocks before coming back to * us, or return -ENOSPC if it can't handle retries.
*/
ASSERT(btrfs_is_zoned(fs_info)); if (start == orig_start) {
wait_on_bit_io(&inode->root->fs_info->flags,
BTRFS_FS_NEED_ZONE_FINISH,
TASK_UNINTERRUPTIBLE); continue;
} if (done_offset) { /* * Move @end to the end of the processed range, * and exit the loop to unlock the processed extents.
*/
end = start - 1;
ret = 0; break;
}
ret = -ENOSPC;
} if (ret < 0) goto out_unlock;
cur_alloc_size = ins.offset;
/* * Locked range will be released either during error clean up or * after the whole range is finished.
*/
btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) {
btrfs_unlock_extent(&inode->io_tree, start,
start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em); goto out_reserve;
}
btrfs_free_extent_map(em);
if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(ordered);
/* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent.
*/ if (ret)
btrfs_drop_extent_map_range(inode, start,
start + cur_alloc_size - 1, false);
}
btrfs_put_ordered_extent(ordered);
/* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit.
*/ if (ret) goto out_unlock;
}
extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done: if (done_offset)
*done_offset = end; return ret;
out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below.
*/
/* * For the range (1). We have already instantiated the ordered extents * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV * are also handled by the ordered extents cleanup. * * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted.
*/ if (orig_start < start) {
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
/* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
}
/* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space().
*/ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
end, locked_folio,
&cached, clear_bits, page_ops);
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
btrfs_err_rl(fs_info, "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret;
}
/* * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. * * If called with @do_free == true then it'll try to finish the work and free * the work struct eventually.
*/ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); struct async_extent *async_extent; unsignedlong nr_pages;
u64 alloc_hint = 0;
if (do_free) { struct async_cow *async_cow;
btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
async_cow = async_chunk->async_cow; if (atomic_dec_and_test(&async_cow->num_chunks))
kvfree(async_cow); return;
}
for (i = 0; i < num_chunks; i++) {
u64 cur_end = min(end, start + SZ_512K - 1);
/* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
async_chunk[i].async_cow = ctx;
async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
/*
 * The locked_folio comes all the way from writepage and it's the original
 * folio we were actually given.  As we spread this large delalloc region
 * across multiple async_chunk structs, only the first struct needs a pointer
 * to locked_folio.
 *
 * This way we don't need racy decisions about who is supposed to unlock it.
*/ if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page.
*/
wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
} else {
async_chunk[i].locked_folio = NULL;
}
/* * Run the delalloc range from start to end, and write back any dirty pages * covered by the range.
*/ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc, bool pages_dirty)
{
u64 done_offset = end; int ret;
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
&done_offset, true, false); if (ret) return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
/*
 * If EXTENT_NORESERVE is set it means that when the buffered write was made we
 * did not have enough available data space and therefore we did not reserve
 * data space for it, since we thought we could do NOCOW for the respective
 * file range (either there is a prealloc extent or the inode has the NOCOW
 * bit set).
 *
 * However when we need to fallback to COW mode (because for example the block
 * group for the corresponding extent was turned to RO mode by a scrub or
 * relocation) we need to do the following:
 *
 * 1) We increment the bytes_may_use counter of the data space info.
 *    If COW succeeds, it allocates a new data extent and after doing that it
 *    decrements the space info's bytes_may_use counter and increments its
 *    bytes_reserved counter by the same amount (we do this at
 *    btrfs_add_reserved_bytes()).  So we need to increment the bytes_may_use
 *    counter to compensate (when space is reserved at buffered write time,
 *    the bytes_may_use counter is incremented);
 *
 * 2) We clear the EXTENT_NORESERVE bit from the range.  We do this so that if
 *    the COW path fails for any reason, it decrements (through
 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the data
 *    space info, which we incremented in the step above.
 *
 * If we need to fallback to cow and the inode corresponds to a free space
 * cache inode or an inode of the data relocation tree, we must also increment
 * bytes_may_use of the data space_info for the same reason.  Space caches and
 * relocated data extents always get a prealloc extent for them, however scrub
 * or balance may have set the block group that contains that extent to RO
 * mode and therefore force COW when starting writeback.
*/
btrfs_lock_extent(io_tree, start, end, &cached_state);
count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo;
if (is_space_ino || is_reloc_ino)
bytes = range_bytes;
/* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work.
*/
ret = cow_file_range(inode, locked_folio, start, end, NULL, false, true);
ASSERT(ret != 1); return ret;
}
/* Start file offset of the range we want to NOCOW. */
u64 start; /* End file offset (inclusive) of the range we want to NOCOW. */
u64 end; bool writeback_path; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore.
*/ bool free_path;
/* * Output fields. Only set when can_nocow_file_extent() returns 1. * The expected file extent for the NOCOW write.
*/ struct btrfs_file_extent file_extent;
};
/* * Check if we can NOCOW the file extent that the path points to. * This function may return with the path released, so the caller should check * if path->nodes[0] is NULL or not if it needs to use the path afterwards. * * Returns: < 0 on error * 0 if we can not NOCOW * 1 if we can NOCOW
*/ staticint can_nocow_file_extent(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_inode *inode, struct can_nocow_file_extent_args *args)
{ constbool is_freespace_inode = btrfs_is_free_space_inode(inode); struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; struct btrfs_root *csum_root;
u64 io_start;
u64 extent_end;
u8 extent_type; int can_nocow = 0; int ret = 0; bool nowait = path->nowait;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out;
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG) goto out;
/* * If the extent was created before the generation where the last snapshot * for its subvolume was created, then this implies the extent is shared, * hence we must COW.
*/ if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item)) goto out;
/* An explicit hole, must COW. */ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out;
/* Compressed/encrypted/encoded extents must be COWed. */ if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi)) goto out;
/* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid * blocking other tasks for too long.
*/
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
args->file_extent.disk_bytenr, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out;
if (args->free_path) { /* * We don't need the path anymore, plus through the * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage.
*/
btrfs_free_path(path);
path = NULL;
}
/* If there are pending snapshots for this root, we must COW. */ if (args->writeback_path && !is_freespace_inode &&
atomic_read(&root->snapshot_force_cow)) goto out;
can_nocow = 1;
out: if (args->free_path && path)
btrfs_free_path(path);
return ret < 0 ? ret : can_nocow;
}
/*
 * Cleanup the dirty folios which will never be submitted due to error.
 *
 * When running a delalloc range, we may need to split the ranges (due to
 * fragmentation or NOCOW). If we hit an error in the later part, we will error
 * out and previously successfully executed range will never be submitted, thus
 * we have to cleanup those folios by clearing their dirty flag, starting and
 * finishing the writeback.
 *
 * @inode:        inode the range belongs to
 * @locked_folio: folio handled first and skipped by the page cache walk
 * @start:        file offset of the first byte to clean up
 * @end:          file offset of the last byte to clean up (inclusive)
 * @error:        error to record on the inode's mapping
 */
static void cleanup_dirty_folios(struct btrfs_inode *inode,
				 struct folio *locked_folio,
				 u64 start, u64 end, int error)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	/*
	 * @end is inclusive, hence the +1.
	 * NOTE(review): the length is narrowed to u32 here; presumably callers
	 * never pass a range of 4GiB or more - confirm.
	 */
	u32 len = end + 1 - start;

	/*
	 * Handle the locked folio first.
	 * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
	 */
	btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);

	for (pgoff_t index = start_index; index <= end_index; index++) {
		struct folio *folio;

		/* Already handled at the beginning. */
		if (index == locked_folio->index)
			continue;

		folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
		/* Cache already dropped, no need to do any cleanup. */
		if (IS_ERR(folio))
			continue;

		/*
		 * Finish the folio we just looked up and locked. Passing
		 * @locked_folio here would only redo the work done above and
		 * leave every other folio of the range untouched.
		 */
		btrfs_folio_clamp_finish_io(fs_info, folio, start, len);
		folio_unlock(folio);
		folio_put(folio);
	}
	mapping_set_error(mapping, error);
}
/*
 * Finish one NOCOW range [@file_pos, @file_pos + len - 1], where len is
 * @nocow_args->file_extent.num_bytes.
 *
 * For inodes of the data relocation root the checksums are cloned for the
 * ordered extent. The range is then passed to extent_clear_unlock_delalloc()
 * and, if the csum cloning failed, the ordered extents of the range are
 * cleaned up.
 *
 * Returns the result of btrfs_reloc_clone_csums(), or 0 when it is not called.
 *
 * NOTE(review): in the code below @ordered is used without any visible
 * assignment and @is_prealloc is never read; the creation of the ordered
 * extent appears to be missing from this function - confirm against the
 * intended implementation.
 * NOTE(review): "staticint" on the definition line looks like a fused
 * "static int".
 */
staticint nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, struct extent_state **cached, struct can_nocow_file_extent_args *nocow_args,
u64 file_pos, bool is_prealloc)
{ struct btrfs_ordered_extent *ordered;
/* Length of the range, taken from the file extent described by @nocow_args. */
u64 len = nocow_args->file_extent.num_bytes;
/* Inclusive end offset of the range. */
u64 end = file_pos + len - 1; int ret = 0;
if (btrfs_is_data_reloc_root(inode->root)) /*
 * Errors are handled later, as we must prevent
 * extent_clear_unlock_delalloc() in error handler from freeing
 * metadata of the created ordered extent.
*/
ret = btrfs_reloc_clone_csums(ordered);
/* Drop the reference held through @ordered. */
btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED); /*
 * On error, we need to cleanup the ordered extents we created.
 *
 * We do not clear the folio Dirty flags because they are set and
 * cleared by the caller.
*/ if (ret < 0)
btrfs_cleanup_ordered_extents(inode, file_pos, len); return ret;
}
/*
 * NOCOW writeback callback. This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk.
*/ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct folio *locked_folio, const u64 start, const u64 end)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path;
u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling.
*/
u64 cow_end = 0;
u64 cur_offset = start; int ret; bool check_prev = true;
u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 };
/*
 * Normally on a zoned device we're only doing COW writes, but relocation
 * on a zoned filesystem serializes I/O so that we're only writing
 * sequentially, so it can end up here as well.
*/
ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto error;
}
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0); if (ret < 0) goto error;
/* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset
*/ if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
path->slots[0] - 1); if (found_key.objectid == ino &&
found_key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
check_prev = false;
next_slot: /* Go to next leaf if we have exhausted the current one */
leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path); if (ret < 0) goto error; if (ret > 0) break;
leaf = path->nodes[0];
}
/* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode
*/ if (WARN_ON_ONCE(found_key.objectid < ino) ||
found_key.type < BTRFS_EXTENT_DATA_KEY) {
path->slots[0]++; goto next_slot;
}
/* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end) break;
/* * If the found extent starts after requested offset, then * adjust cur_offset to be right before this extent begins.
*/ if (found_key.offset > cur_offset) { if (cow_start == (u64)-1)
cow_start = cur_offset;
cur_offset = found_key.offset; goto next_slot;
}
/* * Found extent which begins before our range and potentially * intersect it
*/
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi); /* If this is triggered then we have a memory corruption. */
ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
ret = -EUCLEAN; goto error;
}
extent_end = btrfs_file_extent_end(path);
/* * If the extent we got ends before our current offset, skip to * the next extent.
*/ if (extent_end <= cur_offset) {
path->slots[0]++; goto next_slot;
}
nocow_args.start = cur_offset;
ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); if (ret < 0) goto error; if (ret == 0) goto must_cow;
ret = 0;
nocow_bg = btrfs_inc_nocow_writers(fs_info,
nocow_args.file_extent.disk_bytenr +
nocow_args.file_extent.offset); if (!nocow_bg) {
must_cow: /* * If we can't perform NOCOW writeback for the range, * then record the beginning of the range that needs to * be COWed. It will be written out before the next * NOCOW range if we find one, or when exiting this * loop.
*/ if (cow_start == (u64)-1)
cow_start = cur_offset;
cur_offset = extent_end; if (cur_offset > end) break; if (!path->nodes[0]) continue;
path->slots[0]++; goto next_slot;
}
/* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be * NOCOW, following one which needs to be COW'ed
*/ if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start,
found_key.offset - 1); if (ret) {
cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg); goto error;
}
cow_start = (u64)-1;
}
if (cur_offset <= end && cow_start == (u64)-1)
cow_start = cur_offset;
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start, end); if (ret) {
cow_end = end; goto error;
}
cow_start = (u64)-1;
}
btrfs_free_path(path); return 0;
error: /* * There are several error cases: * * 1) Failed without falling back to COW * start cur_offset end * |/////////////| | * * In this case, cow_start should be (u64)-1. * * For range [start, cur_offset) the folios are already unlocked (except * @locked_folio), EXTENT_DELALLOC already removed. * Need to clear the dirty flags and finish the ordered extents. * * 2) Failed with error before calling fallback_to_cow() * * start cow_start end * |/////////////| | * * In this case, only @cow_start is set, @cur_offset is between * [cow_start, end) * * It's mostly the same as case 1), just replace @cur_offset with * @cow_start. * * 3) Failed with error from fallback_to_cow() * * start cow_start cow_end end * |/////////////|-----------| | * * In this case, both @cow_start and @cow_end is set. * * For range [start, cow_start) it's the same as case 1). * But for range [cow_start, cow_end), all the cleanup is handled by * cow_file_range(), we should not touch anything in that range. * * So for all above cases, if @cow_start is set, cleanup ordered extents * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
*/ if (cow_start != (u64)-1)
cur_offset = cow_start;
/* * If an error happened while a COW region is outstanding, cur_offset * needs to be reset to @cow_end + 1 to skip the COW range, as * cow_file_range() will do the proper cleanup at error.
*/ if (cow_end)
cur_offset = cow_end + 1;
/* * We need to lock the extent here because we're clearing DELALLOC and * we're not locked at this point.
*/ if (cur_offset < end) { struct extent_state *cached = NULL;
/* * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time.
*/ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc)
{ constbool zoned = btrfs_is_zoned(inode->root->fs_info); int ret;
/* * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller.
*/
ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end); return ret;
}
/*
 * Handle merged delayed allocation extents so we can keep track of new extents
 * that are just merged onto old extents, such as when we are doing sequential
 * writes, so we can properly account for the metadata space we'll need.
 *
 * @inode: inode whose io_tree holds both states; inode->io_tree.lock must be
 *         held by the caller (checked with lockdep)
 * @new:   one of the two extent states being merged
 * @other: the other extent state; nothing is done unless it has
 *         EXTENT_DELALLOC set
 *
 * NOTE(review): @new_size is compared below without any visible assignment,
 * and the last part of the body uses a "root" variable that is not declared
 * in this function. That tail (removal from the delalloc inode list) looks
 * like it belongs to a different helper - confirm against the intended code.
*/ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 new_size, old_size;
u32 num_extents;
lockdep_assert_held(&inode->io_tree.lock);
/* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return;
/*
 * We're not bigger than the max, unreserve the space and go: the merged
 * range needs one outstanding extent less, dropped under inode->lock.
 */ if (new_size <= fs_info->max_extent_size) {
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, -1);
spin_unlock(&inode->lock); return;
}
/*
 * We have to add up either side to figure out how many extents were
 * accounted for before we merged into one big extent. If the number of
 * extents we accounted for is <= the amount we need for the new range
 * then we can return, otherwise drop. Think of it like this
 *
 * [ 4k][MAX_SIZE]
 *
 * So we've grown the extent by a MAX_SIZE extent, this would mean we
 * need 2 outstanding extents, on one side we have 1 and the other side
 * we have 1 so they are == and we can return. But in this case
 *
 * [MAX_SIZE+4k][MAX_SIZE+4k]
 *
 * Each range on their own accounts for 2 extents, but merged together
 * they are only 3 extents worth of accounting, so we need to drop in
 * this case.
*/
/* @old_size is reused: first for @other, then for @new. */
old_size = other->end - other->start + 1;
num_extents = count_max_extents(fs_info, old_size);
old_size = new->end - new->start + 1;
num_extents += count_max_extents(fs_info, old_size); if (count_max_extents(fs_info, new_size) >= num_extents) return;
/*
 * We may be called after the inode was already deleted from the list,
 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
 * and then later through btrfs_clear_delalloc_extent() while the inode
 * still has ->delalloc_bytes > 0.
*/ if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
/* Last delalloc inode of the root: also unlink the root from the fs-wide list. */
root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) {
ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
ASSERT(!list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
}
/* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done.
*/ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
u32 bits)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info;
lockdep_assert_held(&inode->io_tree.lock);
if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on
*/ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
u64 len = state->end + 1 - state->start;
u64 prev_delalloc_bytes;
u32 num_extents = count_max_extents(fs_info, len);
/* * We don't need to be under the protection of the inode's lock, * because we are called while holding the inode's io_tree lock * and are therefore protected against concurrent calls of this * function and btrfs_clear_delalloc_extent().
*/ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
btrfs_add_delalloc_inode(inode);
}
/* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on
*/ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root;
u64 new_delalloc_bytes;
/* * We don't reserve metadata space for space cache inodes so we * don't need to call delalloc_release_metadata if there is an * error.
*/ if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len, true);
/* For sanity tests. */ if (btrfs_is_testing(fs_info)) return;
/* * We don't need to be under the protection of the inode's lock, * because we are called while holding the inode's io_tree lock * and are therefore protected against concurrent calls of this * function and btrfs_set_delalloc_extent().
*/ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
spin_lock(&root->delalloc_lock);
btrfs_del_delalloc_inode(inode);
spin_unlock(&root->delalloc_lock);
}
}
/*
 * Given a list of ordered sums, insert each of them through
 * btrfs_csum_file_blocks(). This happens at IO completion time based on sums
 * calculated at bio submission time.
 *
 * @trans: transaction handle; trans->adding_csums is set to true around every
 *         insertion and back to false afterwards
 * @list:  list of struct btrfs_ordered_sum, linked through their ->list member
 *
 * The csum root is looked up once, from the logical address of the first sum,
 * and reused for all the following entries.
 *
 * Returns 0 on success (including an empty list), or the first non-zero value
 * returned by btrfs_csum_file_blocks(); later entries are not processed.
 */
static int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct list_head *list)
{
	struct btrfs_root *csum_root = NULL;
	struct btrfs_ordered_sum *sum;

	list_for_each_entry(sum, list, list) {
		int err;

		trans->adding_csums = true;
		if (csum_root == NULL)
			csum_root = btrfs_csum_root(trans->fs_info,
						    sum->logical);
		err = btrfs_csum_file_blocks(trans, csum_root, sum);
		trans->adding_csums = false;
		if (err != 0)
			return err;
	}

	return 0;
}
if (start >= i_size_read(&inode->vfs_inode) &&
!(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case so just * set the delalloc new bit for the range directly.
*/
extra_bits |= EXTENT_DELALLOC_NEW;
} else { int ret;
ret = btrfs_find_new_delalloc_bytes(inode, start,
end + 1 - start,
cached_state); if (ret) return ret;
}
/*
 * One queued fixup request for a folio that was marked dirty without
 * notifying the filesystem. See btrfs_writepage_cow_fixup() for details on
 * why this is required; that function allocates the request, fills in every
 * field and queues @work.
 *
 * @folio: folio to fix up; a folio reference is taken before queueing
 * @inode: inode owning @folio; an inode reference is taken before queueing
 * @work:  work item, initialized with btrfs_writepage_fixup_worker as handler
 */ struct btrfs_writepage_fixup { struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work;
};
/* * This is similar to page_mkwrite, we need to reserve the space before * we take the folio lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
folio_size(folio));
again:
folio_lock(folio);
/* * Before we queued this fixup, we took a reference on the folio. * folio->mapping may go NULL, but it shouldn't be moved to a different * address space.
*/ if (!folio->mapping || !folio_test_dirty(folio) ||
!folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but * because the folio was already dealt with we don't want to * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC * when the folio was already properly dealt with.
*/ if (!ret) {
btrfs_delalloc_release_extents(inode, folio_size(folio));
btrfs_delalloc_release_space(inode, data_reserved,
page_start, folio_size(folio), true);
}
ret = 0; goto out_page;
}
/* * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation.
*/ if (ret) goto out_page;
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
&cached_state); if (ret) goto out_reserved;
/* * Everything went as planned, we're now the owner of a dirty page with * delayed allocation bits set and space reserved for our COW * destination. * * The page was dirty when we started, nothing should have cleaned it.
*/
BUG_ON(!folio_test_dirty(folio));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page: if (ret) { /* * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page.
*/
mapping_set_error(folio->mapping, ret);
btrfs_mark_ordered_io_finished(inode, folio, page_start,
folio_size(folio), !ret);
folio_clear_dirty_for_io(folio);
}
btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
folio_unlock(folio);
folio_put(folio);
kfree(fixup);
extent_changeset_free(data_reserved); /* * As a precaution, do a delayed iput in case it would be the last iput * that could need flushing space. Recursing back to fixup worker would * deadlock.
*/
btrfs_add_delayed_iput(inode);
}
/* * There are a few paths in the higher layers of the kernel that directly * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the folio.
*/ int btrfs_writepage_cow_fixup(struct folio *folio)
{ struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup;
/* This folio has ordered extent covering it already */ if (folio_test_ordered(folio)) return 0;
/* * For experimental build, we error out instead of EAGAIN. * * We should not hit such out-of-band dirty folios anymore.
*/ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
DEBUG_WARN();
btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
btrfs_root_id(BTRFS_I(inode)->root),
btrfs_ino(BTRFS_I(inode)),
folio_pos(folio)); return -EUCLEAN;
}
/*
 * folio_checked is set below when we create a fixup worker for this
 * folio, don't try to create another one if we're already
 * folio_test_checked.
 *
 * The extent_io writepage code will redirty the folio if we send back
 * EAGAIN.
*/ if (folio_test_checked(folio)) return -EAGAIN;
fixup = kzalloc(sizeof(*fixup), GFP_NOFS); if (!fixup) return -EAGAIN;
/* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust * folio->mapping outside of the folio lock.
*/
ihold(inode);
btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
folio_get(folio);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->folio = folio;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
/* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want * to drop it from the cache until it is completely in the btree. * * So, tell btrfs_drop_extents to leave this extent in the cache. * the caller is expected to unpin it and allow it to be merged * with the others.
*/
drop_args.path = path;
drop_args.start = file_pos;
drop_args.end = file_pos + num_bytes;
drop_args.replace_extent = true;
drop_args.extent_item_size = sizeof(*stack_fi);
ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out;
/*
 * If we dropped an inline extent here, we know the range where it is
 * located was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
 * number of bytes only for that range containing the inline extent.
 * The remainder of the range will be processed when clearing the
 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
num_bytes = oe->truncated_len;
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */
/* * For delalloc, when completing an ordered extent we update the inode's * bytes when clearing the range in the inode's io tree, so pass false * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), * except if the ordered extent was truncated.
*/
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
/* * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written.
*/ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{ struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL;
u64 start, end; int compress_type = 0; int ret = 0;
u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool clear_reserved_extent = true; unsignedint clear_bits = EXTENT_DEFRAG;
start = ordered_extent->file_offset;
end = start + ordered_extent->num_bytes - 1;
freespace_inode = btrfs_is_free_space_inode(inode); if (!freespace_inode)
btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO; goto out;
}
ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes); if (ret) goto out;
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
logical_len = ordered_extent->truncated_len; /* Truncated the entire extent, don't bother adding */ if (!logical_len) goto out;
}
/* * If it's a COW write we need to lock the extent range as we will be * inserting/replacing file extent items and unpinning an extent map. * This must be taken before joining a transaction, as it's a higher * level lock (like the inode's VFS lock), otherwise we can run into an * ABBA deadlock with other tasks (transactions work like a lock, * depending on their current state).
*/ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
btrfs_lock_extent_bits(io_tree, start, end,
EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
&cached_state);
}
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root); else
trans = btrfs_join_transaction(root); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL; goto out;
}
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent); if (ret) {
btrfs_abort_transaction(trans, ret); goto out;
}
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */
ASSERT(list_empty(&ordered_extent->list)); if (!list_empty(&ordered_extent->list)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret); goto out;
}
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
} goto out;
}
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
}
} if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out;
}
ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out;
}
ret = add_pending_csums(trans, &ordered_extent->list); if (ret) {
btrfs_abort_transaction(trans, ret); goto out;
}
/* * If this is a new delalloc range, clear its new delalloc flag to * update the inode's number of bytes. This needs to be done first * before updating the inode item.
*/ if ((clear_bits & EXTENT_DELALLOC_NEW) &&
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
btrfs_clear_extent_bit(&inode->io_tree, start, end,
EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
&cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret); goto out;
}
out:
btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
&cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) { /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered * extent, and mark the inode with the error if it wasn't * already set. Any error during writeback would have already * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed.
*/ if (ret)
btrfs_mark_ordered_extent_error(ordered_extent);
/* * Drop extent maps for the part of the extent we didn't write. * * We have an exception here for the free_space_inode, this is * because when we do btrfs_get_extent() on the free space inode * we will search the commit root. If this is a new block group * we won't find anything, and we will trip over the assert in * writepage where we do ASSERT(em->block_start != * EXTENT_MAP_HOLE). * * Theoretically we could also skip this for any NOCOW extent as * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode.
*/ if (!btrfs_is_free_space_inode(inode)) {
u64 unwritten_start = start;
/* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. * * If we made it past insert_reserved_file_extent before we * errored out then we don't need to do this as the accounting * has already been done.
*/ if ((ret || !logical_len) &&
clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { /* * Discard the range before returning it back to the * free space pool
*/ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
btrfs_discard_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes,
NULL);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created.
*/
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
ordered_extent->qgroup_rsv,
BTRFS_QGROUP_RSV_DATA);
}
}
/* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent.
*/
btrfs_remove_ordered_extent(inode, ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent); /* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
return ret;
}
/*
 * Finish an ordered extent.
 *
 * On a zoned filesystem, when the ordered extent has no I/O error flagged and
 * its bioc_list is empty, the zoned specific completion step runs first.  The
 * common completion path then runs unconditionally.
 *
 * Return the result of btrfs_finish_one_ordered().
 */
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_fs_info *fs_info = ordered->inode->root->fs_info;

	if (btrfs_is_zoned(fs_info)) {
		if (!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
		    list_empty(&ordered->bioc_list))
			btrfs_finish_ordered_zoned(ordered);
	}

	return btrfs_finish_one_ordered(ordered);
}
/*
 * Verify the checksum for a single sector without any extra action that
 * depends on the type of I/O.
 *
 * @fs_info:       filesystem the sector belongs to; supplies the hash
 *                 transform, the sector size and the checksum size
 * @kaddr:         must be a properly kmapped address of the sector data
 * @csum:          buffer that receives the checksum computed over the sector
 * @csum_expected: checksum the sector is expected to have
 *
 * Return 0 if the computed checksum matches @csum_expected, -EIO otherwise.
 */
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
			    const u8 * const csum_expected)
{
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	/*
	 * Hash one sector worth of data into @csum.  Without this step the
	 * comparison below would look at whatever the caller left in @csum.
	 */
	shash->tfm = fs_info->csum_shash;
	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);

	if (memcmp(csum, csum_expected, fs_info->csum_size))
		return -EIO;
	return 0;
}
/* * Verify the checksum of a single data sector. * * @bbio: btrfs_io_bio which contains the csum * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) * @bv: bio_vec to check * * Check if the checksum on a data block is valid. When a checksum mismatch is * detected, report the error and fill the corrupted range with zero. * * Return %true if the sector is ok or had no checksum to start with, else %false.
*/ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv)
{ struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 file_offset = bbio->file_offset + bio_offset;
u64 end = file_offset + bv->bv_len - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE]; void *kaddr;
ASSERT(bv->bv_len == fs_info->sectorsize);
if (!bbio->csum) returntrue;
if (btrfs_is_data_reloc_root(inode->root) &&
btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
NULL)) { /* Skip the range without csum for data reloc inode */
btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
EXTENT_NODATASUM, NULL); returntrue;
}
/*
 * Perform a delayed iput on @inode.
 *
 * @inode: The inode we want to perform iput on
 *
 * The VFS reference count (vfs_inode.i_count) decides what happens: while it
 * is greater than 1 the count is simply decremented.  Otherwise this would be
 * the last iput, so the inode is queued on fs_info->delayed_iputs instead and
 * the cleaner kthread is woken up unless it is already flagged as running.
 */
void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long irq_flags;

	/* Not the last reference: the decrement is all that is needed. */
	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
		return;

	WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
	atomic_inc(&fs_info->nr_delayed_iputs);

	/*
	 * Need to be irq safe here because we can be called from either an irq
	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
	 * context.
	 */
	spin_lock_irqsave(&fs_info->delayed_iput_lock, irq_flags);
	ASSERT(list_empty(&inode->delayed_iput));
	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, irq_flags);

	if (test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
		return;

	wake_up_process(fs_info->cleaner_kthread);
}
/*
 * Run the delayed iput of a single inode, if it has one queued.
 *
 * The list membership is first checked without the lock as a cheap filter,
 * then checked again under fs_info->delayed_iput_lock before the iput is
 * actually run.
 */
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
				   struct btrfs_inode *inode)
{
	if (list_empty(&inode->delayed_iput))
		return;

	spin_lock_irq(&fs_info->delayed_iput_lock);
	/* Recheck now that the lock is held. */
	if (!list_empty(&inode->delayed_iput))
		run_delayed_iput_locked(fs_info, inode);
	spin_unlock_irq(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{ /* * btrfs_put_ordered_extent() can run in irq context (see bio.c), which * calls btrfs_add_delayed_iput() and that needs to lock * fs_info->delayed_iput_lock. So we need to disable irqs here to * prevent a deadlock.
*/
spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode;
/*
 * Wait for flushing all delayed iputs
 *
 * @fs_info:  the filesystem
 *
 * Sleep killably on fs_info->delayed_iputs_wait until the count of pending
 * delayed iputs (fs_info->nr_delayed_iputs) drops to zero.  This helps user
 * operations like fallocate that might get blocked on the iputs.
 *
 * Return -EINTR if the wait was interrupted by a fatal signal, 0 once nothing
 * is pending.
 */
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	int interrupted;

	interrupted = wait_event_killable(fs_info->delayed_iputs_wait,
			atomic_read(&fs_info->nr_delayed_iputs) == 0);

	return interrupted ? -EINTR : 0;
}
/*
 * Create an orphan entry for the given inode, so that it can be found again
 * in case something goes wrong in the middle of an unlink.
 *
 * An orphan item that already exists (-EEXIST) is treated as success.  Any
 * other failure aborts the transaction and is returned to the caller.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
{
	const int ret = btrfs_insert_orphan_item(trans, inode->root,
						 btrfs_ino(inode));

	if (ret == 0 || ret == -EEXIST)
		return 0;

	btrfs_abort_transaction(trans, ret);
	return ret;
}
/*
 * The delete has been done, so the orphan item for this particular inode can
 * be removed from the inode's root.
 *
 * Return the result of btrfs_del_orphan_item().
 */
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;

	return btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
}
/* * this cleans up any orphans that may be left on the list from the last use * of this root.
*/ int btrfs_orphan_cleanup(struct btrfs_root *root)
{ struct btrfs_fs_info *fs_info = root->fs_info;
BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans;
u64 last_objectid = 0; int ret = 0, nr_unlink = 0;
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0;
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto out;
}
path->reada = READA_BACK;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out;
/* * if ret == 0 means we found what we were searching for, which * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches
*/ if (ret > 0) {
ret = 0; if (path->slots[0] == 0) break;
path->slots[0]--;
}
/* pull out the item */
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break;
/* release the path since we're done with it */
btrfs_release_path(path);
/* * this is where we are basically btrfs_lookup, without the * crossing root thing. we store the inode number in the * offset of the orphan item.
*/
if (found_key.offset == last_objectid) { /* * We found the same inode as before. This means we were * not able to remove its items via eviction triggered * by an iput(). A transaction abort may have happened, * due to -ENOSPC for example, so try to grab the error * that lead to a transaction abort, if any.
*/
btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup");
ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out;
}
last_objectid = found_key.offset;
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL; if (ret != -ENOENT) goto out;
}
if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0;
/* * This is an orphan in the tree root. Currently these * could come from 2 sources: * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode * We need to distinguish those two, as the orphan item * for a root must not get deleted before the deletion * of the snapshot/subvolume's tree completes. * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking * up the root from that radix tree.
*/
if (is_dead_root) { /* prevent this orphan from being found again */
key.offset = found_key.objectid - 1; continue;
}
}
/* * If we have an inode with links, there are a couple of * possibilities: * * 1. We were halfway through creating fsverity metadata for the * file. In that case, the orphan item represents incomplete * fsverity metadata which must be cleaned up with * btrfs_drop_verity_items and deleting the orphan item.
* 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent * items, but the (useless) orphan item was still created. Since * v4.18, we don't create the orphan item for truncate at all. * * So, this item could mean that we need to do a truncate, but * only if this filesystem was last used on a pre-v3.12 kernel * and was not cleanly unmounted. The odds of that are quite * slim, and it's a pain to do the truncate now, so just delete * the orphan item. * * It's also possible that this orphan item was supposed to be * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item.
*/ if (!inode || inode->vfs_inode.i_nlink) { if (inode) {
ret = btrfs_drop_verity_items(inode);
iput(&inode->vfs_inode);
inode = NULL; if (ret) goto out;
}
trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out;
}
btrfs_debug(fs_info, "auto deleting %Lu",
found_key.objectid);
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
btrfs_end_transaction(trans); if (ret) goto out; continue;
}
nr_unlink++;
/* this will do delete_inode and everything for us */
iput(&inode->vfs_inode);
} /* release the path since we're done with it */
btrfs_release_path(path);
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root); if (!IS_ERR(trans))
btrfs_end_transaction(trans);
}
if (nr_unlink)
btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
out: if (ret)
btrfs_err(fs_info, "could not do orphan cleanup %d", ret); return ret;
}
/* * Look ahead in the leaf for xattrs. If we don't find any then we know there * can't be any ACLs. * * @leaf: the eb leaf where to search * @slot: the slot the inode is in * @objectid: the objectid of the inode * * Return true if there is xattr/ACL, false otherwise.
*/ static noinline bool acls_after_inode_item(struct extent_buffer *leaf, int slot, u64 objectid, int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; static u64 xattr_access = 0; static u64 xattr_default = 0; int scanned = 0;
/* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) returnfalse;
/* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1)
*first_xattr_slot = slot; if (found_key.offset == xattr_access ||
found_key.offset == xattr_default) returntrue;
}
/* * We found a key greater than an xattr key, there can't be any * ACLs later on.
*/ if (found_key.type > BTRFS_XATTR_ITEM_KEY) returnfalse;
slot++;
scanned++;
/* * The item order goes like: * - inode * - inode backrefs * - xattrs * - extents, * * so if there are lots of hard links to an inode there can be * a lot of backrefs. Don't waste time searching too hard, * this is just an optimization.
*/ if (scanned >= 8) break;
} /* * We hit the end of the leaf before we found an xattr or something * larger than an xattr. We have to assume the inode has ACLs.
*/ if (*first_xattr_slot == -1)
*first_xattr_slot = slot; returntrue;
}
if (WARN_ON_ONCE(inode->file_extent_tree)) return 0; if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; if (!S_ISREG(inode->vfs_inode.i_mode)) return 0; if (btrfs_is_free_space_inode(inode)) return 0;
inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); if (!inode->file_extent_tree) return -ENOMEM;
btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */
lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
/* * Read a locked inode from the btree into the in-memory inode and add it to * its root list/tree. * * On failure clean up the inode.
*/ staticint btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{ struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsignedlong ptr; int maybe_acls;
u32 rdev; int ret; bool filled = false; int first_xattr_slot;
ret = btrfs_fill_inode(inode, &rdev); if (!ret)
filled = true;
ASSERT(path);
btrfs_get_inode_key(inode, &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { /* * ret > 0 can come from btrfs_search_slot called by * btrfs_lookup_inode(), this means the inode was not found.
*/ if (ret > 0)
ret = -ENOENT; goto out;
}
cache_index:
ret = btrfs_init_file_extent_tree(inode); if (ret) goto out;
btrfs_inode_set_file_extent_range(inode, 0,
round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. * * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray.
*/ if (inode->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/* * We don't persist the id of the transaction where an unlink operation * against the inode was last made. So here we assume the inode might * have been evicted, and therefore the exact value of last_unlink_trans * lost, and set it to last_trans to avoid metadata inconsistencies * between the inode and its parent if the inode is fsync'ed and the log * replayed. For example, in the scenario: * * touch mydir/foo * ln mydir/foo mydir/bar * sync * unlink mydir/bar * echo 2 > /proc/sys/vm/drop_caches # evicts inode * xfs_io -c fsync mydir/foo * <power failure> * mount fs, triggers fsync log replay * * We must make sure that when we fsync our inode foo we also log its * parent inode, otherwise after log replay the parent still has the * dentry with the "bar" name but our inode foo has a link count of 1 * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily.
*/
inode->last_unlink_trans = inode->last_trans;
/* * Same logic as for last_unlink_trans. We don't persist the generation * of the last transaction where this inode was used for a reflink * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode.
*/
inode->last_reflink_trans = inode->last_trans;
/* * given a leaf and an inode, copy the inode fields into the leaf
*/ staticvoid fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, struct inode *inode)
{
u64 flags;
/*
 * Copy everything in the in-memory inode into the btree.
 *
 * The inode item is updated directly, bypassing the delayed inode code, for:
 *  - free space inodes, which could deadlock during commit if they were put
 *    into the delayed code,
 *  - inodes of the data relocation root,
 *  - any inode while log recovery is in progress.
 *
 * Every other inode gets the root times refreshed and a delayed update
 * queued; on success the inode's last transaction is recorded.
 *
 * Return 0 on success or the error from the update path that was taken.
 */
int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (btrfs_is_free_space_inode(inode) ||
	    btrfs_is_data_reloc_root(root) ||
	    test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return btrfs_update_inode_item(trans, inode);

	btrfs_update_root_times(trans, root);

	ret = btrfs_delayed_update_inode(trans, inode);
	if (ret)
		return ret;

	btrfs_set_inode_last_trans(trans, inode);
	return 0;
}
/*
 * Update the inode through btrfs_update_inode() and, only if that fails with
 * -ENOSPC, retry by updating the inode item directly.
 *
 * Return 0 on success or a negative errno.
 */
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_inode *inode)
{
	const int ret = btrfs_update_inode(trans, inode);

	if (ret != -ENOSPC)
		return ret;

	return btrfs_update_inode_item(trans, inode);
}
/* * If we are replaying a log tree, we do not want to update the mtime * and ctime of the parent directory with the current time, since the * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done).
*/ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags)) return;
now = inode_set_ctime_current(&dir->vfs_inode);
inode_set_mtime_to_ts(&dir->vfs_inode, now);
}
/* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It removes a link in a directory with a given name, and * also drops the back refs in the inode to the directory
*/ staticint __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, conststruct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx)
{ struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; struct btrfs_dir_item *di;
u64 index;
u64 ino = btrfs_ino(inode);
u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) {
btrfs_free_path(path); return di ? PTR_ERR(di) : -ENOENT;
}
ret = btrfs_delete_one_dir_name(trans, root, path, di); /* * Down the call chains below we'll also need to allocate a path, so no * need to hold on to this one for longer than necessary.
*/
btrfs_free_path(path); if (ret) return ret;
/* * If we don't have dir index, we have to get it by looking up * the inode ref, since we get the inode ref, remove it directly, * it is unnecessary to do delayed deletion. * * But if we have dir index, needn't search inode ref to get it. * Since the inode ref is close to the inode item, it is better * that we delay to delete it, and just do this deletion when * we update the inode item.
*/ if (inode->dir_index) {
ret = btrfs_delayed_delete_inode_ref(inode); if (!ret) {
index = inode->dir_index; goto skip_backref;
}
}
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) {
btrfs_crit(fs_info, "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
name->len, name->name, btrfs_root_id(root), ino, dir_ino);
btrfs_abort_transaction(trans, ret); return ret;
}
skip_backref: if (rename_ctx)
rename_ctx->index = index;
ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) {
btrfs_abort_transaction(trans, ret); return ret;
}
/* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications.
*/ if (!rename_ctx) {
btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
}
/* * If we have a pending delayed iput we could end up with the final iput * being run in btrfs-cleaner context. If we have enough of these built * up we can end up burning a lot of time in btrfs-cleaner without any * way to throttle the unlinks. Since we're currently holding a ref on * the inode we can run the delayed iput here without any issues as the * final iput won't be done until after we drop the ref we're currently * holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, conststruct fscrypt_str *name)
{ int ret;
ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode);
} return ret;
}
/* * helper to start transaction for unlink and rmdir. * * unlink and rmdir are special in btrfs, they do not always free space, so * if we cannot make our reservations the normal way try and see if there is * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur.
*/ staticstruct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{ struct btrfs_root *root = dir->root;
/* * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail.
*/ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR(di)) {
ret = PTR_ERR(di);
btrfs_abort_transaction(trans, ret); goto out;
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
index = key.offset;
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
btrfs_root_id(root), dir_ino,
&index, &fname.disk_name); if (ret) {
btrfs_abort_transaction(trans, ret); goto out;
}
}
ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) {
btrfs_abort_transaction(trans, ret); goto out;
}
/* * Helper to check if the subvolume references other subvolumes or if it's * default.
*/ static noinline int may_destroy_subvol(struct btrfs_root *root)
{ struct btrfs_fs_info *fs_info = root->fs_info;
BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id; int ret;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
/* Make sure this root isn't set as the default subvol */
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
dir_id, &name, 0); if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == btrfs_root_id(root)) {
ret = -EPERM;
btrfs_err(fs_info, "deleting default subvolume %llu is not allowed",
key.objectid); return ret;
}
btrfs_release_path(path);
}
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range.
*/ return -EUCLEAN;
}
ret = 0; if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
return ret;
}
/*
 * Delete all dentries for inodes belonging to the root.
 *
 * Walks the root's inodes in increasing inode number order.  For every inode
 * whose reference count is above 1 its dentry aliases are pruned, then the
 * reference obtained by the lookup is dropped.
 */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *cur;
	u64 next_ino = 0;

	if (!BTRFS_FS_ERROR(fs_info))
		WARN_ON(btrfs_root_refs(&root->root_item) != 0);

	for (cur = btrfs_find_first_inode(root, next_ino); cur;
	     cur = btrfs_find_first_inode(root, next_ino)) {
		if (atomic_read(&cur->vfs_inode.i_count) > 1)
			d_prune_aliases(&cur->vfs_inode);

		/* Resume the search right after the inode just handled. */
		next_ino = btrfs_ino(cur) + 1;

		/*
		 * btrfs_drop_inode() will have it removed from the inode
		 * cache when its usage count hits zero.
		 */
		iput(&cur->vfs_inode);
		cond_resched();
	}
}
/* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit * again is not run concurrently.
*/
spin_lock(&dest->root_item_lock); if (dest->send_in_progress) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info, "attempt to delete subvolume %llu during send",
btrfs_root_id(dest));
ret = -EPERM; goto out_up_write;
} if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile",
btrfs_root_id(root));
ret = -EPERM; goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
ret = may_destroy_subvol(dest); if (ret) goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * One for dir inode, * two for dir entries, * two for root ref/backref.
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead;
qgroup_reserved = block_rsv.qgroup_rsv_reserved;
trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out_release;
}
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_end_trans;
}
ret = btrfs_record_root_in_trans(trans, dest); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_end_trans;
}
if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP;
} return btrfs_delete_subvolume(dir, dentry);
}
ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname); if (ret) return ret;
/* This needs to handle no-key deletions later on */
trans = __unlink_start_trans(dir); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out_notrans;
}
/* * Propagate the last_unlink_trans value of the deleted dir to its * parent directory. This is to prevent an unrecoverable log tree in the * case we do something like this: * 1) create dir foo * 2) create snapshot under dir foo * 3) delete the snapshot * 4) rmdir foo * 5) mkdir foo * 6) fsync foo or some file inside foo * * This is because we can't unlink other roots when replaying the dir * deletes for directory foo.
*/ if (inode->last_unlink_trans >= trans->transid)
btrfs_record_snapshot_destroy(trans, dir);
if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, dir, dentry); goto out;
}
ret = btrfs_orphan_add(trans, inode); if (ret) goto out;
/* now the directory is empty */
ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name); if (!ret)
btrfs_i_size_write(inode, 0);
out:
btrfs_end_transaction(trans);
out_notrans:
btrfs_btree_balance_dirty(fs_info);
fscrypt_free_filename(&fname);
/* * Handle the truncation of a fs block. * * @inode - inode that we're zeroing * @offset - the file offset of the block to truncate * The value must be inside [@start, @end], and the function will do * extra checks if the block that covers @offset needs to be zeroed. * @start - the start file offset of the range we want to zero * @end - the end (inclusive) file offset of the range we want to zero. * * If the range is not block aligned, read out the folio that covers @offset, * and if needed zero blocks that are inside the folio and covered by [@start, @end). * If @start or @end + 1 lands inside a block, that block will be marked dirty * for writeback. * * This is utilized by hole punch, zero range, file expansion.
*/ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; constbool in_head_block = is_inside_block(offset, round_down(start, blocksize),
blocksize); constbool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
blocksize); bool need_truncate_head = false; bool need_truncate_tail = false;
u64 zero_start;
u64 zero_end;
u64 block_start;
u64 block_end;
/* @offset should be inside the range. */
ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
offset, start, end);
/* The range is aligned at both ends. */ if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { /* * For block size < page size case, we may have polluted blocks * beyond EOF. So we also need to zero them out.
*/ if (end == (u64)-1 && blocksize < PAGE_SIZE)
ret = truncate_block_zero_beyond_eof(inode, start); goto out;
}
/* * @offset may not be inside the head nor tail block. In that case we * don't need to do anything.
*/ if (!in_head_block && !in_tail_block) goto out;
/* * Skip the truncation if the range in the target block is already aligned. * The seemingly complex check will also handle the same block case.
*/ if (in_head_block && !IS_ALIGNED(start, blocksize))
need_truncate_head = true; if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
need_truncate_tail = true; if (!need_truncate_head && !need_truncate_tail) goto out;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
blocksize, false); if (ret < 0) {
size_t write_bytes = blocksize;
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { /* For nocow case, no need to reserve data space. */
ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
write_bytes, blocksize);
only_release_metadata = true;
} else { goto out;
}
}
ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
block_start, blocksize); goto out;
}
again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { if (only_release_metadata)
btrfs_delalloc_release_metadata(inode, blocksize, true); else
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
ret = PTR_ERR(folio); goto out;
}
if (!folio_test_uptodate(folio)) {
ret = btrfs_read_folio(NULL, folio);
folio_lock(folio); if (folio->mapping != mapping) {
folio_unlock(folio);
folio_put(folio); goto again;
} if (!folio_test_uptodate(folio)) {
ret = -EIO; goto out_unlock;
}
}
/* * We unlock the page after the io is completed and then re-lock it * above. release_folio() could have come in between that and cleared * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff.
*/
ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock;
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state); if (ret) {
btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock;
}
if (end == (u64)-1) { /* * We're truncating beyond EOF, the remaining blocks normally are * already holes thus no need to zero again, but it's possible for * fs block size < page size cases to have memory mapped writes * to pollute ranges beyond EOF. * * In that case although such polluted blocks beyond EOF will * not reach disk, it still affects our page caches.
*/
zero_start = max_t(u64, folio_pos(folio), start);
zero_end = min_t(u64, folio_end(folio) - 1, end);
} else {
zero_start = max_t(u64, block_start, start);
zero_end = min_t(u64, block_end, end);
}
folio_zero_range(folio, zero_start - folio_pos(folio),
zero_end - zero_start + 1);
/* * If NO_HOLES is enabled, we don't need to do anything. * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() * or btrfs_update_inode() will be called, which guarantee that the next * fsync will know this inode was changed and needs to be logged.
*/ if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0;
/* * 1 - for the one we're dropping * 1 - for the one we're adding * 1 - for updating the inode.
*/
trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) return PTR_ERR(trans);
/* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for * the range between oldsize and size.
*/ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{ struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
u64 cur_offset;
u64 hole_size; int ret = 0;
/* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data.
*/
ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); if (ret) return ret;
if (size <= hole_start) return 0;
btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
&cached_state);
cur_offset = hole_start; while (1) {
em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL; break;
}
last_byte = min(btrfs_extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em;
ret = maybe_insert_hole(inode, cur_offset, hole_size); if (ret) break;
ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size); if (ret) break;
/* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update.
*/ if (newsize != oldsize) {
inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
}
}
if (newsize > oldsize) { /* * Don't do an expanding truncate while snapshotting is ongoing. * This is to ensure the snapshot captures a fully consistent * state of this file - if the snapshot captures this expanding * truncation, it must capture all writes that happened before * this truncation.
*/
btrfs_drew_write_lock(&root->snapshot_lock);
ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); if (ret) {
btrfs_drew_write_unlock(&root->snapshot_lock); return ret;
}
trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) {
btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans);
}
if (btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(BTRFS_I(inode),
ALIGN(newsize, fs_info->sectorsize),
(u64)-1); if (ret) return ret;
}
/* * We're truncating a file that used to have good data down to * zero. Make sure any new writes to the file get on disk * on close.
*/ if (newsize == 0)
set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
inode_dio_wait(inode);
ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int ret2;
/* * Truncate failed, so fix up the in-memory size. We * adjusted disk_i_size down as we removed extents, so * wait for disk_i_size to be stable and then update the * in-memory size to match.
*/
ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret2) return ret2;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
ret = setattr_prepare(idmap, dentry, attr); if (ret) return ret;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
ret = btrfs_setsize(inode, attr); if (ret) return ret;
}
if (attr->ia_valid) {
setattr_copy(idmap, inode, attr);
inode_inc_iversion(inode);
ret = btrfs_dirty_inode(BTRFS_I(inode));
if (!ret && attr->ia_valid & ATTR_MODE)
ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
}
return ret;
}
/* * While truncating the inode pages during eviction, we get the VFS * calling btrfs_invalidate_folio() against each folio of the inode. This * is slow because the calls to btrfs_invalidate_folio() result in a * huge amount of calls to lock_extent() and clear_extent_bit(), * which keep merging and splitting extent_state structures over and over, * wasting lots of time. * * Therefore if the inode is being evicted, let btrfs_invalidate_folio() * skip all those expensive operations on a per folio basis and do only * the ordered io finishing, while we release here the extent_map and * extent_state structures, without the excessive merging and splitting.
*/ staticvoid evict_inode_truncate_pages(struct inode *inode)
{ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node;
/* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readahead that have * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlock the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken * (which would be dropped in the end io callback of each bio). * Therefore here we effectively end up waiting for those bios and * anyone else holding locked ranges without having bumped the inode's * reference count - if we don't do it, when they access the inode's * io_tree to unlock a range it may be too late, leading to a * use-after-free issue.
*/
spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL;
u64 start;
u64 end; unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
state_flags = state->state;
spin_unlock(&io_tree->lock);
/* * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here.
*/ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
end - start + 1, NULL);
/* * Eviction should be taking place at some place safe because of our * delayed iputs. However the normal flushing code will run delayed * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. * * We reserve the delayed_refs_extra here again because we can't use * btrfs_start_transaction(root, 0) for the same deadlocky reason as * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, * if we fail to make this reservation we can re-try without the * delayed_refs_extra so we can make some forward progress.
*/
ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_EVICT); if (ret) {
ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
BTRFS_RESERVE_FLUSH_EVICT); if (ret) {
btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC);
}
delayed_refs_extra = 0;
}
trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return trans;
/* * This makes sure the inode item in tree is uptodate and the space for * the inode update is released.
*/
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto out;
/* * This drops any pending insert or delete operations we have for this * inode. We could have a delayed dir index deletion queued up, but * we're removing the inode completely so that'll be taken care of in * the truncate.
*/
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
trans = evict_refill_and_join(root, &rsv); if (IS_ERR(trans)) goto out_release;
trans->block_rsv = &rsv;
ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans); /* * We have not added new delayed items for our inode after we * have flushed its delayed items, so no need to throttle on * delayed items. However we have modified extent buffers.
*/
btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) goto out_release; elseif (!ret) break;
}
/* * Errors here aren't a big deal, it just means we leave orphan items in * the tree. They will be cleaned up on the next mount. If the inode * number gets reused, cleanup deletes the orphan item without doing * anything, and unlink reuses the existing orphan item. * * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit.
*/
trans = evict_refill_and_join(root, &rsv); if (!IS_ERR(trans)) {
trans->block_rsv = &rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
}
out_release:
btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
out: /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want * to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
fsverity_cleanup_inode(inode);
clear_inode(inode);
}
/* * Return the key found in the dir entry in the location pointer, fill @type * with BTRFS_FT_*, and return 0. * * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN.
*/ staticint btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type)
{ struct btrfs_dir_item *di;
BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
*/
ASSERT(ret == 0);
/* This needs to handle no-key deletions later on */
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
&fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT; goto out;
}
/* * when we hit a tree root in a directory, the btrfs part of the inode * needs to be changed to reflect the root directory of the tree root. This * is kind of like crossing a mount point.
*/ staticint fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root)
{
BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; int ret; int err = 0; struct fscrypt_name fname;
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); if (ret) return ret;
xa_lock(&root->inodes); /* * This btrfs_inode is being freed and has already been unhashed at this * point. It's possible that another btrfs_inode has already been * allocated for the same inode and inserted itself into the root, so * don't delete it in that case. * * Note that this shouldn't need to allocate memory, so the gfp flags * don't really matter.
*/
entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
GFP_ATOMIC); if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
xa_lock(&root->inodes);
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes); if (empty)
btrfs_add_dead_root(root);
}
}
/* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator.
*/ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path)
{ struct btrfs_inode *inode; int ret;
inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM);
if (!(inode->vfs_inode.i_state & I_NEW)) return inode;
ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret);
/* * Get an inode object given its inode number and corresponding root.
*/ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{ struct btrfs_inode *inode; struct btrfs_path *path; int ret;
inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM);
if (!(inode->vfs_inode.i_state & I_NEW)) return inode;
path = btrfs_alloc_path(); if (!path) {
iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM);
}
ret = btrfs_read_locked_inode(inode, path);
btrfs_free_path(path); if (ret) return ERR_PTR(ret);
btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry
*/
vfs_inode->i_op = &simple_dir_inode_operations;
vfs_inode->i_opflags &= ~IOP_XATTR;
vfs_inode->i_fop = &simple_dir_operations;
vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
/* * Find the highest existing sequence number in a directory and then set the * in-memory index_cnt variable to the first free sequence number.
*/ staticint btrfs_set_inode_index_count(struct btrfs_inode *inode)
{ struct btrfs_root *root = inode->root; struct btrfs_key key, found_key;
BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; /* FIXME: we should be able to handle this */ if (ret == 0) return ret;
/*
 * Report the index of the last entry of directory @dir through @index.
 *
 * When the in-memory counter has not been set up yet (it still holds
 * (u64)-1), it is first taken from btrfs_inode_delayed_dir_index_count()
 * and, if that call fails, from btrfs_set_inode_index_count(). Everything
 * runs between btrfs_inode_lock() and btrfs_inode_unlock() on @dir.
 *
 * Returns 0 on success. If btrfs_set_inode_index_count() fails, its error is
 * returned and @index is left untouched.
 */
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
{
	int err = 0;

	btrfs_inode_lock(dir, 0);

	/* Lazily set up index_cnt, trying the delayed inode count first. */
	if (dir->index_cnt == (u64)-1 &&
	    btrfs_inode_delayed_dir_index_count(dir))
		err = btrfs_set_inode_index_count(dir);

	/* index_cnt is the index number of next new entry, so decrement it. */
	if (!err)
		*index = dir->index_cnt - 1;

	btrfs_inode_unlock(dir, 0);

	return err;
}
/* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy * our information into that, and then dir_emit from the buffer. This is * similar to what NFS does, only we don't keep the buffer around in pagecache * because I'm afraid I'll mess that up. Long term we need to make filldir do * copy_to_user_inatomic so we don't have to worry about page faulting under the * tree lock.
*/ staticint btrfs_opendir(struct inode *inode, struct file *file)
{ struct btrfs_file_private *private;
u64 last_index; int ret;
ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); if (ret) return ret;
ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos;
if (btrfs_readdir_delayed_dir_index(ctx, &ins_list)) goto nopos;
/* * Stop new entries from being returned after we return the last * entry. * * New directory entries are assigned a strictly increasing * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. * * This is being careful not to overflow 32bit loff_t unless the * last entry requires it because doing so has broken 32bit apps * in the past.
*/ if (ctx->pos >= INT_MAX)
ctx->pos = LLONG_MAX; else
ctx->pos = INT_MAX;
nopos:
ret = 0;
err: if (put)
btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); return ret;
}
/*
 * Push the in-memory state of @inode into the tree via btrfs_update_inode().
 *
 * This is somewhat expensive, updating the tree every time the inode
 * changes. But, it is most likely to find the inode in cache.
 * FIXME, needs more benchmarking...there are no reasons other than
 * performance to keep or drop this code.
 *
 * The update is first attempted on a joined transaction. If that fails with
 * -ENOSPC or -EDQUOT, it is retried once on a freshly started transaction
 * that reserves one item.
 *
 * Returns 0 for dummy inodes (BTRFS_INODE_DUMMY set) and on success,
 * otherwise the error from joining/starting the transaction or from the
 * (last) btrfs_update_inode() call.
 */
static int btrfs_dirty_inode(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *handle;
	int err;

	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
		return 0;

	handle = btrfs_join_transaction(root);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = btrfs_update_inode(handle, inode);
	if (err == -EDQUOT || err == -ENOSPC) {
		/* whoops, lets try again with the full transaction */
		btrfs_end_transaction(handle);

		handle = btrfs_start_transaction(root, 1);
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		err = btrfs_update_inode(handle, inode);
	}

	btrfs_end_transaction(handle);

	/* Only inodes with a delayed node have delayed items to balance. */
	if (inode->delayed_node)
		btrfs_balance_delayed_items(fs_info);

	return err;
}
/* * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes.
*/ staticint btrfs_update_time(struct inode *inode, int flags)
{ struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty;
/* * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree
*/ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
{ int ret = 0;
if (dir->index_cnt == (u64)-1) {
ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) {
ret = btrfs_set_inode_index_count(dir); if (ret) return ret;
}
}
int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsignedint *trans_num_items)
{ struct inode *dir = args->dir; struct inode *inode = args->inode; int ret;
if (!args->orphan) {
ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
&args->fname); if (ret) return ret;
}
ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); if (ret) {
fscrypt_free_filename(&args->fname); return ret;
}
/* 1 to add inode item */
*trans_num_items = 1; /* 1 to add compression property */ if (BTRFS_I(dir)->prop_compress)
(*trans_num_items)++; /* 1 to add default ACL xattr */ if (args->default_acl)
(*trans_num_items)++; /* 1 to add access ACL xattr */ if (args->acl)
(*trans_num_items)++; #ifdef CONFIG_SECURITY /* 1 to add LSM xattr */ if (dir->i_security)
(*trans_num_items)++; #endif if (args->orphan) { /* 1 to add orphan item */
(*trans_num_items)++;
} else { /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item * * No need for 1 unit for the inode ref item because it is * inserted in a batch together with the inode item at * btrfs_create_new_inode().
*/
*trans_num_items += 3;
} return 0;
}
/* * Inherit flags from the parent inode. * * Currently only the compression flags and the cow flags are inherited.
*/ staticvoid btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
{ unsignedint flags;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
if (!args->subvol)
BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
root = BTRFS_I(inode)->root;
ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) goto out;
ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out;
btrfs_set_inode_number(BTRFS_I(inode), objectid);
ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); if (ret) goto out;
xa_reserved = true;
if (args->orphan) { /* * O_TMPFILE, set link count to 0, so that after this point, we * fill in an inode item with the correct link count.
*/
set_nlink(inode, 0);
} else {
trace_btrfs_inode_request(dir);
ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); if (ret) goto out;
}
if (S_ISDIR(inode->i_mode))
BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
/* * We don't have any capability xattrs set here yet, shortcut any * queries for the xattrs here. If we add them later via the inode * security init path or any other path this flag will be cleared.
*/
set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
/* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't * change it now without compatibility issues.
*/ if (!args->subvol)
btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
btrfs_set_inode_mapping_order(BTRFS_I(inode)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
btrfs_update_inode_mapping_flags(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan)
BTRFS_I(dir)->index_cnt--; goto out;
}
/* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the * old info in the log.
*/
btrfs_set_inode_full_sync(BTRFS_I(inode));
if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will * be packed into one item. Extended refs will kick in if we * add more hard links than can fit in the ref item.
*/
key[1].objectid = objectid;
key[1].type = BTRFS_INODE_REF_KEY; if (args->subvol) {
key[1].offset = objectid;
sizes[1] = 2 + sizeof(*ref);
} else {
key[1].offset = btrfs_ino(BTRFS_I(dir));
sizes[1] = name->len + sizeof(*ref);
}
}
/* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in * allocating yet another path. So just free our path.
*/
btrfs_free_path(path);
path = NULL;
if (args->subvol) { struct btrfs_inode *parent;
/* * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in.
*/
parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
parent);
iput(&parent->vfs_inode);
}
} else {
ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
BTRFS_I(dir));
} if (ret) {
btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
}
/* * Subvolumes don't inherit ACLs or get passed to the LSM. This is * probably a bug.
*/ if (!args->subvol) {
ret = btrfs_init_inode_security(trans, args); if (ret) {
btrfs_abort_transaction(trans, ret); goto discard;
}
}
ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); if (WARN_ON(ret)) { /* Shouldn't happen, we used xa_reserve() before. */
btrfs_abort_transaction(trans, ret); goto discard;
}
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) {
btrfs_abort_transaction(trans, ret); goto discard;
}
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index); if (ret) {
btrfs_abort_transaction(trans, ret); goto discard;
}
}
return 0;
discard: /* * discard_new_inode() calls iput(), but the caller owns the reference * to the inode.
*/
ihold(inode);
discard_new_inode(inode);
out: if (xa_reserved)
xa_release(&root->inodes, objectid);
btrfs_free_path(path); return ret;
}
/* * Utility function to add 'inode' into 'parent_inode' with * a given name and a given sequence number. * If 'add_backref' is true, also insert a backref from the * inode to the parent directory.
*/ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, conststruct fscrypt_str *name, int add_backref, u64 index)
{ int ret = 0; struct btrfs_key key; struct btrfs_root *root = parent_inode->root;
u64 ino = btrfs_ino(inode);
u64 parent_ino = btrfs_ino(parent_inode);
/* do not allow sys_link's with other subvols of the same device */ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK;
ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); if (ret) goto fail;
ret = btrfs_set_inode_index(BTRFS_I(dir), &index); if (ret) goto fail;
/* * 2 items for inode and inode ref * 2 items for dir items * 1 item for parent inode * 1 item for orphan item deletion if O_TMPFILE
*/
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL; goto fail;
}
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index); if (ret) goto fail;
/* Link added now we update the inode item with the new link count. */
inc_nlink(inode);
ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) {
btrfs_abort_transaction(trans, ret); goto fail;
}
if (inode->i_nlink == 1) { /* * If the new hard link count is 1, it's a file created with the * open(2) O_TMPFILE flag.
*/
ret = btrfs_orphan_del(trans, BTRFS_I(inode)); if (ret) {
btrfs_abort_transaction(trans, ret); goto fail;
}
}
/* Grab reference for the new dentry passed to d_instantiate(). */
ihold(inode);
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
fail:
fscrypt_free_filename(&fname); if (trans)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info); return ret;
}
/* * decompression code contains a memset to fill in any space between the end * of the uncompressed data and the end of max_size in case the decompressed * data ends up shorter than ram_bytes. That doesn't cover the hole between * the end of an inline extent and the beginning of the next block, so we * cover that region here.
*/
/* * Lookup the first extent overlapping a range in a file. * * @inode: file to search in * @folio: folio to read extent data into if the extent is inline * @start: file offset * @len: length of range starting at @start * * Return the first &struct extent_map which overlaps the given range, reading * it from the B-tree and caching it if necessary. Note that there may be more * extents which overlap the given range after the returned extent_map. * * If @folio is not NULL and the extent is inline, this also reads the extent * data directly into the folio and marks the extent up to date in the io_tree. * * Return: ERR_PTR on error, non-NULL extent_map on success.
*/ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode); int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; struct extent_buffer *leaf; struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree;
read_lock(&em_tree->lock);
em = btrfs_lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (em) { if (em->start > start || em->start + em->len <= start)
btrfs_free_extent_map(em); elseif (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
btrfs_free_extent_map(em); else goto out;
}
em = btrfs_alloc_extent_map(); if (!em) {
ret = -ENOMEM; goto out;
}
em->start = EXTENT_MAP_HOLE;
em->disk_bytenr = EXTENT_MAP_HOLE;
em->len = (u64)-1;
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto out;
}
/* Chances are we'll be called again, so go ahead and do readahead */
path->reada = READA_FORWARD;
/* * The same explanation in load_free_space_cache applies here as well, * we only read when we're loading the free space cache, and at that * point the commit_root has everything we need.
*/ if (btrfs_is_free_space_inode(inode)) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { goto out;
} elseif (ret > 0) { if (path->slots[0] == 0) goto not_found;
path->slots[0]--;
ret = 0;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid ||
found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll * say there is a hole for our whole search range which can * cause problems.
*/
extent_end = start; goto next;
}
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) {
ret = -EUCLEAN;
btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode)); goto out;
}
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} elseif (extent_type == BTRFS_FILE_EXTENT_INLINE) {
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
path->slots[0],
extent_start);
}
next: if (start >= extent_end) {
path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; elseif (ret > 0) goto not_found;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid ||
found_key.type != BTRFS_EXTENT_DATA_KEY) goto not_found; if (start + len <= found_key.offset) goto not_found; if (start > found_key.offset) goto next;
/* New extent overlaps with existing one */
em->start = start;
em->len = found_key.offset - start;
em->disk_bytenr = EXTENT_MAP_HOLE; goto insert;
}
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert;
} elseif (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * Inline extent can only exist at file offset 0. This is * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero.
*/
ASSERT(extent_start == 0);
ASSERT(em->start == 0);
/* * btrfs_extent_item_to_extent_map() should have properly * initialized em members already. * * Other members are not utilized for inline extents.
*/
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
block_group = btrfs_lookup_block_group(fs_info, bytenr); if (!block_group || block_group->ro)
readonly = true; if (block_group)
btrfs_put_block_group(block_group); return readonly;
}
/* * Check if we can do nocow write into the range [@offset, @offset + @len) * * @offset: File offset * @len: The length to write, will be updated to the nocow writeable * range * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write * <0 if error happened * * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents.
*/
noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait)
{ struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 };
BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type;
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
offset, 0); if (ret < 0) return ret;
if (ret == 1) { if (path->slots[0] == 0) { /* Can't find the item, must COW. */ return 0;
}
path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) { /* Not our file or wrong item type, must COW. */ return 0;
}
if (key.offset > offset) { /* Wrong offset, must COW. */ return 0;
}
if (btrfs_file_extent_end(path) <= offset) return 0;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
EXTENT_DELALLOC); if (ret) return -EAGAIN;
}
if (file_extent)
memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
*len = nocow_args.file_extent.num_bytes;
return 1;
}
/* * Allocate an extent map for an in-flight write and hand it to * btrfs_replace_extent_map_range() for @inode. * * The callers of this must take lock_extent(). * * @type must be BTRFS_ORDERED_PREALLOC, BTRFS_ORDERED_COMPRESSED or * BTRFS_ORDERED_REGULAR; this and the per-type constraints on @file_extent * are only checked with ASSERT(). * * Return: the extent map on success, ERR_PTR(-ENOMEM) if the allocation * fails, or ERR_PTR() of the error returned by * btrfs_replace_extent_map_range() (the extent map is freed in that case). */ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, conststruct btrfs_file_extent *file_extent, int type)
{ struct extent_map *em; int ret;
/* * Note the missing NOCOW type. * * For pure NOCOW writes, we should not create an io extent map, but * just reuse the existing one. * Only PREALLOC writes (NOCOW write into preallocated range) can * create an io extent map.
*/
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
type == BTRFS_ORDERED_REGULAR);
switch (type) { case BTRFS_ORDERED_PREALLOC: /* We're only referring to part of a larger preallocated extent. */
ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); break; case BTRFS_ORDERED_REGULAR: /* COW results in a new extent matching our file extent size. */
ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
/* Since it's a new extent, we should not have any offset. */
ASSERT(file_extent->offset == 0); break; case BTRFS_ORDERED_COMPRESSED: /* Must be compressed. */
ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
/* * An encoded write can make us refer to part of the * uncompressed extent.
*/
ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); break;
}
em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM);
/* * NOTE(review): in the text visible here, nothing copies @start or the * @file_extent members into em before it is handed over below; @start is * otherwise unused. Presumably the field initialization belongs at this * point - confirm against the full file.
*/
ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) {
btrfs_free_extent_map(em); return ERR_PTR(ret);
}
/* em has 2 refs now, callers need to call btrfs_free_extent_map() once. */ return em;
}
/*
 * For release_folio() and invalidate_folio() we have a race window where
 * folio_end_writeback() is called but the subpage spinlock is not yet released.
 * If we continue to release/invalidate the page, we could cause use-after-free
 * for subpage spinlock.  So this function is to spin and wait for subpage
 * spinlock.
 *
 * @folio: the folio about to be released or invalidated
 */
static void wait_subpage_spinlock(struct folio *folio)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
	struct btrfs_folio_state *bfs;

	/* Folios that are not handled as subpage have no such lock to wait on. */
	if (!btrfs_is_subpage(fs_info, folio))
		return;

	/*
	 * The lock lives in the folio's private data; load it before use
	 * instead of dereferencing an unassigned pointer.
	 */
	ASSERT(folio_test_private(folio) && folio_get_private(folio));
	bfs = folio_get_private(folio);

	/*
	 * This may look insane as we just acquire the spinlock and release it,
	 * without doing anything.  But we just want to make sure no one is
	 * still holding the subpage spinlock.
	 * And since the page is not dirty nor writeback, and we have page
	 * locked, the only possible way to hold a spinlock is from the endio
	 * function to clear page writeback.
	 *
	 * Here we just acquire the spinlock so that all existing callers
	 * should exit and we're safe to release/invalidate the page.
	 */
	spin_lock_irq(&bfs->lock);
	spin_unlock_irq(&bfs->lock);
}
/* * We have folio locked so no new ordered extent can be created on this * page, nor bio can be submitted for this folio. * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * * So here we wait for any submitted bios to finish, so that we won't * do double ordered extent accounting on the same folio.
*/
folio_wait_writeback(folio);
wait_subpage_spinlock(folio);
/* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. * If the range doesn't cover the full folio, we don't need to and * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. * * For cases that invalidate the full folio even the range doesn't * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish.
*/ if (!(offset == 0 && length == folio_size(folio))) {
btrfs_release_folio(folio, GFP_NOFS); return;
}
if (!inode_evicting)
btrfs_lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered;
u64 range_end;
u32 range_len;
u32 extra_flags = 0;
ordered = btrfs_lookup_first_ordered_range(inode, cur,
page_end + 1 - cur); if (!ordered) {
range_end = page_end; /* * No ordered extent covering this range, we are safe * to delete all extent states in the range.
*/
extra_flags = EXTENT_CLEAR_ALL_BITS; goto next;
} if (ordered->file_offset > cur) { /* * There is a range between [cur, oe->file_offset) not * covered by any ordered extent. * We are safe to delete all extent states, and handle * the ordered extent in the next iteration.
*/
range_end = ordered->file_offset - 1;
extra_flags = EXTENT_CLEAR_ALL_BITS; goto next;
}
range_end = min(ordered->file_offset + ordered->num_bytes - 1,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them.
*/ goto next;
}
btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. * * This will also unlock the range for incoming * btrfs_finish_ordered_io().
*/ if (!inode_evicting)
btrfs_clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree_lock);
/* * If the ordered extent has finished, we're safe to delete all * the extent states of the range, otherwise * btrfs_finish_ordered_io() will get executed by endio for * other pages, so we can't delete extent states.
*/ if (btrfs_dec_test_ordered_pending(inode, &ordered,
cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered); /* * The ordered extent has finished, now we're again * safe to delete all extent states of the range.
*/
extra_flags = EXTENT_CLEAR_ALL_BITS;
}
next: if (ordered)
btrfs_put_ordered_extent(ordered); /* * Qgroup reserved space handler * Sector(s) here will be either: * * 1) Already written to disk or bio already finished * Then its QGROUP_RESERVED bit in io_tree is already cleared. * Qgroup will be handled by its qgroup_record then. * btrfs_qgroup_free_data() call will do nothing here. * * 2) Not written to disk yet * Then btrfs_qgroup_free_data() call will clear the * QGROUP_RESERVED bit of its io_tree, and free the qgroup * reserved data space. * Since the IO will never happen for this page.
*/
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting)
btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG | extra_flags,
&cached_state);
cur = range_end + 1;
} /* * We have iterated through all ordered extents of the page, the page * should not have Ordered anymore, or the above iteration * did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_folio_extent_mapped(folio);
}
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode,
inode->vfs_inode.i_size & (~mask),
(u64)-1); if (ret) return ret;
}
/* * Yes ladies and gentlemen, this is indeed ugly. We have a couple of * things going on here: * * 1) We need to reserve space to update our inode. * * 2) We need to have something to cache all the space that is going to * be freed up by the truncate operation, but also have some slack * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * * And we need these to be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode. We also * need to be able to stop the transaction and start a new one, which * means we need to be able to update the inode several times, and we * have no way of knowing how many times that will be, so we can't just * reserve 1 item for the entirety of the operation, so that has to be * done separately as well. * * So that leaves us with * * 1) rsv - for the truncate reservation, which we will steal from the * transaction reservation. * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for * updating the inode.
*/
btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
rsv.size = min_size;
rsv.failfast = true;
/* * 1 for the truncate slack space * 1 for updating the inode.
*/
trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out;
}
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false); /* * We have reserved 2 metadata units when we started the transaction and * min_size matches 1 unit, so this should never fail, but if it does, * it's not critical we just fail truncation.
*/ if (WARN_ON(ret)) {
btrfs_end_transaction(trans); goto out;
}
control.new_size = new_size;
btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is.
*/
btrfs_drop_extent_map_range(inode,
ALIGN(new_size, fs_info->sectorsize),
(u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL; break;
}
btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
&rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never * fail, but if it does, it's not critical we just fail truncation.
*/ if (WARN_ON(ret)) break;
trans->block_rsv = &rsv;
}
/* * We can't call btrfs_truncate_block inside a trans handle as we could * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we * know we've truncated everything except the last little bit, and can * do btrfs_truncate_block and then update the disk_i_size.
*/ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
inode->vfs_inode.i_size, (u64)-1); if (ret) goto out;
trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out;
}
btrfs_inode_safe_disk_i_size_write(inode, 0);
}
if (trans) { int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret)
ret = ret2;
ret2 = btrfs_end_transaction(trans); if (ret2 && !ret)
ret = ret2;
btrfs_btree_balance_dirty(fs_info);
}
out:
btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to * first truncate that entire inode. So set this flag so we write out * all of the extents in the inode to the sync log so we're completely * safe. * * If no extents were dropped or trimmed we don't need to force the next * fsync to truncate all the inode's items from the log and re-log them * all. This means the truncate operation did not change the file size, * or changed it to a smaller size but there was only an implicit hole * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop.
*/ if (control.extents_found > 0)
btrfs_set_inode_full_sync(inode);
inode = new_inode(dir->i_sb); if (inode) { /* * Subvolumes don't inherit the sgid bit or the parent's gid if * the parent's sgid bit is set. This is probably a bug.
*/
inode_init_owner(idmap, inode, NULL,
S_IFDIR | (~current_umask() & S_IRWXUGO));
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
} return inode;
}
WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
WARN_ON(vfs_inode->i_data.nrpages);
WARN_ON(inode->block_rsv.reserved);
WARN_ON(inode->block_rsv.size);
WARN_ON(inode->outstanding_extents); if (!S_ISDIR(vfs_inode->i_mode)) {
WARN_ON(inode->delalloc_bytes);
WARN_ON(inode->new_delalloc_bytes);
WARN_ON(inode->csum_bytes);
} if (!root || !btrfs_is_data_reloc_root(root))
WARN_ON(inode->defrag_bytes);
/* * This can happen where we create an inode, but somebody else also * created the same inode and we need to destroy the one we already * created.
*/ if (!root) return;
/* * If this is a free space inode do not take the ordered extents lockdep * map.
*/
freespace_inode = btrfs_is_free_space_inode(inode);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else {
btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
if (!freespace_inode)
btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
/*
 * Tear down the slab cache that backs btrfs inodes.
 */
void __cold btrfs_destroy_cachep(void)
{
	/*
	 * Inodes freed through RCU may still be queued; wait for every
	 * pending RCU callback to run so none of them touches the cache
	 * after it has been destroyed.
	 */
	rcu_barrier();

	kmem_cache_destroy(btrfs_inode_cachep);
}
int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once); if (!btrfs_inode_cachep) return -ENOMEM;
/* * For non-subvolumes allow exchange only within one subvolume, in the * same inode namespace. Two subvolumes (represented as directory) can * be exchanged as they're a logical link and have a fixed inode number.
*/ if (root != dest &&
(old_ino != BTRFS_FIRST_FREE_OBJECTID ||
new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV;
ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); if (ret) return ret;
ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); if (ret) {
fscrypt_free_filename(&old_fname); return ret;
}
/* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
/* * For each inode: * 1 to remove old dir item * 1 to remove old dir index * 1 to add new dir item * 1 to add new dir index * 1 to update parent inode * * If the parents are the same, we only need to account for one
*/
trans_num_items = (old_dir == new_dir ? 9 : 10); if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* * 1 to remove old root ref * 1 to remove old root backref * 1 to add new root ref * 1 to add new root backref
*/
trans_num_items += 4;
} else { /* * 1 to update inode item * 1 to remove old inode ref * 1 to add new inode ref
*/
trans_num_items += 3;
} if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
trans_num_items += 4; else
trans_num_items += 3;
trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out_notrans;
}
if (dest != root) {
ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail;
}
/* * We need to find a free sequence number both in the source and * in the destination directory for the exchange.
*/
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); if (ret) goto out_fail;
ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); if (ret) goto out_fail;
/* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
btrfs_ino(BTRFS_I(new_dir)),
old_idx); if (ret) goto out_fail;
need_abort = true;
}
/* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx); if (ret) { if (need_abort)
btrfs_abort_transaction(trans, ret); goto out_fail;
}
}
/* Update inode version and ctime/mtime. */
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
new_ino != BTRFS_FIRST_FREE_OBJECTID) { /* * If we are renaming in the same directory (and it's not for * root entries) pin the log early to prevent any concurrent * task from logging the directory after we removed the old * entries and before we add the new entries, otherwise that * task can sync a log without any entry for the inodes we are * renaming and therefore replaying that log, if a power failure * happens after syncing the log, would result in deleting the * inodes. * * If the rename affects two different directories, we want to * make sure the that there's no log commit that contains * updates for only one of the directories but not for the * other. * * If we are renaming an entry for a root, we don't care about * log updates since we called btrfs_set_log_full_commit().
*/
btrfs_pin_log_trans(root);
btrfs_pin_log_trans(dest);
logs_pinned = true;
}
/* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
}
/* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_name, 0, old_idx); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
old_name, 0, new_idx); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = old_idx; if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;
/* * Do the log updates for all inodes. * * If either entry is for a root we don't need to update the logs since * we've called btrfs_set_log_full_commit() before.
*/ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
}
out_fail: if (logs_pinned) {
btrfs_end_log_trans(root);
btrfs_end_log_trans(dest);
}
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY;
ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); if (ret) return ret;
ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); if (ret) {
fscrypt_free_filename(&old_fname); return ret;
}
/* check for collisions, even if the name isn't there */
ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get
* eexist without a new_inode */ if (WARN_ON(!new_inode)) { goto out_fscrypt_names;
}
} else { /* maybe -EOVERFLOW */ goto out_fscrypt_names;
}
}
ret = 0;
/* * we're using rename to replace one file with another. Start IO on it * now so we don't add too much work to the end of the transaction
*/ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
if (flags & RENAME_WHITEOUT) {
whiteout_args.inode = new_whiteout_inode(idmap, old_dir); if (!whiteout_args.inode) {
ret = -ENOMEM; goto out_fscrypt_names;
}
ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); if (ret) goto out_whiteout_inode;
} else { /* 1 to update the old parent inode. */
trans_num_items = 1;
}
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* Close the race window with snapshot create/destroy ioctl */
down_read(&fs_info->subvol_sem); /* * 1 to remove old root ref * 1 to remove old root backref * 1 to add new root ref * 1 to add new root backref
*/
trans_num_items += 4;
} else { /* * 1 to update inode * 1 to remove old inode ref * 1 to add new inode ref
*/
trans_num_items += 3;
} /* * 1 to remove old dir item * 1 to remove old dir index * 1 to add new dir item * 1 to add new dir index
*/
trans_num_items += 4; /* 1 to update new parent inode if it's not the same as the old parent */ if (new_dir != old_dir)
trans_num_items++; if (new_inode) { /* * 1 to update inode * 1 to remove inode ref * 1 to remove dir item * 1 to remove dir index * 1 to possibly add orphan item
*/
trans_num_items += 5;
}
trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out_notrans;
}
if (dest != root) {
ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail;
}
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) goto out_fail;
BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
old_ino, btrfs_ino(BTRFS_I(new_dir)),
index); if (ret) goto out_fail;
}
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { /* * If we are renaming in the same directory (and it's not a * root entry) pin the log to prevent any concurrent task from * logging the directory after we removed the old entry and * before we add the new entry, otherwise that task can sync * a log without any entry for the inode we are renaming and * therefore replaying that log, if a power failure happens * after syncing the log, would result in deleting the inode. * * If the rename affects two different directories, we want to * make sure the that there's no log commit that contains * updates for only one of the directories but not for the * other. * * If we are renaming an entry for a root, we don't care about * log updates since we called btrfs_set_log_full_commit().
*/
btrfs_pin_log_trans(root);
btrfs_pin_log_trans(dest);
logs_pinned = true;
}
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
} else {
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
}
if (new_inode) {
inode_inc_iversion(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
} if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry))); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
}
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
&new_fname.disk_name, 0, index); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
}
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_fail;
} else {
unlock_new_inode(whiteout_args.inode);
iput(whiteout_args.inode);
whiteout_args.inode = NULL;
}
}
out_fail: if (logs_pinned) {
btrfs_end_log_trans(root);
btrfs_end_log_trans(dest);
}
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem); if (flags & RENAME_WHITEOUT)
btrfs_new_inode_args_destroy(&whiteout_args);
out_whiteout_inode: if (flags & RENAME_WHITEOUT)
iput(whiteout_args.inode);
out_fscrypt_names:
fscrypt_free_filename(&old_fname);
fscrypt_free_filename(&new_fname); return ret;
}
/* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk.
*/ staticint start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context)
{ struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX;
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush.
*/ if (nr == LONG_MAX)
wbc.nr_to_write = LONG_MAX;
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */
ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); if (ret < 0) return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi, true, qgroup_released); if (ret) goto free_qgroup; return trans;
}
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto free_qgroup;
}
ret = btrfs_replace_file_extents(inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path); if (ret) goto free_qgroup; return trans;
free_qgroup: /* * We have released qgroup data range at the beginning of the function, * and normally qgroup_released bytes will be freed when committing * transaction. * But if we error out early, we have to free what we have released * or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
btrfs_root_id(inode->root), qgroup_released,
BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret);
}
if (trans)
own_trans = false; while (num_bytes > 0) {
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really * small allocations, so if the allocator is returning small * chunks lets make its job easier by only searching for those * sized chunks.
*/
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) break;
/* * We've reserved this space, and thus converted it from * ->bytes_may_use to ->bytes_reserved. Any error that happens * from here on out we will only need to clear our reservation * for the remaining unreserved area, so advance our * clear_offset by our extent size.
*/
clear_offset += ins.offset;
last_alloc = ins.offset;
trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
&ins, cur_offset); /* * Now that we inserted the prealloc extent we can finally * decrement the number of reservations in the block group. * If we did it before, we could race with relocation and have * relocation miss the reserved extent, making it fail later.
*/
btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, false); break;
}
em = btrfs_alloc_extent_map(); if (!em) {
btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset - 1, false);
btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next;
}
new_inode_args.inode = inode;
ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) goto out_inode;
trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out_new_inode_args;
}
ret = btrfs_create_new_inode(trans, &new_inode_args);
/* * We set number of links to 0 in btrfs_create_new_inode(), and here we * set it to 1 because d_tmpfile() will issue a warning if the count is * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
*/
set_nlink(inode, 1);
if (!ret) {
d_tmpfile(file, inode);
unlock_new_inode(inode);
mark_inode_dirty(inode);
}
/*
 * Map an extent's BTRFS_COMPRESS_* type to the corresponding
 * BTRFS_ENCODED_IO_COMPRESSION_* value.
 *
 * @fs_info:       Filesystem info; only consulted for the sector size when
 *                 the extent is LZO compressed.
 * @compress_type: One of the BTRFS_COMPRESS_* values.
 *
 * Returns the BTRFS_ENCODED_IO_COMPRESSION_* value on success, -EINVAL if the
 * extent is LZO compressed and the sector size is outside [4K, 64K], or
 * -EUCLEAN if @compress_type is not a recognized compression type.
 */
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
					     int compress_type)
{
	/* Types that translate one-to-one, independent of the sector size. */
	if (compress_type == BTRFS_COMPRESS_NONE)
		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
	if (compress_type == BTRFS_COMPRESS_ZLIB)
		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
	if (compress_type == BTRFS_COMPRESS_ZSTD)
		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;

	/* Anything that is not LZO at this point is an unknown type. */
	if (compress_type != BTRFS_COMPRESS_LZO)
		return -EUCLEAN;

	/*
	 * The LZO format depends on the sector size, and 64K is the largest
	 * sector size that is supported.
	 */
	if (!(fs_info->sectorsize >= SZ_4K && fs_info->sectorsize <= SZ_64K))
		return -EINVAL;

	/*
	 * 12 is log2(4K), so a 4K sector size yields the LZO_4K value and
	 * each doubling of the sector size yields the value after it.
	 */
	return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
	       (fs_info->sectorsize_bits - 12);
}
if (bbio->bio.bi_status) { /* * The memory barrier implied by the refcount_dec_and_test() here * pairs with the memory barrier implied by the refcount_dec_and_test() * in btrfs_encoded_read_regular_fill_pages() to ensure that * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages().
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
} if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status));
if (uring_ctx) { if (refcount_dec_and_test(&priv->pending_refs)) {
ret = blk_status_to_errno(READ_ONCE(priv->status));
btrfs_uring_read_extent_endio(uring_ctx, ret);
kfree(priv); return ret;
}
return -EIOCBQUEUED;
} else { if (!refcount_dec_and_test(&priv->pending_refs))
wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ return blk_status_to_errno(READ_ONCE(priv->status));
}
}
ret = btrfs_inode_lock(inode,
BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); if (ret) return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0;
}
start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); /* * We don't know how long the extent containing iocb->ki_pos is, but if * it's compressed we know that it won't be longer than this.
*/
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
if (nowait) { struct btrfs_ordered_extent *ordered;
if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
start, lockend)) {
ret = -EAGAIN; goto out_unlock_inode;
}
if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
ret = -EAGAIN; goto out_unlock_inode;
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); if (IS_ERR(em)) {
ret = PTR_ERR(em); goto out_unlock_extent;
}
if (em->disk_bytenr == EXTENT_MAP_INLINE) {
u64 extent_start = em->start;
/* * For inline extents we get everything we need out of the * extent item.
*/
btrfs_free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
cached_state, extent_start,
count, encoded, &unlocked); goto out_unlock_extent;
}
/* * We only want to return up to EOF even if the extent extends beyond * that.
*/
encoded->len = min_t(u64, btrfs_extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
*disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} elseif (btrfs_extent_map_is_compressed(em)) {
*disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent.
*/ if (em->disk_num_bytes > count) {
ret = -ENOBUFS; goto out_em;
}
*disk_io_size = em->disk_num_bytes;
count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_extent_map_compression(em)); if (ret < 0) goto out_em;
encoded->compression = ret;
} else {
*disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count)
encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do.
*/
*disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
*disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
btrfs_free_extent_map(em);
em = NULL;
if (*disk_bytenr == EXTENT_MAP_HOLE) {
btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter); if (ret != count)
ret = -EFAULT;
} else {
ret = -EIOCBQUEUED; goto out_unlock_extent;
}
out_em:
btrfs_free_extent_map(em);
out_unlock_extent: /* Leave inode and extent locked if we need to do a read. */ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode: if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret;
}
switch (encoded->compression) { case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
compression = BTRFS_COMPRESS_ZLIB; break; case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
compression = BTRFS_COMPRESS_ZSTD; break; case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: /* The sector size must match for LZO. */ if (encoded->compression -
BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
fs_info->sectorsize_bits) return -EINVAL;
compression = BTRFS_COMPRESS_LZO; break; default: return -EINVAL;
} if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL;
/* * Compressed extents should always have checksums, so error out if we * have a NOCOW file or inode was created while mounted with NODATASUM.
*/ if (inode->flags & BTRFS_INODE_NODATASUM) return -EINVAL;
orig_count = iov_iter_count(from);
/* The extent size must be sane. */ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) return -EINVAL;
/* * The compressed data must be smaller than the decompressed data. * * It's of course possible for data to compress to larger or the same * size, but the buffered I/O path falls back to no compression for such * data, and we don't want to break any assumptions by creating these * extents. * * Note that this is less strict than the current check we have that the * compressed data must be at least one sector smaller than the * decompressed data. We only want to enforce the weaker requirement * from old kernels that it is at least one byte smaller.
*/ if (orig_count >= encoded->unencoded_len) return -EINVAL;
/* The extent must start on a sector boundary. */
start = iocb->ki_pos; if (!IS_ALIGNED(start, fs_info->sectorsize)) return -EINVAL;
/* * The extent must end on a sector boundary. However, we allow a write * which ends at or extends i_size to have an unaligned length; we round * up the extent size and set i_size to the unaligned end.
*/ if (start + encoded->len < inode->vfs_inode.i_size &&
!IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) return -EINVAL;
/* Finally, the offset in the unencoded data must be sector-aligned. */ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) return -EINVAL;
/* * If the extent cannot be inline, the compressed data on disk must be * sector-aligned. For convenience, we extend it with zeroes if it * isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); if (!folios) return -ENOMEM; for (i = 0; i < nr_folios; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr;
folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); if (!folios[i]) {
ret = -ENOMEM; goto out_folios;
}
kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap_local(kaddr);
ret = -EFAULT; goto out_folios;
} if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
kunmap_local(kaddr);
}
for (;;) { struct btrfs_ordered_extent *ordered;
ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT); if (ret) goto out_folios;
btrfs_lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered)
btrfs_put_ordered_extent(ordered);
btrfs_unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
/* * We don't use the higher-level delalloc space functions because our * num_bytes and disk_num_bytes are different.
*/
ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); if (ret) goto out_unlock;
ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); if (ret) goto out_free_data_space;
ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, false); if (ret) goto out_qgroup_free_data;
/* Try an inline extent first. */ if (encoded->unencoded_len == encoded->len &&
encoded->unencoded_offset == 0 &&
can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
ret = __cow_file_range_inline(inode, encoded->len,
orig_count, compression, folios[0], true); if (ret <= 0) { if (ret == 0)
ret = orig_count; goto out_delalloc_release;
}
}
ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
disk_num_bytes, 0, 0, &ins, 1, 1); if (ret) goto out_delalloc_release;
extent_reserved = true;
btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
ret = orig_count; goto out;
out_free_reserved:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_delalloc_release:
btrfs_delalloc_release_extents(inode, num_bytes);
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data: if (ret < 0)
btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented * bytes_may_use.
*/ if (!extent_reserved)
btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
btrfs_unlock_extent(io_tree, start, end, &cached_state);
out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i])
folio_put(folios[i]);
}
kvfree(folios);
out: if (ret >= 0)
iocb->ki_pos += encoded->len; return ret;
}
#ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a * negative errno on failure.
*/ staticint btrfs_add_swapfile_pin(struct inode *inode, void *ptr, bool is_block_group)
{ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp, *entry; struct rb_node **p; struct rb_node *parent = NULL;
/* * Our swapfile may have had its size extended after the swap header was * written. In that case activating the swapfile should not go beyond * the max size set in the swap header.
*/ if (bsi->nr_pages >= sis->max) return 0;
/* * Acquire the inode's mmap lock to prevent races with memory mapped * writes, as they could happen after we flush delalloc below and before * we lock the extent range further below. The inode was already locked * up in the call chain.
*/
btrfs_assert_inode_locked(BTRFS_I(inode));
down_write(&BTRFS_I(inode)->i_mmap_lock);
/* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care.
*/
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) goto out_unlock_mmap;
/* * The inode is locked, so these flags won't change after we check them.
*/ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL; goto out_unlock_mmap;
} if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
ret = -EINVAL; goto out_unlock_mmap;
} if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
ret = -EINVAL; goto out_unlock_mmap;
}
path = btrfs_alloc_path();
backref_ctx = btrfs_alloc_backref_share_check_ctx(); if (!path || !backref_ctx) {
ret = -ENOMEM; goto out_unlock_mmap;
}
/* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't * run concurrently while we are mapping the swap extents, and * fs_info->swapfile_pins prevents them from running while the swap * file is active and moving the extents. Note that this also prevents * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it.
*/ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running");
ret = -EBUSY; goto out_unlock_mmap;
}
/* * Prevent snapshot creation while we are activating the swap file. * We do not want to race with snapshot creation. If snapshot creation * already started before we bumped nr_swapfiles from 0 to 1 and * completes before the first write into the swap file after it is * activated, than that write would fallback to COW.
*/ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress");
ret = -EINVAL; goto out_unlock_mmap;
} /* * Snapshots can create extents which require COW even if NODATACOW is * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. * * It is possible that subvolume is marked for deletion but still not * removed yet. To prevent this race, we check the root status before * activating the swapfile.
*/
spin_lock(&root->root_item_lock); if (btrfs_root_dead(root)) {
spin_unlock(&root->root_item_lock);
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted",
btrfs_root_id(root));
ret = -EPERM; goto out_unlock_mmap;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out;
/* * If key not found it means we have an implicit hole (NO_HOLES * is enabled).
*/ if (ret > 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL; goto out;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be * big enough to store more than the swap header, but in * case something changes in the future, let's catch it * here rather than later.
*/
btrfs_warn(fs_info, "swapfile must not be inline");
ret = -EINVAL; goto out;
}
if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL; goto out;
}
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); if (disk_bytenr == 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL; goto out;
}
if (prev_extent_end > isize)
len = isize - key.offset; else
len = btrfs_file_extent_num_bytes(leaf, ei);
backref_ctx->curr_leaf_bytenr = leaf->start;
/* * Don't need the path anymore, release to avoid deadlocks when * calling btrfs_is_data_extent_shared() because when joining a * transaction it can block waiting for the current one's commit * which in turn may be trying to lock the same leaf to flush * delayed items for example.
*/
btrfs_release_path(path);
ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
extent_gen, backref_ctx); if (ret < 0) { goto out;
} elseif (ret > 0) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
ret = -EINVAL; goto out;
}
map = btrfs_get_chunk_map(fs_info, logical_block_start, len); if (IS_ERR(map)) {
ret = PTR_ERR(map); goto out;
}
if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info, "swapfile must have single data profile");
ret = -EINVAL; goto out;
}
if (device == NULL) {
device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1)
ret = 0; elseif (ret) goto out;
} elseif (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL; goto out;
}
bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) {
btrfs_warn(fs_info, "could not find block group containing swapfile");
ret = -EINVAL; goto out;
}
if (!btrfs_inc_block_group_swap_extents(bg)) {
btrfs_warn(fs_info, "block group for swapfile at %llu is read-only%s",
bg->start,
atomic_read(&fs_info->scrubs_running) ? " (scrub running)" : "");
btrfs_put_block_group(bg);
ret = -EINVAL; goto out;
}
ret = btrfs_add_swapfile_pin(inode, bg, true); if (ret) {
btrfs_put_block_group(bg); if (ret == 1)
ret = 0; else goto out;
}
if (bsi.block_len &&
bsi.block_start + bsi.block_len == physical_block_start) {
bsi.block_len += len;
} else { if (bsi.block_len) {
ret = btrfs_add_swap_extent(sis, &bsi); if (ret) goto out;
}
bsi.start = key.offset;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
if (fatal_signal_pending(current)) {
ret = -EINTR; goto out;
}
cond_resched();
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out: if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
/*
 * Adjust the number of bytes used that is recorded in the VFS inode.
 *
 * When extents in a range are replaced (clone, dedupe, fallocate's zero
 * range), the subtraction and the addition have to be applied as one atomic
 * step so that concurrent stat(2) calls always observe a correct value.
 *
 * @inode:     The inode whose byte count is updated.
 * @add_bytes: Number of bytes to add.
 * @del_bytes: Number of bytes to subtract.
 */
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
			      const u64 add_bytes,
			      const u64 del_bytes)
{
	/* Equal amounts cancel out, so the lock is only taken otherwise. */
	if (add_bytes != del_bytes) {
		spin_lock(&inode->lock);
		if (del_bytes)
			inode_sub_bytes(&inode->vfs_inode, del_bytes);
		if (add_bytes)
			inode_add_bytes(&inode->vfs_inode, add_bytes);
		spin_unlock(&inode->lock);
	}
}
/*
 * Check that a file range has no ordered extents and assert if it does.
 *
 * @inode: The target inode.
 * @start: Start offset of the file range, should be sector size aligned.
 * @end:   End offset (inclusive) of the file range, its value +1 should be
 *         sector size aligned.
 *
 * Intended for callers that hold the inode's VFS lock and i_mmap_lock in
 * exclusive mode, have flushed all delalloc in the range, have waited for all
 * ordered extents in the range to complete and have locked the file range in
 * the inode's io_tree.
 *
 * Does nothing unless CONFIG_BTRFS_ASSERT is enabled.
 */
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_ordered_extent *ordered;
	u64 range_len;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	/* @end is inclusive, the lookup takes a length. */
	range_len = end + 1 - start;
	ordered = btrfs_lookup_first_ordered_range(inode, start, range_len);

	/* Clean range: the assertion below would hold, nothing to report. */
	if (!ordered)
		return;

	btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
		  start, end, btrfs_ino(inode), btrfs_root_id(root),
		  ordered->file_offset,
		  ordered->file_offset + ordered->num_bytes - 1);
	/* Drop our reference; only the pointer value is compared below. */
	btrfs_put_ordered_extent(ordered);

	ASSERT(ordered == NULL);
}
/* * Find the first inode with a minimum number. * * @root: The root to search for. * @min_ino: The minimum inode number. * * Find the first inode in the @root with a number >= @min_ino and return it. * Returns NULL if no such inode found.
*/ struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
{ struct btrfs_inode *inode; unsignedlong from = min_ino;
xa_lock(&root->inodes); while (true) {
inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); if (!inode) break; if (igrab(&inode->vfs_inode)) break;
from = btrfs_ino(inode) + 1;
cond_resched_lock(&root->inodes.xa_lock);
}
xa_unlock(&root->inodes);
/* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume * these extents won't change over the life of the file and they * use the bmap result to do IO directly to the drive. * * the btrfs bmap call would return logical addresses that aren't * suitable for IO and they also will change frequently as COW * operations happen. So, swapfile + btrfs == corruption. * * For now we're avoiding this by dropping bmap.
*/ staticconststruct address_space_operations btrfs_aops = {
.read_folio = btrfs_read_folio,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.invalidate_folio = btrfs_invalidate_folio,
.launder_folio = btrfs_launder_folio,
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
.error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.