/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     u64 pos, u64 copied)
{
	const u64 range_start = round_down(pos, fs_info->sectorsize);
	const u64 range_len = round_up(pos + copied, fs_info->sectorsize) - range_start;

	ASSERT(range_len <= U32_MAX);
	/*
	 * The "checked" flag is some magic around finding folios that were
	 * modified without going through btrfs_dirty_folio(); clear it for
	 * the block-aligned written range.
	 *
	 * No need to mark the folio accessed here either, as
	 * prepare_one_folio() should have already marked it accessed via
	 * find_or_create_page().
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, range_start, range_len);
	folio_unlock(folio);
	folio_put(folio);
}
/*
 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
 * - Mark newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	/* Nothing copied, nothing to account for. */
	if (write_bytes == 0)
		return 0;

	/*
	 * The caller did not reserve data space for this range (nocow case),
	 * flag it so delalloc accounting does not charge it.
	 */
	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	/*
	 * Expand the dirtied range to full blocks; the io tree and delalloc
	 * accounting work at block granularity.  These were previously used
	 * uninitialized, which is undefined behavior.
	 */
	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize);
	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			       cached);

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (ret)
		return ret;

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}
/* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number * that would be a good hint to the block allocator for this file. * * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. * * Note: the VFS' inode number of bytes is not updated, it's up to the caller * to deal with that. We set the field 'bytes_found' of the arguments structure * with the number of allocated bytes found in the target range, so that the * caller can update the inode's number of bytes in an atomic way when * replacing extents in a range to avoid races with stat(2).
*/ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
u64 search_start = args->start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
u64 extent_offset = 0;
u64 extent_end = 0;
u64 last_end = args->start; int del_nr = 0; int del_slot = 0; int extent_type; int recow; int ret; int modify_tree = -1; int update_refs; int found = 0; struct btrfs_path *path = args->path;
/* * Don't skip extent items representing 0 byte lengths. They * used to be created (bug) if while punching holes we hit * -ENOSPC condition. So if we find one here, just ensure we * delete it, otherwise we would insert a new file extent item * with the same key (offset) as that 0 bytes length file * extent item in the call to setup_items_for_insert() later * in this function.
*/ if (extent_end == key.offset && extent_end >= search_start) {
last_end = extent_end; goto delete_extent_item;
}
if (extent_end <= search_start) {
path->slots[0]++; goto next_slot;
}
found = 1;
search_start = max(key.offset, args->start); if (recow || !modify_tree) {
modify_tree = -1;
btrfs_release_path(path); continue;
}
/* * | - range to drop - | * | -------- extent -------- |
*/ if (args->start > key.offset && args->end < extent_end) { if (WARN_ON(del_nr > 0)) {
btrfs_print_leaf(leaf);
ret = -EINVAL; break;
} if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP; break;
}
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = args->start;
ret = btrfs_duplicate_item(trans, root, path,
&new_key); if (ret == -EAGAIN) {
btrfs_release_path(path); continue;
} if (ret < 0) break;
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr); if (ret) {
btrfs_abort_transaction(trans, ret); break;
}
del_nr = 0;
del_slot = 0;
btrfs_release_path(path); continue;
}
BUG();
}
if (!ret && del_nr > 0) { /* * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted * path->slots[0] for our insertion (if args->replace_extent).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret)
btrfs_abort_transaction(trans, ret);
}
leaf = path->nodes[0]; /* * If btrfs_del_items() was called, it might have deleted a leaf, in * which case it unlocked our path, so check path->locks[0] matches a * write lock.
*/ if (!ret && args->replace_extent &&
path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) {
/* * Mark extent in the range start - end as written. * * This changes extent type from 'pre-allocated' to 'regular'. If only * part of extent is marked as written, the extent will be split into * two or three.
*/ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end)
{ struct btrfs_root *root = inode->root; struct extent_buffer *leaf;
BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key;
u64 bytenr;
u64 num_bytes;
u64 extent_end;
u64 orig_offset;
u64 other_start;
u64 other_end;
u64 split; int del_nr = 0; int del_slot = 0; int recow; int ret = 0;
u64 ino = btrfs_ino(inode);
/*
 * On error return an unlocked folio and the error value
 * On success return a locked folio and 0
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len)
{
	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
	const u64 range_start = max_t(u64, pos, folio_pos(folio));
	const u64 range_end = min_t(u64, pos + len, folio_end(folio));
	int ret;

	/* Already fully uptodate, no read needed. */
	if (folio_test_uptodate(folio))
		return 0;

	/*
	 * The write covers only whole blocks of this folio; every touched
	 * block will be completely overwritten, so no read is needed either.
	 */
	if (IS_ALIGNED(range_start, blocksize) && IS_ALIGNED(range_end, blocksize))
		return 0;

	ret = btrfs_read_folio(NULL, folio);
	if (ret)
		return ret;
	folio_lock(folio);
	if (!folio_test_uptodate(folio)) {
		folio_unlock(folio);
		return -EIO;
	}

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the page.  Here we check both inode mapping and page
	 * private to make sure the page was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * extra bitmap using folio private.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
		folio_unlock(folio);
		return -EAGAIN;
	}
	return 0;
}
ret = set_folio_extent_mapped(folio); if (ret < 0) {
folio_unlock(folio);
folio_put(folio); return ret;
}
ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. */
folio_put(folio); if (!nowait && ret == -EAGAIN) {
ret = 0; goto again;
} return ret;
}
*folio_ret = folio; return 0;
}
/* * Locks the extent and properly waits for data=ordered extents to finish * before allowing the folios to be modified if need. * * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK * -EAGAIN - need to prepare the folios again
*/ static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
loff_t pos, size_t write_bytes,
u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start_pos;
u64 last_pos; int ret = 0;
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
/* * We should be called after prepare_one_folio() which should have locked * all pages in the range.
*/
WARN_ON(!folio_test_locked(folio));
return ret;
}
/* * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) * * @pos: File offset. * @write_bytes: The length to write, will be updated to the nocow writeable * range. * @nowait: Indicate if we can block or not (non-blocking IO context). * * This function will flush ordered extents in the range to ensure proper * nocow checks. * * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. * -EAGAIN If we can't do a nocow write because snapshoting of the inode's * root is in progress or because we are in a non-blocking IO * context and need to block (@nowait is true). * < 0 If an error happened. * * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
*/ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
u64 cur_offset; int ret = 0;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0;
if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN;
ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait); if (ret <= 0) { /* * If cur_offset == lockstart it means we haven't found * any extent against which we can NOCOW, so unlock the * snapshot lock.
*/ if (cur_offset == lockstart)
btrfs_drew_write_unlock(&root->snapshot_lock); break;
}
cur_offset += num_bytes;
}
/* * cur_offset > lockstart means there's at least a partial range we can * NOCOW, and that range can cover one or more extents.
*/ if (cur_offset > lockstart) {
*write_bytes = min_t(size_t, *write_bytes, cur_offset - pos); return 1;
}
/* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or * prealloc flags, as without those flags we always have to COW. We will * later check if we can really COW into the target range (using * can_nocow_extent() at btrfs_get_blocks_direct_write()).
*/ if ((iocb->ki_flags & IOCB_NOWAIT) &&
!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN;
ret = file_remove_privs(file); if (ret) return ret;
/* * We reserve space for updating the inode when we reserve space for the * extent we are going to write, so we will enospc out there. We don't * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write.
*/ if (!IS_NOCMTIME(inode)) {
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
inode_inc_iversion(inode);
}
oldsize = i_size_read(inode); if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); if (ret) return ret;
}
/*
 * Reserve data and metadata space for this buffered write range.
 *
 * Return >0 for the number of bytes reserved, which is always block aligned.
 * Return <0 for error.
 */
static ssize_t reserve_space(struct btrfs_inode *inode,
			     struct extent_changeset **data_reserved,
			     u64 start, size_t *len, bool nowait,
			     bool *only_release_metadata)
{
	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
	size_t reserve_bytes;
	int ret;

	ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
	if (ret < 0) {
		int nocow_ret;

		if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
			return -EAGAIN;

		/*
		 * If we don't have to COW at the offset, reserve metadata only.
		 * write_bytes may get smaller than requested here.
		 */
		nocow_ret = btrfs_check_nocow_lock(inode, start, len, nowait);
		if (nocow_ret < 0)
			return nocow_ret;
		/* Can't NOCOW either, propagate the data reservation error. */
		if (nocow_ret == 0)
			return ret;
		*only_release_metadata = true;
	}

	/* Round up to cover every block touched by the write. */
	reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
	WARN_ON(reserve_bytes == 0);

	ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
					      reserve_bytes, nowait);
	if (ret == 0)
		return reserve_bytes;

	/* Metadata reservation failed, undo what was reserved above. */
	if (*only_release_metadata)
		btrfs_check_nocow_unlock(inode);
	else
		btrfs_free_reserved_data_space(inode, *data_reserved, start, *len);

	if (nowait && ret == -ENOSPC)
		ret = -EAGAIN;
	return ret;
}
/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ staticvoid shrink_reserved_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
u64 reserved_start, u64 reserved_len,
u64 new_len, bool only_release_metadata)
{ const u64 diff = reserved_len - new_len;
/* Calculate the maximum amount of bytes we can write into one folio. */ static size_t calc_write_bytes(conststruct btrfs_inode *inode, conststruct iov_iter *iter, u64 start)
{ const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
/* * Do the heavy-lifting work to copy one range into one folio of the page cache. * * Return > 0 in case we copied all bytes or just some of them. * Return 0 if no bytes were copied, in which case the caller should retry. * Return <0 on error.
*/ staticint copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, struct extent_changeset **data_reserved, u64 start, bool nowait)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL;
size_t write_bytes = calc_write_bytes(inode, iter, start);
size_t copied; const u64 reserved_start = round_down(start, fs_info->sectorsize);
u64 reserved_len; struct folio *folio = NULL; int extents_locked;
u64 lockstart;
u64 lockend; bool only_release_metadata = false; constunsignedint bdp_flags = (nowait ? BDP_ASYNC : 0); int ret;
/* * Fault all pages before locking them in prepare_one_folio() to avoid * recursive lock.
*/ if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) return -EFAULT;
extent_changeset_release(*data_reserved);
ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
&only_release_metadata); if (ret < 0) return ret;
reserved_len = ret; /* Write range must be inside the reserved range. */
ASSERT(reserved_start <= start);
ASSERT(start + write_bytes <= reserved_start + reserved_len);
again:
ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
bdp_flags); if (ret) {
btrfs_delalloc_release_extents(inode, reserved_len);
release_space(inode, *data_reserved, reserved_start, reserved_len,
only_release_metadata); return ret;
}
ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); if (ret) {
btrfs_delalloc_release_extents(inode, reserved_len);
release_space(inode, *data_reserved, reserved_start, reserved_len,
only_release_metadata); return ret;
}
/* * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary.
*/ if (reserved_start + reserved_len > folio_end(folio)) { const u64 last_block = folio_end(folio);
if (unlikely(copied < write_bytes)) {
u64 last_block;
/* * The original write range doesn't need an uptodate folio as * the range is block aligned. But now a short copy happened. * We cannot handle it without an uptodate folio. * * So just revert the range and we will retry.
*/ if (!folio_test_uptodate(folio)) {
iov_iter_revert(iter, copied);
copied = 0;
}
/* No copied bytes, unlock, release reserved space and exit. */ if (copied == 0) { if (extents_locked)
btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
&cached_state); else
btrfs_free_extent_state(cached_state);
btrfs_delalloc_release_extents(inode, reserved_len);
release_space(inode, *data_reserved, reserved_start, reserved_len,
only_release_metadata);
btrfs_drop_folio(fs_info, folio, start, copied); return 0;
}
/* Release the reserved space beyond the last block. */
last_block = round_up(start + copied, fs_info->sectorsize);
ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
only_release_metadata); /* * If we have not locked the extent range, because the range's start * offset is >= i_size, we might still have a non-NULL cached extent * state, acquired while marking the extent range as delalloc through * btrfs_dirty_page(). Therefore free any possible cached extent state * to avoid a memory leak.
*/ if (extents_locked)
btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); else
btrfs_free_extent_state(cached_state);
btrfs_delalloc_release_extents(inode, reserved_len); if (ret) {
btrfs_drop_folio(fs_info, folio, start, copied);
release_space(inode, *data_reserved, reserved_start, reserved_len,
only_release_metadata); return ret;
} if (only_release_metadata)
btrfs_check_nocow_unlock(inode);
ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret;
/* * We can only trust the isize with inode lock held, or it can race with * other buffered writes and cause incorrect call of * pagecache_isize_extended() to overwrite existing data.
*/
old_isize = i_size_read(inode);
ret = generic_write_checks(iocb, iter); if (ret <= 0) goto out;
ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out;
pos = iocb->ki_pos; while (iov_iter_count(iter) > 0) {
ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); if (ret < 0) break;
pos += ret;
num_written += ret;
cond_resched();
}
/* * If the fs flips readonly due to some impossible error, although we * have opened a file as writable, we have to stop this write operation * to ensure consistency.
*/ if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS;
if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP;
if (private) {
kfree(private->filldir_buf);
btrfs_free_extent_state(private->llseek_cached_state);
kfree(private);
filp->private_data = NULL;
}
/* * Set by setattr when we are about to truncate a file from a non-zero * size to a zero size. This tries to flush down new bytes that may * have been written if the application were using truncate to replace * a file in place.
*/ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping); return 0;
}
/* * This is only called in fsync, which would do synchronous writes, so * a plug can merge adjacent IOs as much as possible. Esp. in case of * multiple disks using raid profile, a large IO can be split to * several segments of stripe length (currently 64K).
*/
blk_start_plug(&plug);
ret = btrfs_fdatawrite_range(inode, start, end);
blk_finish_plug(&plug);
if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
list_empty(&ctx->ordered_extents)) returntrue;
/* * If we are doing a fast fsync we can not bail out if the inode's * last_trans is <= then the last committed transaction, because we only * update the last_trans of the inode during ordered extent completion, * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete.
*/ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents))) returntrue;
returnfalse;
}
/* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. * * It needs to call filemap_fdatawait so that all ordered extent updates are * in the metadata btree are up to date for copying to the log. * * It drops the inode mutex before doing the tree log commit. This is an * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk.
*/ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{ struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err;
u64 len; bool full_sync; bool skip_ilock = false;
/* * Always set the range to a full range, otherwise we can get into * several problems, from missing file extent items to represent holes * when not using the NO_HOLES feature, to log tree corruption due to * races between hole detection during logging and completion of ordered * extents outside the range, to missing checksums due to ordered extents * for which we flushed only a subset of their pages.
*/
start = 0;
end = LLONG_MAX;
len = (u64)LLONG_MAX + 1;
/* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. See * btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
ret = start_ordered_ops(inode, start, end); if (ret) goto out;
if (skip_ilock)
down_write(&inode->i_mmap_lock); else
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
/* * Before we acquired the inode's lock and the mmap lock, someone may * have dirtied more pages in the target range. We need to make sure * that writeback for any such pages does not start while we are logging * the inode, because if it does, any of the following might happen when * we are not doing a full inode sync: * * 1) We log an extent after its writeback finishes but before its * checksums are added to the csum tree, leading to -EIO errors * when attempting to read the extent after a log replay. * * 2) We can end up logging an extent before its writeback finishes. * Therefore after the log replay we will have a file extent item * pointing to an unwritten extent (and no data checksums as well). * * So trigger writeback for any eventual new dirty pages and then we * wait for all ordered extents to complete below.
*/
ret = start_ordered_ops(inode, start, end); if (ret) { if (skip_ilock)
up_write(&inode->i_mmap_lock); else
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out;
}
/* * Always check for the full sync flag while holding the inode's lock, * to avoid races with other tasks. The flag must be either set all the * time during logging or always off all the time while logging. * We check the flag here after starting delalloc above, because when * running delalloc the full sync flag may be set if we need to drop * extra extent map ranges due to temporary memory allocation failures.
*/
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * * For a full fsync we wait for the ordered extents to complete while * for a fast fsync we wait just for writeback to complete, and then * attach the ordered extents to the transaction so that a transaction * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. * * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the * logical address recorded in the ordered extent may change. We need * to wait for the IO to stabilize the logical address.
*/ if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
} else { /* * Get our ordered extents as soon as possible to avoid doing * checksum lookups in the csum tree, and use instead the * checksums attached to the ordered extents.
*/
btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end); if (ret) goto out_release_extents;
/* * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after * starting and waiting for writeback, because for buffered IO * it may have been set during the end IO callback * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in * case an error happened and we need to wait for ordered * extents to complete so that any extent maps that point to * unwritten locations are dropped and we don't log them.
*/ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret) goto out_release_extents;
atomic_inc(&root->log_batch);
if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant.
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. So check the inode's mapping * for any errors that might have happened since we last * checked called fsync.
*/
ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err); goto out_release_extents;
}
btrfs_init_log_ctx_scratch_eb(&ctx);
/* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for * example checking cross references in the nocow path). If we use join * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*.
*/
trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); goto out_release_extents;
}
trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx); /* * Scratch eb no longer needed, release before syncing log or commit * transaction, to avoid holding unnecessary memory during such long * operations.
*/ if (ctx.scratch_eb) {
free_extent_buffer(ctx.scratch_eb);
ctx.scratch_eb = NULL;
}
btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */
ret = BTRFS_LOG_FORCE_COMMIT;
}
/* we've logged all the items and now have a consistent * version of the file in the log. It is possible that * someone will come in and modify the file, but that's * fine because the log is consistent on disk, and we * have references to all of the file's extents * * It is possible that someone will come in and log the * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe.
*/ if (skip_ilock)
up_write(&inode->i_mmap_lock); else
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans); goto out;
}
/* We successfully logged the inode, attempt to sync the log. */ if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx); if (!ret) {
ret = btrfs_end_transaction(trans); goto out;
}
}
/* * At this point we need to commit the transaction because we had * btrfs_need_log_full_commit() or some other error. * * If we didn't do a full sync we have to stop the trans handle, wait on * the ordered extents, start it again and commit the transaction. If * we attempt to wait on the ordered extents here we could deadlock with * something like fallocate() that is holding the extent lock trying to * start a transaction while some other thread is trying to commit the * transaction while we (fsync) are currently holding the transaction * open.
*/ if (!full_sync) {
ret = btrfs_end_transaction(trans); if (ret) goto out;
ret = btrfs_wait_ordered_range(inode, start, len); if (ret) goto out;
/* * This is safe to use here because we're only interested in * making sure the transaction that had the ordered extents is * committed. We aren't waiting on anything past this point, * we're purely getting the transaction and committing it.
*/
trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
/* * We committed the transaction and there's no currently * running transaction, this means everything we care * about made it to disk and we are done.
*/ if (ret == -ENOENT)
ret = 0; goto out;
}
}
ret = btrfs_commit_transaction(trans);
out:
free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.list));
ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file); if (!ret)
ret = err; return ret > 0 ? -EIO : ret;
/* * btrfs_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate_setsize() writes the inode size before removing pages, once we have * the page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page.
*/ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{ struct page *page = vmf->page; struct folio *folio = page_folio(page); struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; unsignedlong zero_start;
loff_t size;
size_t fsize = folio_size(folio); int ret; bool only_release_metadata = false;
u64 reserved_space;
u64 page_start;
u64 page_end;
u64 end;
/* * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function.
*/
ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
reserved_space, false); if (ret < 0) {
size_t write_bytes = reserved_space;
if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) goto out_noreserve;
only_release_metadata = true;
/* * Can't write the whole range, there may be shared extents or * holes in the range, bail out with @only_release_metadata set * to true so that we unlock the nocow lock before returning the * error.
*/ if (write_bytes < reserved_space) goto out_noreserve;
}
ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
reserved_space, false); if (ret < 0) { if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
page_start, reserved_space); goto out_noreserve;
}
ret = file_update_time(vmf->vma->vm_file); if (ret < 0) goto out;
again:
down_read(&inode->i_mmap_lock);
folio_lock(folio);
size = i_size_read(&inode->vfs_inode);
if ((folio->mapping != inode->vfs_inode.i_mapping) ||
(page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock;
}
folio_wait_writeback(folio);
btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
ret = set_folio_extent_mapped(folio); if (ret < 0) {
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock;
}
/* * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish.
*/
ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); if (ordered) {
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
up_read(&inode->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered); goto again;
}
end = page_start + reserved_space - 1; if (only_release_metadata)
btrfs_delalloc_release_metadata(inode, to_free, true); else
btrfs_delalloc_release_space(inode, data_reserved,
end + 1, to_free, true);
}
}
/* * page_mkwrite gets called when the page is firstly dirtied after it's * faulted in, but write(2) could also dirty a page and set delalloc * bits, thus in this case for space account reason, we still need to * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments).
*/
btrfs_clear_extent_bit(io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); if (ret < 0) {
btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock;
}
/* Page is wholly or partially inside EOF. */ if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size); else
zero_start = fsize;
if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(inode);
if (only_release_metadata)
btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
&cached_state);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret <= 0) { /* * We should have dropped this offset, so if we find it then * something has gone horribly wrong.
*/ if (ret == 0)
ret = -EINVAL; return ret;
}
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
btrfs_free_extent_map(hole_em); if (ret)
btrfs_set_inode_full_sync(inode);
}
return 0;
}
/* * Find a hole extent on given inode and change start/len to the end of hole * extent.(hole/vacuum extent whose em->start <= start && * em->start + em->len > start) * When a hole extent is found, return 1 and modify start/len.
*/ staticint find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
{ struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em; int ret = 0;
em = btrfs_get_extent(inode, NULL,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em);
/* * Check if there is no folio in the range. * * We cannot utilize filemap_range_has_page() in a filemap with large folios * as we can hit the following false positive: * * start end * | | * |//|//|//|//| | | | | | | | |//|//| * \ / \ / * Folio A Folio B * * That large folio A and B cover the start and end indexes. * In that case filemap_range_has_page() will always return true, but the above * case is fine for btrfs_punch_hole_lock_range() usage. * * So here we only ensure that no other folios is in the range, excluding the * head/tail large folio.
*/ staticbool check_range_has_page(struct inode *inode, u64 start, u64 end)
{ struct folio_batch fbatch; bool ret = false; /*
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.13 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.