/* * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file * range from the inode's io tree, unlock the subvolume tree search path, flush * the fiemap cache and relock the file range and research the subvolume tree. * The value here is something negative that can't be confused with a valid * errno value and different from 1 because that's also a return value from * fiemap_fill_next_extent() and also it's often used to mean some btree search * did not find a key, so make it some distinct negative value.
*/ #define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
/* * Used to: * * - Cache the next entry to be emitted to the fiemap buffer, so that we can * merge extents that are contiguous and can be grouped as a single one; * * - Store extents ready to be written to the fiemap buffer in an intermediary * buffer. This intermediary buffer is to ensure that in case the fiemap * buffer is memory mapped to the fiemap target file, we don't deadlock * during btrfs_page_mkwrite(). This is because during fiemap we are locking * an extent range in order to prevent races with delalloc flushing and * ordered extent completion, which is needed in order to reliably detect * delalloc in holes and prealloc extents. And this can lead to a deadlock * if the fiemap buffer is memory mapped to the file we are running fiemap * against (a silly, useless in practice scenario, but possible) because * btrfs_page_mkwrite() will try to lock the same extent range.
*/ struct fiemap_cache { /* An array of ready fiemap entries. */ struct btrfs_fiemap_entry *entries; /* Number of entries in the entries array. */ int entries_size; /* Index of the next entry in the entries array to write to. */ int entries_pos; /* * Once the entries array is full, this indicates what's the offset for * the next file extent item we must search for in the inode's subvolume * tree after unlocking the extent range in the inode's io tree and * releasing the search path.
*/
u64 next_search_offset; /* * This matches struct fiemap_extent_info::fi_mapped_extents, we use it * to count ourselves emitted extents and stop instead of relying on * fiemap_fill_next_extent() because we buffer ready fiemap entries at * the @entries array, and we want to stop as soon as we hit the max * amount of extents to map, not just to save time but also to make the * logic at extent_fiemap() simpler.
*/ unsignedint extents_mapped; /* Fields for the cached extent (unsubmitted, not ready, extent). */
u64 offset;
u64 phys;
u64 len;
u32 flags; bool cached;
};
staticint flush_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache)
{ for (int i = 0; i < cache->entries_pos; i++) { struct btrfs_fiemap_entry *entry = &cache->entries[i]; int ret;
ret = fiemap_fill_next_extent(fieinfo, entry->offset,
entry->phys, entry->len,
entry->flags); /* * Ignore 1 (reached max entries) because we keep track of that * ourselves in emit_fiemap_extent().
*/ if (ret < 0) return ret;
}
cache->entries_pos = 0;
return 0;
}
/* * Helper to submit fiemap extent. * * Will try to merge current fiemap extent specified by @offset, @phys, * @len and @flags with cached one. * And only when we fails to merge, cached one will be submitted as * fiemap extent. * * Return value is the same as fiemap_fill_next_extent().
*/ staticint emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{ struct btrfs_fiemap_entry *entry;
u64 cache_end;
/* Set at the end of extent_fiemap(). */
ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
if (!cache->cached) goto assign;
/* * When iterating the extents of the inode, at extent_fiemap(), we may * find an extent that starts at an offset behind the end offset of the * previous extent we processed. This happens if fiemap is called * without FIEMAP_FLAG_SYNC and there are ordered extents completing * after we had to unlock the file range, release the search path, emit * the fiemap extents stored in the buffer (cache->entries array) and * the lock the remainder of the range and re-search the btree. * * For example we are in leaf X processing its last item, which is the * file extent item for file range [512K, 1M[, and after * btrfs_next_leaf() releases the path, there's an ordered extent that * completes for the file range [768K, 2M[, and that results in trimming * the file extent item so that it now corresponds to the file range * [512K, 768K[ and a new file extent item is inserted for the file * range [768K, 2M[, which may end up as the last item of leaf X or as * the first item of the next leaf - in either case btrfs_next_leaf() * will leave us with a path pointing to the new extent item, for the * file range [768K, 2M[, since that's the first key that follows the * last one we processed. So in order not to report overlapping extents * to user space, we trim the length of the previously cached extent and * emit it. * * Upon calling btrfs_next_leaf() we may also find an extent with an * offset smaller than or equals to cache->offset, and this happens * when we had a hole or prealloc extent with several delalloc ranges in * it, but after btrfs_next_leaf() released the path, delalloc was * flushed and the resulting ordered extents were completed, so we can * now have found a file extent item for an offset that is smaller than * or equals to what we have in cache->offset. We deal with this as * described below.
*/
cache_end = cache->offset + cache->len; if (cache_end > offset) { if (offset == cache->offset) { /* * We cached a dealloc range (found in the io tree) for * a hole or prealloc extent and we have now found a * file extent item for the same offset. What we have * now is more recent and up to date, so discard what * we had in the cache and use what we have just found.
*/ goto assign;
} elseif (offset > cache->offset) { /* * The extent range we previously found ends after the * offset of the file extent item we found and that * offset falls somewhere in the middle of that previous * extent range. So adjust the range we previously found * to end at the offset of the file extent item we have * just found, since this extent is more up to date. * Emit that adjusted range and cache the file extent * item we have just found. This corresponds to the case * where a previously found file extent item was split * due to an ordered extent completing.
*/
cache->len = offset - cache->offset; goto emit;
} else { const u64 range_end = offset + len;
/* * The offset of the file extent item we have just found * is behind the cached offset. This means we were * processing a hole or prealloc extent for which we * have found delalloc ranges (in the io tree), so what * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously * emitted or only a part of that range. * * We have two cases here: * * 1) The file extent item's range ends at or behind the * cached extent's end. In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the * file extent item.
*/ if (range_end <= cache_end) return 0;
/* * Only merges fiemap extents if * 1) Their logical addresses are continuous * * 2) Their physical addresses are continuous * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * * 3) Share same flags
*/ if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
cache->flags == flags) {
cache->len += len; return 0;
}
emit: /* Not mergeable, need to submit cached one */
if (cache->entries_pos == cache->entries_size) { /* * We will need to research for the end offset of the last * stored extent and not from the current offset, because after * unlocking the range and releasing the path, if there's a hole * between that end offset and this current offset, a new extent * may have been inserted due to a new write, so we don't want * to miss it.
*/
entry = &cache->entries[cache->entries_size - 1];
cache->next_search_offset = entry->offset + entry->len;
cache->cached = false;
/* * Emit last fiemap cache * * The last fiemap cache may still be cached in the following case: * 0 4k 8k * |<- Fiemap range ->| * |<------------ First extent ----------->| * * In this case, the first extent range will be cached but not emitted. * So we must emit it before ending extent_fiemap().
*/ staticint emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache)
{ int ret;
if (!cache->cached) return 0;
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
cache->cached = false; if (ret > 0)
ret = 0; return ret;
}
staticint fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{ struct extent_buffer *clone = path->nodes[0]; struct btrfs_key key; int slot; int ret;
path->slots[0]++; if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) return 0;
/* * Add a temporary extra ref to an already cloned extent buffer to * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid * the cost of allocating a new one.
*/
ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
refcount_inc(&clone->refs);
ret = btrfs_next_leaf(inode->root, path); if (ret != 0) goto out;
/* * Don't bother with cloning if there are no more file extent items for * our inode.
*/
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
ret = 1; goto out;
}
/* * Important to preserve the start field, for the optimizations when * checking if extents are shared (see extent_fiemap()). * * We must set ->start before calling copy_extent_buffer_full(). If we * are on sub-pagesize blocksize, we use ->start to determine the offset * into the folio where our eb exists, and if we update ->start after * the fact then any subsequent reads of the eb may read from a * different offset in the folio than where we originally copied into.
*/
clone->start = path->nodes[0]->start; /* See the comment at fiemap_search_slot() about why we clone. */
copy_extent_buffer_full(clone, path->nodes[0]);
/* * Search for the first file extent item that starts at a given file offset or * the one that starts immediately before that offset. * Returns: 0 on success, < 0 on error, 1 if not found.
*/ staticint fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
u64 file_offset)
{ const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; struct extent_buffer *clone; struct btrfs_key key; int slot; int ret;
/* * We clone the leaf and use it during fiemap. This is because while * using the leaf we do expensive things like checking if an extent is * shared, which can take a long time. In order to prevent blocking * other tasks for too long, we use a clone of the leaf. We have locked * the file range in the inode's io tree, so we know none of our file * extent items can change. This way we avoid blocking other tasks that * want to insert items for other inodes in the same leaf or b+tree * rebalance operations (triggered for example when someone is trying * to push items into this leaf when trying to insert an item in a * neighbour leaf). * We also need the private clone because holding a read lock on an * extent buffer of the subvolume's b+tree will make lockdep unhappy * when we check if extents are shared, as backref walking may need to * lock the same leaf we are processing.
*/
clone = btrfs_clone_extent_buffer(path->nodes[0]); if (!clone) return -ENOMEM;
/* * Process a range which is a hole or a prealloc extent in the inode's subvolume * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc * extent. The end offset (@end) is inclusive.
*/ staticint fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, struct extent_state **delalloc_cached_state, struct btrfs_backref_share_check_ctx *backref_ctx,
u64 disk_bytenr, u64 extent_offset,
u64 extent_gen,
u64 start, u64 end)
{ const u64 i_size = i_size_read(&inode->vfs_inode);
u64 cur_offset = start;
u64 last_delalloc_end = 0;
u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; bool checked_extent_shared = false; int ret;
/* * There can be no delalloc past i_size, so don't waste time looking for * it beyond i_size.
*/ while (cur_offset < end && cur_offset < i_size) {
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
u64 prealloc_len = 0; bool delalloc;
/* * If this is a prealloc extent we have to report every section * of it that has no delalloc.
*/ if (disk_bytenr != 0) { if (last_delalloc_end == 0) {
prealloc_start = start;
prealloc_len = delalloc_start - start;
} else {
prealloc_start = last_delalloc_end + 1;
prealloc_len = delalloc_start - prealloc_start;
}
}
if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) {
ret = btrfs_is_data_extent_shared(inode,
disk_bytenr,
extent_gen,
backref_ctx); if (ret < 0) return ret; elseif (ret > 0)
prealloc_flags |= FIEMAP_EXTENT_SHARED;
/* * Either we found no delalloc for the whole prealloc extent or we have * a prealloc extent that spans i_size or starts at or after i_size.
*/ if (disk_bytenr != 0 && last_delalloc_end < end) {
u64 prealloc_start;
u64 prealloc_len;
/* * Lookup the last file extent. We're not using i_size here because * there might be preallocation past i_size.
*/
ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); /* There can't be a file extent item at offset (u64)-1 */
ASSERT(ret != 0); if (ret < 0) return ret;
/* * For a non-existing key, btrfs_search_slot() always leaves us at a * slot > 0, except if the btree is empty, which is impossible because * at least it has the inode item for this inode and all the items for * the root inode 256.
*/
ASSERT(path->slots[0] > 0);
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { /* No file extent items in the subvolume tree. */
*last_extent_end_ret = 0; return 0;
}
/* * For an inline extent, the disk_bytenr is where inline data starts at, * so first check if we have an inline extent item before checking if we * have an implicit hole (disk_bytenr == 0).
*/
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
*last_extent_end_ret = btrfs_file_extent_end(path); return 0;
}
/* * Find the last file extent item that is not a hole (when NO_HOLES is * not enabled). This should take at most 2 iterations in the worst * case: we have one hole file extent item at slot 0 of a leaf and * another hole file extent item as the last item in the previous leaf. * This is because we merge file extent items that represent holes.
*/
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); while (disk_bytenr == 0) {
ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); if (ret < 0) { return ret;
} elseif (ret > 0) { /* No file extent items that are not holes. */
*last_extent_end_ret = 0; return 0;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
}
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) goto out_unlock;
btrfs_release_path(path);
path->reada = READA_FORWARD;
ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { goto out_unlock;
} elseif (ret > 0) { /* * No file extent item found, but we may have delalloc between * the current offset and i_size. So check for that.
*/
ret = 0; goto check_eof_delalloc;
}
/* * The first iteration can leave us at an extent item that ends * before our range's start. Move to the next item.
*/ if (extent_end <= range_start) goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
prev_extent_end, hole_end); if (ret < 0) { goto out_unlock;
} elseif (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */
stopped = true; break;
}
/* We've reached the end of the fiemap range, stop. */ if (key.offset >= range_end) {
stopped = true; break;
}
}
if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
btrfs_release_path(path);
ret = flush_fiemap_cache(fieinfo, &cache); if (ret) goto out;
len -= cache.next_search_offset - start;
start = cache.next_search_offset; goto restart;
} elseif (ret < 0) { goto out;
}
/* * Must free the path before emitting to the fiemap buffer because we * may have a non-cloned leaf and if the fiemap buffer is memory mapped * to a file, a write into it (through btrfs_page_mkwrite()) may trigger * waiting for an ordered extent that in order to complete needs to * modify that leaf, therefore leading to a deadlock.
*/
btrfs_free_path(path);
path = NULL;
ret = flush_fiemap_cache(fieinfo, &cache); if (ret) goto out;
int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{ struct btrfs_inode *btrfs_inode = BTRFS_I(inode); int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret;
/* * fiemap_prep() called filemap_write_and_wait() for the whole possible * file range (0 to LLONG_MAX), but that is not enough if we have * compression enabled. The first filemap_fdatawrite_range() only kicks * in the compression of data (in an async thread) and will return * before the compression is done and writeback is started. A second * filemap_fdatawrite_range() is needed to wait for the compression to * complete and writeback to start. We also need to wait for ordered * extents to complete, because our fiemap implementation uses mainly * file extent items to list the extents, searching for extent maps * only for file ranges with holes or prealloc extents to figure out * if we have delalloc in those ranges.
*/ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); if (ret) return ret;
}
/* * We did an initial flush to avoid holding the inode's lock while * triggering writeback and waiting for the completion of IO and ordered * extents. Now after we locked the inode we do it again, because it's * possible a new write may have happened in between those two steps.
*/ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); if (ret) {
btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); return ret;
}
}
ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
return ret;
}
Messung V0.5
¤ Dauer der Verarbeitung: 0.25 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.