/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with large number of references).
 */
#define SEND_MAX_EXTENT_REFS	1024
/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
	union {
		struct {
			char *start;
			char *end;

			char *buf;
			/* NOTE(review): de-fused "unsignedshort" -> "unsigned short". */
			unsigned short buf_len:15;
			unsigned short reversed:1;
			/* Small paths live inline in the tail of the struct. */
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, we'll have
		 * better packing in the slab and higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};

/* Bytes available in inline_buf before a heap buffer must be allocated. */
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS	17

/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE	128
/*
 * A backref cache entry maps a leaf to a list of IDs of roots from which the
 * leaf is accessible and we can use for clone operations.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
 * x86_64), matching the kmalloc-192 slab.
 * (NOTE(review): the original comment said "12" roots and "128 bytes", which
 * contradicts the macro value of 17 and the 192-byte figure documented at the
 * macro's definition - corrected here.)
 */
struct backref_cache_entry {
	/* Embedded LRU cache entry; must stay at offset 0 (asserted below). */
	struct btrfs_lru_cache_entry entry;
	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
	/* Number of valid elements in the root_ids array. */
	int num_roots;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct backref_cache_entry, entry) == 0);
/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE		64

/*
 * Max number of entries in the cache used for directory utimes updates.
 * (NOTE(review): the original comment here was a verbatim copy of the
 * created-dirs cache comment above - confirm intended wording.) The size
 * analysis is the same: raw struct btrfs_lru_cache_entry entries, at most
 * 4096 bytes since the kmalloc-64 slab is used (64 bytes * 64).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE		64
struct send_ctx { struct file *send_filp;
loff_t send_off; char *send_buf;
u32 send_size;
u32 send_max_size; /* * Whether BTRFS_SEND_A_DATA attribute was already added to current * command (since protocol v2, data must be the last attribute).
*/ bool put_data; struct page **send_buf_pages;
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */
u32 proto;
/* current state of the compare_tree call */ struct btrfs_path *left_path; struct btrfs_path *right_path; struct btrfs_key *cmp_key;
/* * Keep track of the generation of the last transaction that was used * for relocating a block group. This is periodically checked in order * to detect if a relocation happened since the last check, so that we * don't operate on stale extent buffers for nodes (level >= 1) or on * stale disk_bytenr values of file extent items.
*/
u64 last_reloc_trans;
/* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode.
*/
u64 cur_ino;
u64 cur_inode_gen;
u64 cur_inode_size;
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset; struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; bool ignore_cur_inode; bool cur_inode_needs_verity; void *verity_descriptor;
/* * The inode we are currently processing. It's not NULL only when we * need to issue write commands for data extents from this inode.
*/ struct inode *cur_inode; struct file_ra_state ra;
u64 page_cache_clear_start; bool clean_page_cache;
/* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of * directories such that a directory with a lower inode number was * the parent of a directory with a higher inode number, and the one * becoming the new parent got renamed too, we can't rename/move the * directory with lower inode number when we finish processing it - we * must process the directory with higher inode number first, then * rename/move it and then rename/move the directory with lower inode * number. Example follows. * * Tree state when the first send was performed: * * . * |-- a (ino 257) * |-- b (ino 258) * | * | * |-- c (ino 259) * | |-- d (ino 260) * | * |-- c2 (ino 261) * * Tree state when the second (incremental) send is performed: * * . * |-- a (ino 257) * |-- b (ino 258) * |-- c2 (ino 261) * |-- d2 (ino 260) * |-- cc (ino 259) * * The sequence of steps that lead to the second state was: * * mv /a/b/c/d /a/b/c2/d2 * mv /a/b/c /a/b/c2/d2/cc * * "c" has lower inode number, but we can't move it (2nd mv operation) * before we move "d", which has higher inode number. * * So we just memorize which move/rename operations must be performed * later when their respective parent is processed and moved/renamed.
*/
/* * Reverse index, indexed by the inode number of a directory that * is waiting for the move/rename of its immediate parent before its * own move/rename can be performed.
*/ struct rb_root waiting_dir_moves;
/* * A directory that is going to be rm'ed might have a child directory * which is in the pending directory moves index above. In this case, * the directory can only be removed after the move/rename of its child * is performed. Example: * * Parent snapshot: * * . (ino 256) * |-- a/ (ino 257) * |-- b/ (ino 258) * |-- c/ (ino 259) * | |-- x/ (ino 260) * | * |-- y/ (ino 261) * * Send snapshot: * * . (ino 256) * |-- a/ (ino 257) * |-- b/ (ino 258) * |-- YY/ (ino 261) * |-- x/ (ino 260) * * Sequence of steps that lead to the send snapshot: * rm -f /a/b/c/foo.txt * mv /a/b/y /a/b/YY * mv /a/b/c/x /a/b/YY * rmdir /a/b/c * * When the child is processed, its move/rename is delayed until its * parent is processed (as explained above), but all other operations * like update utimes, chown, chgrp, etc, are performed and the paths * that it uses for those operations must use the orphanized name of * its parent (the directory we're going to rm later), so we need to * memorize that name. * * Indexed by the inode number of the directory to be deleted.
*/ struct rb_root orphan_dirs;
/*
 * A directory whose move/rename must be deferred until its parent directory
 * has been processed; kept in the sctx->waiting_dir_moves rbtree.
 */
struct waiting_dir_move {
	struct rb_node node;
	/* Inode number of the directory waiting to be moved/renamed. */
	u64 ino;
	/*
	 * There might be some directory that could not be removed because it
	 * was waiting for this directory inode to be moved first. Therefore
	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
	 */
	u64 rmdir_ino;
	/* NOTE(review): presumably the generation of rmdir_ino - confirm at users. */
	u64 rmdir_gen;
	/* True if this directory was given a unique "orphan" name. */
	bool orphanized;
};
/*
 * Cached result of a name/parent lookup for an inode, stored in the LRU name
 * cache so repeated path construction avoids tree searches.
 */
struct name_cache_entry {
	/*
	 * The key in the entry is an inode number, and the generation matches
	 * the inode's generation.
	 */
	struct btrfs_lru_cache_entry entry;
	u64 parent_ino;
	u64 parent_gen;
	/* Cached return value handed back to callers of the lookup. */
	int ret;
	/* When set, the entry must be refreshed once the inode was processed. */
	int need_later_update;
	/* Name length without NUL terminator. */
	int name_len;
	/* Not NUL terminated. */
	char name[] __counted_by(name_len) __nonstring;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct name_cache_entry, entry) == 0);
/* * Allocate to the next largest kmalloc bucket size, to let * the fast path happen most of the time.
*/
len = kmalloc_size_roundup(len); /* * First time the inline_buf does not suffice
*/ if (p->buf == p->inline_buf) {
tmp_buf = kmalloc(len, GFP_KERNEL); if (tmp_buf)
memcpy(tmp_buf, p->buf, old_buf_len);
} else {
tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
} if (!tmp_buf) return -ENOMEM;
p->buf = tmp_buf;
p->buf_len = len;
/* * For each command/item we want to send to userspace, we call this function.
*/ staticint begin_cmd(struct send_ctx *sctx, int cmd)
{ struct btrfs_cmd_header *hdr;
/* * Helper function to iterate the entries in ONE btrfs_inode_ref or * btrfs_inode_extref. * The iterate callback may return a non zero value to stop iteration. This can * be a negative value for error codes or 1 to simply stop it. * * path must point to the INODE_REF or INODE_EXTREF when called.
*/ staticint iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve,
iterate_inode_ref_t iterate, void *ctx)
{ struct extent_buffer *eb = path->nodes[0]; struct btrfs_inode_ref *iref; struct btrfs_inode_extref *extref; struct btrfs_path *tmp_path; struct fs_path *p;
u32 cur = 0;
u32 total; int slot = path->slots[0];
u32 name_len; char *start; int ret = 0;
u64 dir; unsignedlong name_off; unsignedlong elem_size; unsignedlong ptr;
p = fs_path_alloc_reversed(); if (!p) return -ENOMEM;
tmp_path = alloc_path_for_send(); if (!tmp_path) {
fs_path_free(p); return -ENOMEM;
}
/*
 * Callback invoked for every entry of a btrfs_dir_item by iterate_dir_item().
 * NOTE(review): de-fused "typedefint"/"constchar" tokens from extraction.
 */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  void *ctx);
/* * Helper function to iterate the entries in ONE btrfs_dir_item. * The iterate callback may return a non zero value to stop iteration. This can * be a negative value for error codes or 1 to simply stop it. * * path must point to the dir item when called.
*/ staticint iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
iterate_dir_item_t iterate, void *ctx)
{ int ret = 0; struct extent_buffer *eb; struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; int buf_len;
u32 name_len;
u32 data_len;
u32 cur;
u32 len;
u32 total; int slot; int num;
/* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater * than the page size, attempt to increase the buffer. Typically xattr * values are small.
*/
buf_len = PATH_MAX;
buf = kmalloc(buf_len, GFP_KERNEL); if (!buf) {
ret = -ENOMEM; goto out;
}
eb = path->nodes[0];
slot = path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
cur = 0;
len = 0;
total = btrfs_item_size(eb, slot);
num = 0; while (cur < total) {
name_len = btrfs_dir_name_len(eb, di);
data_len = btrfs_dir_data_len(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG; goto out;
} if (name_len + data_len >
BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
ret = -E2BIG; goto out;
}
} else { /* * Path too long
*/ if (name_len + data_len > PATH_MAX) {
ret = -ENAMETOOLONG; goto out;
}
}
ret = fs_path_copy(pt, p); if (ret < 0) return ret;
/* we want the first only */ return 1;
}
/*
 * Retrieve the first path of an inode. If an inode has more than one
 * ref/hardlink, this is ignored.
 *
 * Returns 0 on success, 1 if the inode has no ref item, or a negative errno.
 */
static int get_inode_path(struct btrfs_root *root,
			  u64 ino, struct fs_path *path)
{
	int ret;
	struct btrfs_key key, found_key;
	struct btrfs_path *p;

	p = alloc_path_for_send();
	if (!p)
		return -ENOMEM;

	/*
	 * NOTE(review): the search key was used uninitialized in the original
	 * text (setup lines lost). Position the search at the first possible
	 * ref item of @ino; INODE_REF sorts before INODE_EXTREF for the same
	 * objectid, matching the found_key check below.
	 */
	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		/* Walked past the end of the tree: no ref item exists. */
		ret = 1;
		goto out;
	}

	btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
	if (found_key.objectid != ino ||
	    (found_key.type != BTRFS_INODE_REF_KEY &&
	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
		ret = -ENOENT;
		goto out;
	}

	/* Resolve only the first name of the inode into @path. */
	ret = iterate_inode_ref(root, p, &found_key, 1,
				__copy_first_ref, path);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	btrfs_free_path(p);
	return ret;
}
/*
 * Context handed to iterate_backrefs() while walking the backrefs of the file
 * extent currently being processed.
 */
struct backref_ctx {
	struct send_ctx *sctx;

	/* number of total found references */
	u64 found;

	/*
	 * used for clones found in send_root. clones found behind cur_objectid
	 * and cur_offset are not considered as allowed clones.
	 */
	u64 cur_objectid;
	u64 cur_offset;

	/* may be truncated in case it's the last extent in a file */
	u64 extent_len;

	/* The bytenr the file extent item we are processing refers to. */
	u64 bytenr;

	/* The owner (root id) of the data backref for the current extent. */
	u64 backref_owner;

	/* The offset of the data backref for the current extent. */
	u64 backref_offset;
};
if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root)) return -1; if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root)) return 1; return 0;
}
/*
 * Called for every backref that is found for the current extent.
 * Results are collected in sctx->clone_roots->ino/offset.
 *
 * Returns 0 to continue iteration, BTRFS_ITERATE_EXTENT_INODES_STOP once a
 * perfect clone candidate was found.
 */
static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
			    void *ctx_)
{
	struct backref_ctx *bctx = ctx_;
	struct clone_root *clone_root;

	/* First check if the root is in the list of accepted clone sources */
	clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
			     bctx->sctx->clone_roots_cnt,
			     sizeof(struct clone_root),
			     __clone_root_cmp_bsearch);
	if (!clone_root)
		return 0;

	/* This is our own reference, bail out as we can't clone from it. */
	if (clone_root->root == bctx->sctx->send_root &&
	    ino == bctx->cur_objectid &&
	    offset == bctx->cur_offset)
		return 0;

	/*
	 * Make sure we don't consider clones from send_root that are
	 * behind the current inode/offset.
	 */
	if (clone_root->root == bctx->sctx->send_root) {
		/*
		 * If the source inode was not yet processed we can't issue a
		 * clone operation, as the source extent does not exist yet at
		 * the destination of the stream.
		 */
		if (ino > bctx->cur_objectid)
			return 0;
		/*
		 * We clone from the inode currently being sent as long as the
		 * source extent is already processed, otherwise we could try
		 * to clone from an extent that does not exist yet at the
		 * destination of the stream.
		 */
		if (ino == bctx->cur_objectid &&
		    offset + bctx->extent_len >
		    bctx->sctx->cur_inode_next_write_offset)
			return 0;
	}

	bctx->found++;
	clone_root->found_ref = true;

	/*
	 * If the given backref refers to a file extent item with a larger
	 * number of bytes than what we found before, use the new one so that
	 * we clone more optimally and end up doing less writes and getting
	 * less exclusive, non-shared extents at the destination.
	 */
	if (num_bytes > clone_root->num_bytes) {
		clone_root->ino = ino;
		clone_root->offset = offset;
		clone_root->num_bytes = num_bytes;

		/*
		 * Found a perfect candidate, so there's no need to continue
		 * backref walking.
		 */
		if (num_bytes >= bctx->extent_len)
			return BTRFS_ITERATE_EXTENT_INODES_STOP;
	}

	/*
	 * NOTE(review): restored the fall-through "return 0" that was lost in
	 * extraction; without it control falls off a non-void function (UB).
	 */
	return 0;
}
/* * If relocation happened since we first filled the cache, then we must * empty the cache and can not use it, because even though we operate on * read-only roots, their leaves and nodes may have been reallocated and * now be used for different nodes/leaves of the same tree or some other * tree. * * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here.
*/ if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
btrfs_lru_cache_clear(&sctx->backref_cache); returnfalse;
}
raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); if (!raw_entry) returnfalse;
/* * We're called while holding a transaction handle or while holding * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a * NOFS allocation.
*/
new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); /* No worries, cache is optional. */ if (!new_entry) return;
/* Too many roots, just exit, no worries as caching is optional. */ if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
kfree(new_entry); return;
}
/* * We may have not added any roots to the new cache entry, which means * none of the roots is part of the list of roots from which we are * allowed to clone. Cache the new entry as it's still useful to avoid * backref walking to determine which roots have a path to the leaf. * * Also use GFP_NOFS because we're called while holding a transaction * handle or while holding fs_info->commit_root_sem.
*/
ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry,
GFP_NOFS);
ASSERT(ret == 0 || ret == -ENOMEM); if (ret) { /* Caching is optional, no worries. */
kfree(new_entry); return;
}
/* * We are called from iterate_extent_inodes() while either holding a * transaction handle or holding fs_info->commit_root_sem, so no need * to take any lock here.
*/ if (sctx->backref_cache.size == 1)
sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) return -EUCLEAN;
/* * If we have only one reference and only the send root as a * clone source - meaning no clone roots were given in the * struct btrfs_ioctl_send_args passed to the send ioctl - then * it's our reference and there's no point in doing backref * walking which is expensive, so exit early.
*/ if (refs == 1 && sctx->clone_roots_cnt == 1) return -ENOENT;
}
/* * Backreference walking (iterate_extent_inodes() below) is currently * too expensive when an extent has a large number of references, both * in time spent and used memory. So for now just fallback to write * operations instead of clone operations when an extent has more than * a certain amount of references.
*/ if (refs > SEND_MAX_EXTENT_REFS) return -ENOENT;
/* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes * sure that the returned clone is usable at the point where sending is at the * moment. This means, that no clones are accepted which lie behind the current * inode+offset. * * path must point to the extent item when called.
*/ staticint find_extent_clone(struct send_ctx *sctx, struct btrfs_path *path,
u64 ino, u64 data_offset,
u64 ino_size, struct clone_root **found)
{ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; int extent_type;
u64 disk_byte;
u64 num_bytes; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; struct backref_ctx backref_ctx = { 0 }; struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; int compressed;
u32 i;
/* * With fallocate we can get prealloc extents beyond the inode's i_size, * so we don't do anything here because clone operations can not clone * to a range beyond i_size without increasing the i_size of the * destination inode.
*/ if (data_offset >= ino_size) return 0;
fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(eb, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) return -ENOENT;
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); if (disk_byte == 0) return -ENOENT;
/* * Setup the clone roots.
*/ for (i = 0; i < sctx->clone_roots_cnt; i++) {
cur_clone_root = sctx->clone_roots + i;
cur_clone_root->ino = (u64)-1;
cur_clone_root->offset = 0;
cur_clone_root->num_bytes = 0;
cur_clone_root->found_ref = false;
}
backref_ctx.sctx = sctx;
backref_ctx.cur_objectid = ino;
backref_ctx.cur_offset = data_offset;
backref_ctx.bytenr = disk_byte; /* * Use the header owner and not the send root's id, because in case of a * snapshot we can have shared subtrees.
*/
backref_ctx.backref_owner = btrfs_header_owner(eb);
backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
/* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in * iterate_backrefs() work.
*/ if (data_offset + num_bytes >= ino_size)
backref_ctx.extent_len = ino_size - data_offset; else
backref_ctx.extent_len = num_bytes;
/* * If have a single clone root, then it's the send root and we can tell * the backref walking code to skip our own backref and not resolve it, * since we can not use it for cloning - the source and destination * ranges can't overlap and in case the leaf is shared through a subtree * due to snapshots, we can't use those other roots since they are not * in the list of clone roots.
*/ if (sctx->clone_roots_cnt == 1)
backref_walk_ctx.skip_data_ref = skip_self_data_ref;
ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
&backref_ctx); if (ret < 0) return ret;
down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { /* * A transaction commit for a transaction in which block group * relocation was done just happened. * The disk_bytenr of the file extent item we processed is * possibly stale, referring to the extent's location before * relocation. So act as if we haven't found any clone sources * and fallback to write commands, which will read the correct * data from the new extent location. Otherwise we will fail * below because we haven't found our own back reference or we * could be getting incorrect sources in case the old extent * was already reallocated after the relocation.
*/
up_read(&fs_info->commit_root_sem); return -ENOENT;
}
up_read(&fs_info->commit_root_sem);
if (!backref_ctx.found) return -ENOENT;
cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { struct clone_root *clone_root = &sctx->clone_roots[i];
if (!clone_root->found_ref) continue;
/* * Choose the root from which we can clone more bytes, to * minimize write operations and therefore have more extent * sharing at the destination (the same as in the source).
*/ if (!cur_clone_root ||
clone_root->num_bytes > cur_clone_root->num_bytes) {
cur_clone_root = clone_root;
/* * We found an optimal clone candidate (any inode from * any root is fine), so we're done.
*/ if (clone_root->num_bytes >= backref_ctx.extent_len) break;
}
}
if (cur_clone_root) {
*found = cur_clone_root;
ret = 0;
} else {
ret = -ENOENT;
}
path = alloc_path_for_send(); if (!path) return -ENOMEM;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret) { /* * An empty symlink inode. Can happen in rare error paths when * creating a symlink (transaction committed before the inode * eviction handler removed the symlink inode items and a crash * happened in between or the subvol was snapshoted in between). * Print an informative message to dmesg/syslog so that the user * can delete the symlink.
*/
btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu",
ino, btrfs_root_id(root));
ret = -EIO; goto out;
}
ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], ei); if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
ret = -EUCLEAN;
btrfs_crit(root->fs_info, "send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
ino, btrfs_root_id(root), type); goto out;
}
compression = btrfs_file_extent_compression(path->nodes[0], ei); if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
ret = -EUCLEAN;
btrfs_crit(root->fs_info, "send: found symlink extent with compression, ino %llu root %llu compression type %d",
ino, btrfs_root_id(root), compression); goto out;
}
off = btrfs_file_extent_inline_start(ei);
len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
out:
btrfs_free_path(path); return ret;
}
/* * Helper function to generate a file name that is unique in the root of * send_root and parent_root. This is used to generate names for orphan inodes.
*/ staticint gen_unique_name(struct send_ctx *sctx,
u64 ino, u64 gen, struct fs_path *dest)
{ int ret = 0; struct btrfs_path *path; struct btrfs_dir_item *di; char tmp[64]; int len;
u64 idx = 0;
path = alloc_path_for_send(); if (!path) return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path); if (IS_ERR(di)) {
ret = PTR_ERR(di); goto out;
} if (di) { /* not unique, try again */
idx++; continue;
}
if (!sctx->parent_root) { /* unique */
ret = 0; break;
}
di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path); if (IS_ERR(di)) {
ret = PTR_ERR(di); goto out;
} if (di) { /* not unique, try again */
idx++; continue;
} /* unique */ break;
}
ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) return ret;
if (ret == inode_state_no_change ||
ret == inode_state_did_create ||
ret == inode_state_will_delete) return 1;
return 0;
}
/*
 * Helper function to lookup a dir item in a dir.
 *
 * On success stores the inode number the entry points to in @found_inode and
 * returns 0. Returns -ENOENT if the entry does not exist or points to a
 * subvolume root, or another negative errno on failure.
 * NOTE(review): de-fused "staticint" token from extraction.
 */
static int lookup_dir_item_inode(struct btrfs_root *root,
				 u64 dir, const char *name, int name_len,
				 u64 *found_inode)
{
	int ret = 0;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
	if (IS_ERR_OR_NULL(di)) {
		/* A NULL result means the entry simply does not exist. */
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
	if (key.type == BTRFS_ROOT_ITEM_KEY) {
		/* Entries pointing to subvolume roots are not resolvable here. */
		ret = -ENOENT;
		goto out;
	}
	*found_inode = key.objectid;

out:
	btrfs_free_path(path);
	return ret;
}
/* * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, * generation of the parent dir and the name of the dir entry.
*/ staticint get_first_ref(struct btrfs_root *root, u64 ino,
u64 *dir, u64 *dir_gen, struct fs_path *name)
{ int ret; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; int len;
u64 parent_dir;
path = alloc_path_for_send(); if (!path) return -ENOMEM;
if (dir_gen) {
ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) goto out;
}
*dir = parent_dir;
out:
btrfs_free_path(path); return ret;
}
/*
 * Check whether (@dir, @name) is the first ref of inode @ino in @root.
 *
 * Returns 1 if it is, 0 if not, negative errno on failure.
 * NOTE(review): de-fused "staticint" token from extraction.
 */
static int is_first_ref(struct btrfs_root *root,
			u64 ino, u64 dir, const char *name, int name_len)
{
	int ret;
	struct fs_path *tmp_name;
	u64 tmp_dir;

	tmp_name = fs_path_alloc();
	if (!tmp_name)
		return -ENOMEM;

	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
	if (ret < 0)
		goto out;

	/* Cheap comparisons first: parent dir and name length. */
	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
		ret = 0;
		goto out;
	}

	ret = !memcmp(tmp_name->start, name, name_len);

out:
	fs_path_free(tmp_name);
	return ret;
}
/* * Used by process_recorded_refs to determine if a new ref would overwrite an * already existing ref. In case it detects an overwrite, it returns the * inode/gen in who_ino/who_gen. * When an overwrite is detected, process_recorded_refs does proper orphanizing * to make sure later references to the overwritten inode are possible. * Orphanizing is however only required for the first ref of an inode. * process_recorded_refs does an additional is_first_ref check to see if * orphanizing is really required.
*/ staticint will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, constchar *name, int name_len,
u64 *who_ino, u64 *who_gen, u64 *who_mode)
{ int ret;
u64 parent_root_dir_gen;
u64 other_inode = 0; struct btrfs_inode_info info;
if (!sctx->parent_root) return 0;
ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); if (ret <= 0) return 0;
/* * If we have a parent root we need to verify that the parent dir was * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. * * @parent_root_dir_gen was set to 0 if the inode does not exist in the * parent root.
*/ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
parent_root_dir_gen != dir_gen) return 0;
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
&other_inode); if (ret == -ENOENT) return 0; elseif (ret < 0) return ret;
/* * Check if the overwritten ref was already processed. If yes, the ref * was already unlinked/moved, so we can safely assume that we will not * overwrite anything at this point in time.
*/ if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) return ret;
/*
 * Checks if the ref was overwritten by an already processed inode. This is
 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
 * thus the orphan name needs be used.
 * process_recorded_refs also uses it to avoid unlinking of refs that were
 * overwritten.
 *
 * Returns 1 if the ref was overwritten, 0 if not, negative errno on failure.
 * NOTE(review): de-fused "staticint"/"elseif" tokens from extraction.
 */
static int did_overwrite_ref(struct send_ctx *sctx,
			     u64 dir, u64 dir_gen,
			     u64 ino, u64 ino_gen,
			     const char *name, int name_len)
{
	int ret;
	u64 ow_inode;
	u64 ow_gen = 0;
	u64 send_root_dir_gen;

	if (!sctx->parent_root)
		return 0;

	ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL);
	if (ret <= 0)
		return ret;

	/*
	 * @send_root_dir_gen was set to 0 if the inode does not exist in the
	 * send root.
	 */
	if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
		return 0;

	/* check if the ref was overwritten by another ref */
	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
				    &ow_inode);
	if (ret == -ENOENT) {
		/* was never and will never be overwritten */
		return 0;
	} else if (ret < 0) {
		return ret;
	}

	if (ow_inode == ino) {
		ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
		if (ret < 0)
			return ret;

		/* It's the same inode, so no overwrite happened. */
		if (ow_gen == ino_gen)
			return 0;
	}

	/*
	 * We know that it is or will be overwritten. Check this now.
	 * The current inode being processed might have been the one that caused
	 * inode 'ino' to be orphanized, therefore check if ow_inode matches
	 * the current inode being processed.
	 */
	if (ow_inode < sctx->send_progress)
		return 1;

	if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
		/* ow_gen may not have been looked up yet (lazy, see above). */
		if (ow_gen == 0) {
			ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
			if (ret < 0)
				return ret;
		}
		if (ow_gen == sctx->cur_inode_gen)
			return 1;
	}

	return 0;
}
/*
 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
 * that got overwritten. This is used by process_recorded_refs to determine
 * if it has to use the path as returned by get_cur_path or the orphan name.
 */
static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
{
	int ret = 0;
	struct fs_path *name = NULL;
	u64 dir;
	u64 dir_gen;

	/* Without a parent root there is nothing that could be overwritten. */
	if (!sctx->parent_root)
		goto out;

	name = fs_path_alloc();
	if (!name)
		return -ENOMEM;

	/* The first ref in the parent root is the one that may be overwritten. */
	ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
	if (ret < 0)
		goto out;

	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
				name->start, fs_path_len(name));

	/*
	 * NOTE(review): restored the cleanup label, fs_path_free() and return
	 * that were lost in extraction; the goto statements above require the
	 * label, and @name would otherwise leak.
	 */
out:
	fs_path_free(name);
	return ret;
}
/* * Used by get_cur_path for each ref up to the root. * Returns 0 if it succeeded. * Returns 1 if the inode is not existent or got overwritten. In that case, the * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 * is returned, parent_ino/parent_gen are not guaranteed to be valid. * Returns <0 in case of error.
*/ staticint __get_cur_name_and_parent(struct send_ctx *sctx,
u64 ino, u64 gen,
u64 *parent_ino,
u64 *parent_gen, struct fs_path *dest)
{ int ret; int nce_ret; struct name_cache_entry *nce;
/* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes * return the cached result.
*/
nce = name_cache_search(sctx, ino, gen); if (nce) { if (ino < sctx->send_progress && nce->need_later_update) {
btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry);
nce = NULL;
} else {
*parent_ino = nce->parent_ino;
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) return ret; return nce->ret;
}
}
/* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in * record_new_ref_if_needed().
*/
ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) return ret;
if (!ret) {
ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) return ret;
ret = 1; goto out_cache;
}
/* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup.
*/ if (ino < sctx->send_progress)
ret = get_first_ref(sctx->send_root, ino,
parent_ino, parent_gen, dest); else
ret = get_first_ref(sctx->parent_root, ino,
parent_ino, parent_gen, dest); if (ret < 0) return ret;
/* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1.
*/
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
dest->start, fs_path_len(dest)); if (ret < 0) return ret; if (ret) {
fs_path_reset(dest);
ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) return ret;
ret = 1;
}
out_cache: /* * Store the result of the lookup in the name cache.
*/
nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); if (!nce) return -ENOMEM;
/*
 * Magic happens here. This function returns the first ref to an inode as it
 * would look like while receiving the stream at this point in time.
 * We walk the path up to the root. For every inode in between, we check if it
 * was already processed/sent. If yes, we continue with the parent as found
 * in send_root. If not, we continue with the parent as found in parent_root.
 * If we encounter an inode that was deleted at this point in time, we use the
 * inodes "orphan" name instead of the real name and stop. Same with new inodes
 * that were not created yet and overwritten inodes/refs.
 *
 * When do we have orphan inodes:
 * 1. When an inode is freshly created and thus no valid refs are available yet
 * 2. When a directory lost all it's refs (deleted) but still has dir items
 *    inside which were not processed yet (pending for move/delete). If anyone
 *    tried to get the path to the dir items, it would get a path inside that
 *    orphan directory.
 * 3. When an inode is moved around or gets new links, it may overwrite the ref
 *    of an unprocessed inode. If in that case the first ref would be
 *    overwritten, the overwritten inode gets "orphanized". Later when we
 *    process this overwritten inode, it is restored at a new place by moving
 *    the orphan inode.
 *
 * sctx->send_progress tells this function at which point in time receiving
 * would be.
 */
static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
			struct fs_path *dest)
{
	int ret = 0;
	struct fs_path *name = NULL;
	u64 parent_inode = 0;
	u64 parent_gen = 0;
	int stop = 0;
	const bool is_cur_inode = (ino == sctx->cur_ino &&
				   gen == sctx->cur_inode_gen);

	/* Fast path: the current inode's path was already computed and cached. */
	if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) {
		if (dest != &sctx->cur_inode_path)
			return fs_path_copy(dest, &sctx->cur_inode_path);
		return 0;
	}

	name = fs_path_alloc();
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	/* Build the path bottom-up, then unreverse it at the end. */
	dest->reversed = 1;
	fs_path_reset(dest);

	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
		struct waiting_dir_move *wdm;

		fs_path_reset(name);

		if (is_waiting_for_rm(sctx, ino, gen)) {
			/* Directory pending deletion: use its orphan name. */
			ret = gen_unique_name(sctx, ino, gen, name);
			if (ret < 0)
				goto out;
			ret = fs_path_add_path(dest, name);
			break;
		}

		wdm = get_waiting_dir_move(sctx, ino);
		if (wdm && wdm->orphanized) {
			ret = gen_unique_name(sctx, ino, gen, name);
			stop = 1;
		} else if (wdm) {
			/* Pending move: the old (parent root) location applies. */
			ret = get_first_ref(sctx->parent_root, ino,
					    &parent_inode, &parent_gen, name);
		} else {
			ret = __get_cur_name_and_parent(sctx, ino, gen,
							&parent_inode,
							&parent_gen, name);
			if (ret)
				stop = 1;
		}

		if (ret < 0)
			goto out;

		ret = fs_path_add_path(dest, name);
		if (ret < 0)
			goto out;

		ino = parent_inode;
		gen = parent_gen;
	}

out:
	fs_path_free(name);
	if (!ret) {
		fs_path_unreverse(dest);
		if (is_cur_inode && dest != &sctx->cur_inode_path)
			ret = fs_path_copy(&sctx->cur_inode_path, dest);
	}

	/*
	 * NOTE(review): restored the final "return ret" and closing brace that
	 * were lost in extraction.
	 */
	return ret;
}
/*
 * NOTE(review): the following German website disclaimer is boilerplate that
 * leaked into the source during extraction and is not part of the code;
 * wrapped in a comment so it cannot break compilation. Translation:
 * "The information on this website was carefully compiled to the best of our
 * knowledge. However, neither completeness, nor correctness, nor quality of
 * the provided information is guaranteed.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */