Quelle ioctl.c Sprache: C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle.  All rights reserved.
*/

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
#include <linux/fsverity.h>
#include <linux/sched/xacct.h>
#include <linux/io_uring/cmd.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "locking.h"
#include "backref.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
#include "sysfs.h"
#include "qgroup.h"
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
#include "scrub.h"
#include "super.h"

#ifdef CONFIG_64BIT
/*
 * If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */

/*
 * Packed timespec: a 64-bit seconds field immediately followed by a 32-bit
 * nanoseconds field, with no padding added by the compiler.
 */
struct btrfs_ioctl_timespec_32 {
__u64 sec;
__u32 nsec;
} __attribute__ ((__packed__));

/*
 * Packed variant of the "set received subvol" ioctl argument that embeds the
 * packed timespec above for both time stamps. The in/out markers on each
 * member give the direction of the data relative to the kernel.
 */
struct btrfs_ioctl_received_subvol_args_32 {
char uuid[BTRFS_UUID_SIZE]; /* in */
__u64 stransid;  /* in */
__u64 rtransid;  /* out */
struct btrfs_ioctl_timespec_32 stime; /* in */
struct btrfs_ioctl_timespec_32 rtime; /* out */
__u64 flags;   /* in */
__u64 reserved[16];  /* in */
} __attribute__ ((__packed__));

/*
 * Read/write ioctl number 37 in the btrfs ioctl space, sized for the packed
 * argument structure above.
 */
#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
    struct btrfs_ioctl_received_subvol_args_32)
#endif

#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
/*
 * Packed send ioctl argument in which the clone_sources user pointer is
 * carried as a compat_uptr_t rather than a native pointer.
 * NOTE(review): presumably this mirrors the layout seen by 32-bit callers of
 * BTRFS_IOC_SEND - confirm against the native structure.
 */
struct btrfs_ioctl_send_args_32 {
__s64 send_fd;   /* in */
__u64 clone_sources_count; /* in */
compat_uptr_t clone_sources; /* in */
__u64 parent_root;  /* in */
__u64 flags;   /* in */
__u32 version;   /* in */
__u8  reserved[28];  /* in */
} __attribute__ ((__packed__));

/* Write-direction ioctl number 38, sized for the packed structure above. */
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
          struct btrfs_ioctl_send_args_32)

/*
 * Encoded I/O argument in which the iovec pointer and the iovec count use the
 * compat pointer/long types. Unlike the structures above, this one is not
 * marked packed.
 */
struct btrfs_ioctl_encoded_io_args_32 {
compat_uptr_t iov;
compat_ulong_t iovcnt;
__s64 offset;
__u64 flags;
__u64 len;
__u64 unencoded_len;
__u64 unencoded_offset;
__u32 compression;
__u32 encryption;
__u8 reserved[64];
};

/*
 * Both encoded I/O commands use ioctl number 64; they differ only in the
 * transfer direction encoded by _IOR versus _IOW.
 */
#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
           struct btrfs_ioctl_encoded_io_args_32)
#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
     struct btrfs_ioctl_encoded_io_args_32)
#endif

/*
 * Drop the FS_*_FL bits that make no sense for the type of @inode:
 * directories keep everything, regular files lose FS_DIRSYNC_FL and every
 * other inode type keeps only FS_NODUMP_FL and FS_NOATIME_FL.
 */
static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode,
						unsigned int flags)
{
	unsigned int allowed;

	if (S_ISDIR(inode->i_mode))
		allowed = ~0U;
	else if (S_ISREG(inode->i_mode))
		allowed = ~FS_DIRSYNC_FL;
	else
		allowed = FS_NODUMP_FL | FS_NOATIME_FL;

	return flags & allowed;
}

/*
 * Translate the btrfs inode flag bits (flags and ro_flags) into the FS_*_FL
 * representation expected by the FS_IOC_GETFLAGS ioctl.
 */
static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode)
{
	const u32 bflags = inode->flags;
	const u32 ro_bflags = inode->ro_flags;
	unsigned int fsflags = 0;

	fsflags |= (bflags & BTRFS_INODE_SYNC) ? FS_SYNC_FL : 0;
	fsflags |= (bflags & BTRFS_INODE_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
	fsflags |= (bflags & BTRFS_INODE_APPEND) ? FS_APPEND_FL : 0;
	fsflags |= (bflags & BTRFS_INODE_NODUMP) ? FS_NODUMP_FL : 0;
	fsflags |= (bflags & BTRFS_INODE_NOATIME) ? FS_NOATIME_FL : 0;
	fsflags |= (bflags & BTRFS_INODE_DIRSYNC) ? FS_DIRSYNC_FL : 0;
	fsflags |= (bflags & BTRFS_INODE_NODATACOW) ? FS_NOCOW_FL : 0;
	fsflags |= (ro_bflags & BTRFS_INODE_RO_VERITY) ? FS_VERITY_FL : 0;

	/* At most one compression bit is exported; NOCOMPRESS wins. */
	if (bflags & BTRFS_INODE_NOCOMPRESS)
		fsflags |= FS_NOCOMP_FL;
	else if (bflags & BTRFS_INODE_COMPRESS)
		fsflags |= FS_COMPR_FL;

	return fsflags;
}

/*
 * Recompute the S_* bits of the VFS inode's i_flags that are derived from the
 * btrfs inode flags, leaving all other i_flags bits untouched.
 */
void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode)
{
	/* The set of i_flags bits that this function owns. */
	const unsigned int managed = S_SYNC | S_APPEND | S_IMMUTABLE |
				     S_NOATIME | S_DIRSYNC | S_VERITY;
	unsigned int wanted = 0;

	wanted |= (inode->flags & BTRFS_INODE_SYNC) ? S_SYNC : 0;
	wanted |= (inode->flags & BTRFS_INODE_IMMUTABLE) ? S_IMMUTABLE : 0;
	wanted |= (inode->flags & BTRFS_INODE_APPEND) ? S_APPEND : 0;
	wanted |= (inode->flags & BTRFS_INODE_NOATIME) ? S_NOATIME : 0;
	wanted |= (inode->flags & BTRFS_INODE_DIRSYNC) ? S_DIRSYNC : 0;
	wanted |= (inode->ro_flags & BTRFS_INODE_RO_VERITY) ? S_VERITY : 0;

	set_mask_bits(&inode->vfs_inode.i_flags, managed, wanted);
}

/*
 * Validate a requested set of FS_*_FL flags against the set currently in
 * effect.
 *
 * Returns -EOPNOTSUPP if @flags contains a bit btrfs does not handle, -EINVAL
 * if the request is self-contradictory or conflicts with @old_flags, and 0
 * otherwise.
 */
static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
	const unsigned int supported = FS_IMMUTABLE_FL | FS_APPEND_FL |
				       FS_NOATIME_FL | FS_NODUMP_FL |
				       FS_SYNC_FL | FS_DIRSYNC_FL |
				       FS_NOCOMP_FL | FS_COMPR_FL |
				       FS_NOCOW_FL;
	const unsigned int comp_bits = FS_COMPR_FL | FS_NOCOMP_FL;
	const unsigned int compr_nocow = FS_COMPR_FL | FS_NOCOW_FL;

	if (flags & ~supported)
		return -EOPNOTSUPP;

	/*
	 * COMPR in one set and NOCOMP in the other is fine, but a single
	 * request must not ask for both.
	 */
	if ((flags & comp_bits) == comp_bits)
		return -EINVAL;

	if ((flags & compr_nocow) == compr_nocow)
		return -EINVAL;

	/* NOCOW and the compression options exclude each other across sets. */
	if ((old_flags & FS_NOCOW_FL) && (flags & comp_bits))
		return -EINVAL;
	if ((flags & FS_NOCOW_FL) && (old_flags & comp_bits))
		return -EINVAL;

	return 0;
}

/*
 * Reject flag requests that this filesystem instance cannot honour.
 * Currently the only such case is FS_NOCOW_FL on a zoned filesystem, which
 * yields -EPERM; everything else returns 0.
 */
static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info,
				    unsigned int flags)
{
	if (!btrfs_is_zoned(fs_info))
		return 0;

	return (flags & FS_NOCOW_FL) ? -EPERM : 0;
}

/*
 * Make sure the name buffer of @vol_args contains a NUL terminator somewhere
 * within its fixed size. Returns 0 if it does, -ENAMETOOLONG otherwise.
 */
int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
{
	const void *terminator = memchr(vol_args->name, 0,
					sizeof(vol_args->name));

	return terminator ? 0 : -ENAMETOOLONG;
}

/*
 * Same check as for the v1 arguments: the name buffer of @vol_args2 must hold
 * a NUL terminator within its fixed size, otherwise -ENAMETOOLONG is returned.
 */
static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2)
{
	const void *terminator = memchr(vol_args2->name, 0,
					sizeof(vol_args2->name));

	return terminator ? 0 : -ENAMETOOLONG;
}

/*
 * Report the inode's flags through @fa: the btrfs inode flags are converted
 * to FS_*_FL form and handed to fileattr_fill_flags(). Always returns 0.
 */
int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
	const unsigned int fsflags =
		btrfs_inode_flags_to_fsflags(BTRFS_I(d_inode(dentry)));

	fileattr_fill_flags(fa, fsflags);

	return 0;
}

/*
 * Apply the FS_*_FL flags carried in @fa to the inode behind @dentry.
 *
 * The requested flags are masked for the inode type, validated against the
 * current flags and the filesystem, translated into btrfs inode flags and
 * then written back inside a transaction. When the compression flags are
 * processed, the "btrfs.compression" property is set or cleared as well.
 *
 * Returns 0 on success or a negative errno: -EROFS for a read-only root,
 * -EOPNOTSUPP when fileattr_has_fsx() reports fsx data, -ETXTBSY when
 * compression is requested on a swapfile, or whatever the validation helpers,
 * the transaction start, the property update or the inode update return.
 */
int btrfs_fileattr_set(struct mnt_idmap *idmap,
         struct dentry *dentry, struct file_kattr *fa)
{
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
/* Name of the compression algorithm to store as property, if any. */
const char *comp = NULL;
/* Working copy of inode->flags; only committed at update_flags. */
u32 inode_flags;

if (btrfs_root_readonly(root))
  return -EROFS;

if (fileattr_has_fsx(fa))
  return -EOPNOTSUPP;

fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags);
old_fsflags = btrfs_inode_flags_to_fsflags(inode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
  return ret;

ret = check_fsflags_compatible(fs_info, fsflags);
if (ret)
  return ret;

/* Flags that map one-to-one: set or clear each according to the request. */
inode_flags = inode->flags;
if (fsflags & FS_SYNC_FL)
  inode_flags |= BTRFS_INODE_SYNC;
else
  inode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
  inode_flags |= BTRFS_INODE_IMMUTABLE;
else
  inode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
  inode_flags |= BTRFS_INODE_APPEND;
else
  inode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
  inode_flags |= BTRFS_INODE_NODUMP;
else
  inode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
  inode_flags |= BTRFS_INODE_NOATIME;
else
  inode_flags &= ~BTRFS_INODE_NOATIME;

/*
 * If coming from FS_IOC_FSSETXATTR then skip unconverted flags: DIRSYNC,
 * NOCOW and the compression flags are left as they are and no property is
 * touched, so only the inode item needs to be reserved.
 */
if (!fa->flags_valid) {
  /* 1 item for the inode */
  trans = btrfs_start_transaction(root, 1);
  if (IS_ERR(trans))
   return PTR_ERR(trans);
  goto update_flags;
}

if (fsflags & FS_DIRSYNC_FL)
  inode_flags |= BTRFS_INODE_DIRSYNC;
else
  inode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
  if (S_ISREG(inode->vfs_inode.i_mode)) {
   /*
    * It's safe to turn csums off here, no extents exist.
    * Otherwise we want the flag to reflect the real COW
    * status of the file and will not set it.
    */
   if (inode->vfs_inode.i_size == 0)
    inode_flags |= BTRFS_INODE_NODATACOW |
            BTRFS_INODE_NODATASUM;
  } else {
   inode_flags |= BTRFS_INODE_NODATACOW;
  }
} else {
  /*
   * Revert back under same assumptions as above
   */
  if (S_ISREG(inode->vfs_inode.i_mode)) {
   if (inode->vfs_inode.i_size == 0)
    inode_flags &= ~(BTRFS_INODE_NODATACOW |
       BTRFS_INODE_NODATASUM);
  } else {
   inode_flags &= ~BTRFS_INODE_NODATACOW;
  }
}

/*
 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
 * flag may be changed automatically if compression code won't make
 * things smaller.
 */
if (fsflags & FS_NOCOMP_FL) {
  inode_flags &= ~BTRFS_INODE_COMPRESS;
  inode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {

  /* No transaction has been started yet, so a plain return is fine. */
  if (IS_SWAPFILE(&inode->vfs_inode))
   return -ETXTBSY;

  inode_flags |= BTRFS_INODE_COMPRESS;
  inode_flags &= ~BTRFS_INODE_NOCOMPRESS;

  /*
   * Use the name of the filesystem's compress_type; fall back to zlib
   * when that lookup yields no name or an empty one.
   */
  comp = btrfs_compress_type2str(fs_info->compress_type);
  if (!comp || comp[0] == 0)
   comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
} else {
  inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}

/*
 * 1 for inode item
 * 2 for properties
 */
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans))
  return PTR_ERR(trans);

if (comp) {
  ret = btrfs_set_prop(trans, inode, "btrfs.compression",
         comp, strlen(comp), 0);
  if (ret) {
   btrfs_abort_transaction(trans, ret);
   goto out_end_trans;
  }
} else {
  /* Clearing the property: -ENODATA is tolerated and not treated as fatal. */
  ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
  if (ret && ret != -ENODATA) {
   btrfs_abort_transaction(trans, ret);
   goto out_end_trans;
  }
}

update_flags:
/* Commit the working copy and propagate it to the VFS inode and to disk. */
inode->flags = inode_flags;
btrfs_update_inode_mapping_flags(inode);
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode);

out_end_trans:
btrfs_end_transaction(trans);
return ret;
}

/*
 * Copy the inode's i_generation to the user-space integer at @arg.
 * Returns the result of put_user().
 */
static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg)
{
	int ret;

	ret = put_user(inode->i_generation, arg);

	return ret;
}

/*
 * FITRIM handler: discard unused space in the range described by the
 * struct fstrim_range at @arg and copy the (updated) range back to the caller.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP on a zoned filesystem or
 * when no device supports discard, -EROFS with the nologreplay mount option,
 * -EFAULT on a failed user copy, -EINVAL when the range is shorter than one
 * sector, and otherwise the result of btrfs_trim_fs().
 */
static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
					void __user *arg)
{
	struct fstrim_range range;
	struct btrfs_device *dev;
	u64 nr_discardable = 0;
	u64 granularity = ULLONG_MAX;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * btrfs_trim_block_group() depends on space cache, which is not
	 * available in zoned filesystem. So, disallow fitrim on a zoned
	 * filesystem for now.
	 */
	if (btrfs_is_zoned(fs_info))
		return -EOPNOTSUPP;

	/*
	 * If the fs is mounted with nologreplay, which requires it to be
	 * mounted in RO mode as well, we can not allow discard on free space
	 * inside block groups, because log trees refer to extents that are not
	 * pinned in a block group's free space cache (pinning the extents is
	 * precisely the first phase of replaying a log tree).
	 */
	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
		return -EROFS;

	/* Count discard-capable devices and find their smallest granularity. */
	rcu_read_lock();
	list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
		if (dev->bdev && bdev_max_discard_sectors(dev->bdev)) {
			nr_discardable++;
			granularity = min_t(u64,
					    bdev_discard_granularity(dev->bdev),
					    granularity);
		}
	}
	rcu_read_unlock();

	if (nr_discardable == 0)
		return -EOPNOTSUPP;

	if (copy_from_user(&range, arg, sizeof(range)))
		return -EFAULT;

	/*
	 * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
	 * block group is in the logical address space, which can be any
	 * sectorsize aligned bytenr in  the range [0, U64_MAX].
	 */
	if (range.len < fs_info->sectorsize)
		return -EINVAL;

	range.minlen = max(range.minlen, granularity);
	ret = btrfs_trim_fs(fs_info, &range);

	/* The range is copied back even if trimming reported an error. */
	return copy_to_user(arg, &range, sizeof(range)) ? -EFAULT : ret;
}

/*
 * Number of transaction items to reserve when creating a subvolume or
 * snapshot. The inode, the directory entries and the parent directory are
 * accounted separately by the callers.
 */
static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit)
{
	/*
	 * Fixed part, one item each for:
	 *   - the root block
	 *   - the root item
	 *   - the root ref
	 *   - the root backref
	 *   - the UUID item
	 *   - the qgroup info
	 *   - the qgroup limit
	 *
	 * Ideally the last two would only be accounted if qgroups are enabled,
	 * but that can change between now and the time we would insert them.
	 */
	const unsigned int fixed_items = 7;

	if (!inherit)
		return fixed_items;

	/* Each inherited qgroup needs 2 more items for its relations. */
	return fixed_items + 2 * inherit->num_qgroups;
}

/*
 * Create a new, empty subvolume for @dentry inside directory @dir.
 *
 * The function allocates a root item, a free objectid and an anonymous block
 * device, prepares the new subvolume inode, reserves metadata, and then, in a
 * single transaction, allocates the root leaf, inserts the root item, wires up
 * the new root, records the UUID and creates the inode before instantiating
 * @dentry. @inherit, when not NULL, is passed to btrfs_qgroup_inherit() and
 * enlarges the reservation.
 *
 * Returns 0 on success or a negative errno. The labels at the end release the
 * acquired resources in reverse order of acquisition.
 */
static noinline int create_subvol(struct mnt_idmap *idmap,
      struct inode *dir, struct dentry *dentry,
      struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item *root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
struct btrfs_new_inode_args new_inode_args = {
  .dir = dir,
  .dentry = dentry,
  .subvol = true,
};
unsigned int trans_num_items;
int ret;
dev_t anon_dev;
u64 objectid;
/* Qgroup reservation that still has to be freed on failure; 0 once converted. */
u64 qgroup_reserved = 0;

root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
  return -ENOMEM;

ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
  goto out_root_item;

/*
 * Don't create subvolume whose level is not zero. Or qgroup will be
 * screwed up since it assumes subvolume qgroup's level to be 0.
 */
if (btrfs_qgroup_level(objectid)) {
  ret = -ENOSPC;
  goto out_root_item;
}

ret = get_anon_bdev(&anon_dev);
if (ret < 0)
  goto out_root_item;

new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir);
if (!new_inode_args.inode) {
  ret = -ENOMEM;
  goto out_anon_dev;
}
ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (ret)
  goto out_inode;
/* Add the subvolume-specific items on top of what the inode needs. */
trans_num_items += create_subvol_num_items(inherit);

btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
            trans_num_items, false);
if (ret)
  goto out_new_inode_args;
qgroup_reserved = block_rsv.qgroup_rsv_reserved;

/* Space was reserved above, so the transaction itself reserves 0 items. */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
  ret = PTR_ERR(trans);
  goto out_release_rsv;
}
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
/* Converted: nothing left for the error path to free. */
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;

ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
if (ret)
  goto out;

leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
          0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
  ret = PTR_ERR(leaf);
  goto out;
}

btrfs_mark_buffer_dirty(trans, leaf);

/* Fill in the inode item embedded in the root item. */
inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item,
         fs_info->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

btrfs_set_root_flags(root_item, 0);
btrfs_set_root_limit(root_item, 0);
btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);

/* The new root consists of the single, level 0 leaf allocated above. */
btrfs_set_root_bytenr(root_item, leaf->start);
btrfs_set_root_generation(root_item, trans->transid);
btrfs_set_root_level(root_item, 0);
btrfs_set_root_refs(root_item, 1);
btrfs_set_root_used(root_item, leaf->len);
btrfs_set_root_last_snapshot(root_item, 0);

btrfs_set_root_generation_v2(root_item,
   btrfs_root_generation(root_item));
generate_random_guid(root_item->uuid);
btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
root_item->ctime = root_item->otime;
btrfs_set_root_ctransid(root_item, trans->transid);
btrfs_set_root_otransid(root_item, trans->transid);

btrfs_tree_unlock(leaf);

btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);

key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
    root_item);
if (ret) {
  int ret2;

  /*
   * Since we don't abort the transaction in this case, free the
   * tree block so that we don't leak space and leave the
   * filesystem in an inconsistent state (an extent item in the
   * extent tree with a backreference for a root that does not
   * exists).
   */
  btrfs_tree_lock(leaf);
  btrfs_clear_buffer_dirty(trans, leaf);
  btrfs_tree_unlock(leaf);
  ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
  if (ret2 < 0)
   btrfs_abort_transaction(trans, ret2);
  free_extent_buffer(leaf);
  goto out;
}

free_extent_buffer(leaf);
leaf = NULL;

new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
if (IS_ERR(new_root)) {
  ret = PTR_ERR(new_root);
  btrfs_abort_transaction(trans, ret);
  goto out;
}
/* anon_dev is owned by new_root now. */
anon_dev = 0;
BTRFS_I(new_inode_args.inode)->root = new_root;
/* ... and new_root is owned by new_inode_args.inode now. */

ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
  btrfs_abort_transaction(trans, ret);
  goto out;
}

ret = btrfs_uuid_tree_add(trans, root_item->uuid,
      BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
  btrfs_abort_transaction(trans, ret);
  goto out;
}

btrfs_record_new_subvolume(trans, BTRFS_I(dir));

ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
  btrfs_abort_transaction(trans, ret);
  goto out;
}

d_instantiate_new(dentry, new_inode_args.inode);
/* The dentry holds the inode now; make the iput() below a no-op. */
new_inode_args.inode = NULL;

out:
/* Detach the on-stack reservation before the handle is ended. */
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_end_transaction(trans);
out_release_rsv:
btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
if (qgroup_reserved)
  btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
iput(new_inode_args.inode);
out_anon_dev:
/* Only free the device number if it was not handed over to new_root. */
if (anon_dev)
  free_anon_bdev(anon_dev);
out_root_item:
kfree(root_item);
return ret;
}

/*
 * Create a snapshot of @root for @dentry inside directory @dir.
 *
 * A pending snapshot descriptor is filled in and attached to a transaction
 * handle; the snapshot itself is produced while that transaction is committed.
 * Afterwards the result is taken from the descriptor, orphan cleanup is run on
 * the new root and @dentry is instantiated with the inode found by lookup.
 *
 * Returns 0 on success or a negative errno: -EOPNOTSUPP with extent tree v2,
 * -ENOENT when @root has no references, -EINVAL when @root is not shareable,
 * -ETXTBSY when @root has active swapfiles, -ENOMEM on allocation failure, or
 * an error from one of the helpers called below.
 */
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
      struct dentry *dentry, bool readonly,
      struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv *block_rsv;
/* Qgroup reservation that still has to be freed on failure; 0 once converted. */
u64 qgroup_reserved = 0;
int ret;

/* We do not support snapshotting right now. */
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
  btrfs_warn(fs_info,
      "extent tree v2 doesn't support snapshotting yet");
  return -EOPNOTSUPP;
}

if (btrfs_root_refs(&root->root_item) == 0)
  return -ENOENT;

if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
  return -EINVAL;

if (atomic_read(&root->nr_swapfiles)) {
  btrfs_warn(fs_info,
      "cannot snapshot subvolume with active swapfile");
  return -ETXTBSY;
}

pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
if (!pending_snapshot)
  return -ENOMEM;

ret = get_anon_bdev(&pending_snapshot->anon_dev);
if (ret < 0)
  goto free_pending;
/* Both allocations are checked together right below. */
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
   GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
if (!pending_snapshot->root_item || !pending_snapshot->path) {
  ret = -ENOMEM;
  goto free_pending;
}

block_rsv = &pending_snapshot->block_rsv;
btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
 * 1 to add dir item
 * 1 to add dir index
 * 1 to update parent inode item
 */
trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
            trans_num_items, false);
if (ret)
  goto free_pending;
qgroup_reserved = block_rsv->qgroup_rsv_reserved;

pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
pending_snapshot->dir = BTRFS_I(dir);
pending_snapshot->inherit = inherit;

/* Space was reserved above, so the transaction itself reserves 0 items. */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
  ret = PTR_ERR(trans);
  goto fail;
}
ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
if (ret) {
  btrfs_end_transaction(trans);
  goto fail;
}
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
/* Converted: nothing left for the error path to free. */
qgroup_reserved = 0;

trans->pending_snapshot = pending_snapshot;

ret = btrfs_commit_transaction(trans);
if (ret)
  goto fail;

/* The commit succeeded; pick up the result stored in the descriptor. */
ret = pending_snapshot->error;
if (ret)
  goto fail;

ret = btrfs_orphan_cleanup(pending_snapshot->snap);
if (ret)
  goto fail;

inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
if (IS_ERR(inode)) {
  ret = PTR_ERR(inode);
  goto fail;
}

d_instantiate(dentry, inode);
ret = 0;
/* Success: clear the descriptor's copy so free_pending does not free it. */
pending_snapshot->anon_dev = 0;
fail:
/* Prevent double freeing of anon_dev */
if (ret && pending_snapshot->snap)
  pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
if (qgroup_reserved)
  btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
free_pending:
if (pending_snapshot->anon_dev)
  free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);

return ret;
}

/*
 * Local copy of may_delete() from fs/namei.c.
 *
 * Check whether we can remove a link @victim from directory @dir and whether
 * the type of @victim is the expected one:
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 *
 * Returns 0 if the removal is allowed, a negative errno otherwise.
 */
static int btrfs_may_delete(struct mnt_idmap *idmap,
			    struct inode *dir, struct dentry *victim, int isdir)
{
	struct inode *victim_inode;
	int ret;

	if (d_really_is_negative(victim))
		return -ENOENT;

	/* The @victim is not inside @dir. */
	if (d_inode(victim->d_parent) != dir)
		return -EINVAL;
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
	if (ret)
		return ret;
	if (IS_APPEND(dir))
		return -EPERM;

	victim_inode = d_inode(victim);
	if (check_sticky(idmap, dir, victim_inode))
		return -EPERM;
	if (IS_APPEND(victim_inode))
		return -EPERM;
	if (IS_IMMUTABLE(victim_inode))
		return -EPERM;
	if (IS_SWAPFILE(victim_inode))
		return -EPERM;

	/* The victim's type has to match what the caller asked to remove. */
	if (isdir && !d_is_dir(victim))
		return -ENOTDIR;
	if (isdir && IS_ROOT(victim))
		return -EBUSY;
	if (!isdir && d_is_dir(victim))
		return -EISDIR;

	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;

	return 0;
}

/*
 * Local copy of may_create() from fs/namei.c.
 *
 * Returns -EEXIST if @child is already positive, -ENOENT if @dir is dead,
 * -EOVERFLOW if the caller's fsuid/fsgid have no mapping in the filesystem,
 * and otherwise the result of the write+exec permission check on @dir.
 */
static inline int btrfs_may_create(struct mnt_idmap *idmap,
				   struct inode *dir, const struct dentry *child)
{
	int ret;

	if (d_really_is_positive(child))
		ret = -EEXIST;
	else if (IS_DEADDIR(dir))
		ret = -ENOENT;
	else if (!fsuidgid_has_mapping(dir->i_sb, idmap))
		ret = -EOVERFLOW;
	else
		ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);

	return ret;
}

/*
 * Create a new subvolume below @parent.  This is largely modeled after
 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 * inside this filesystem so it's quite a bit simpler.
 *
 * @parent:   dentry of the directory that will contain the new entry
 * @idmap:    idmap of the mount the request came through
 * @qname:    name of the new entry
 * @snap_src: root to snapshot, or NULL to create an empty subvolume
 * @readonly: passed to create_snapshot(); not used for plain subvolumes
 * @inherit:  optional qgroup inheritance description, may be NULL
 *
 * The parent directory's i_rwsem is held for writing and fs_info->subvol_sem
 * for reading while the subvolume or snapshot is created.
 *
 * Returns 0 on success or a negative errno. In particular -ENOENT is returned
 * when the root that contains @parent has no references left, instead of
 * reporting success without having created anything.
 */
static noinline int btrfs_mksubvol(struct dentry *parent,
				   struct mnt_idmap *idmap,
				   struct qstr *qname, struct btrfs_root *snap_src,
				   bool readonly,
				   struct btrfs_qgroup_inherit *inherit)
{
	struct inode *dir = d_inode(parent);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct dentry *dentry;
	struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
	int ret;

	ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
	if (ret == -EINTR)
		return ret;

	dentry = lookup_one(idmap, qname, parent);
	ret = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_unlock;

	ret = btrfs_may_create(idmap, dir, dentry);
	if (ret)
		goto out_dput;

	/*
	 * even if this name doesn't exist, we may get hash collisions.
	 * check for them now when we can safely fail
	 */
	ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str);
	if (ret)
		goto out_dput;

	down_read(&fs_info->subvol_sem);

	/*
	 * The root containing the parent directory has no references left.
	 * ret is 0 at this point, so set an error explicitly; otherwise the
	 * caller would be told the subvolume was created.
	 */
	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) {
		ret = -ENOENT;
		goto out_up_read;
	}

	if (snap_src)
		ret = create_snapshot(snap_src, dir, dentry, readonly, inherit);
	else
		ret = create_subvol(idmap, dir, dentry, inherit);

	if (!ret)
		fsnotify_mkdir(dir, dentry);
out_up_read:
	up_read(&fs_info->subvol_sem);
out_dput:
	dput(dentry);
out_unlock:
	btrfs_inode_unlock(BTRFS_I(dir), 0);
	return ret;
}

/*
 * Create a snapshot of @root named @qname below @parent.
 *
 * Wraps btrfs_mksubvol() with the preparation needed on the source root:
 * delalloc is started, new writes are forced to COW for the duration of the
 * creation, and already queued ordered extents are waited for. Returns 0 on
 * success or a negative errno.
 */
static noinline int btrfs_mksnapshot(struct dentry *parent,
				     struct mnt_idmap *idmap,
				     struct qstr *qname,
				     struct btrfs_root *root,
				     bool readonly,
				     struct btrfs_qgroup_inherit *inherit)
{
	int ret;

	/*
	 * Force new buffered writes to reserve space even when NOCOW is
	 * possible. This is to avoid later writeback (running dealloc) to
	 * fallback to COW mode and unexpectedly fail with ENOSPC.
	 */
	btrfs_drew_read_lock(&root->snapshot_lock);

	ret = btrfs_start_delalloc_snapshot(root, false);
	if (!ret) {
		/*
		 * All previous writes have started writeback in NOCOW mode,
		 * so now we force future writes to fallback to COW mode
		 * during snapshot creation.
		 */
		atomic_inc(&root->snapshot_force_cow);

		btrfs_wait_ordered_extents(root, U64_MAX, NULL);

		ret = btrfs_mksubvol(parent, idmap, qname, root, readonly,
				     inherit);

		atomic_dec(&root->snapshot_force_cow);
	}

	btrfs_drew_read_unlock(&root->snapshot_lock);

	return ret;
}

/*
 * Try to start exclusive operation @type or cancel it if it's running.
 *
 * Return:
 *   0          - normal mode, newly claimed op started
 *  >0          - normal mode, something else is running,
 *                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
 * -ECANCELED   - cancel mode, successful cancel
 * -ENOTCONN    - cancel mode, operation not running anymore
 */
static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
			enum btrfs_exclusive_operation type, bool cancel)
{
	if (cancel) {
		/* Something else is running or none */
		if (!btrfs_exclop_start_try_lock(fs_info, type))
			return -ENOTCONN;

		/*
		 * This blocks any exclop finish from setting it to NONE, so we
		 * request cancellation. Either it runs and we will wait for it,
		 * or it has finished and no waiting will happen.
		 */
		atomic_inc(&fs_info->reloc_cancel_req);
		btrfs_exclop_start_unlock(fs_info);

		if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
			wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
				    TASK_INTERRUPTIBLE);

		return -ECANCELED;
	}

	/* Normal mode: 0 means the exclusive operation is now claimed. */
	return btrfs_exclop_start(fs_info, type) ?
		0 : BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}

/*
 * Resize ioctl: grow or shrink one device of the filesystem.
 *
 * The request is the string in the name field of the struct
 * btrfs_ioctl_vol_args at @arg. As parsed below it has the form
 * "[devid:]size", where devid defaults to 1 and size is either "max" (the
 * full size of the block device) or a number understood by memparse(),
 * optionally prefixed with '+' or '-' to make it relative to the current
 * size. The string "cancel" asks for a running operation to be cancelled
 * instead.
 *
 * Returns 0 on success, a positive BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS when
 * another exclusive operation is running, or a negative errno.
 */
static noinline int btrfs_ioctl_resize(struct file *file,
     void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 new_size;
u64 old_size;
/* Device to resize when the request does not name one. */
u64 devid = 1;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_device *device = NULL;
char *sizestr;
char *devstr = NULL;
int ret = 0;
/* -1: shrink by the given amount, 1: grow by it, 0: absolute size. */
int mod = 0;
bool cancel;

if (!capable(CAP_SYS_ADMIN))
  return -EPERM;

ret = mnt_want_write_file(file);
if (ret)
  return ret;

/*
 * Read the arguments before checking exclusivity to be able to
 * distinguish regular resize and cancel
 */
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
  ret = PTR_ERR(vol_args);
  goto out_drop;
}
ret = btrfs_check_ioctl_vol_args_path(vol_args);
if (ret < 0)
  goto out_free;

sizestr = vol_args->name;
cancel = (strcmp("cancel", sizestr) == 0);
/*
 * In cancel mode the helper only returns non-zero values, so everything
 * below runs for a regular resize only.
 */
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
if (ret)
  goto out_free;
/* Exclusive operation is now claimed */

/* Split "devid:size" in place: terminate the devid part at the colon. */
devstr = strchr(sizestr, ':');
if (devstr) {
  sizestr = devstr + 1;
  *devstr = '\0';
  devstr = vol_args->name;
  ret = kstrtoull(devstr, 10, &devid);
  if (ret)
   goto out_finish;
  if (!devid) {
   ret = -EINVAL;
   goto out_finish;
  }
  btrfs_info(fs_info, "resizing devid %llu", devid);
}

args.devid = devid;
device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
  btrfs_info(fs_info, "resizer unable to find device %llu",
      devid);
  ret = -ENODEV;
  goto out_finish;
}

if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  btrfs_info(fs_info,
      "resizer unable to apply on readonly device %llu",
         devid);
  ret = -EPERM;
  goto out_finish;
}

if (!strcmp(sizestr, "max"))
  new_size = bdev_nr_bytes(device->bdev);
else {
  char *retptr;

  /* A leading sign makes the value relative to the current size. */
  if (sizestr[0] == '-') {
   mod = -1;
   sizestr++;
  } else if (sizestr[0] == '+') {
   mod = 1;
   sizestr++;
  }
  new_size = memparse(sizestr, &retptr);
  /* Reject trailing garbage and a size (or delta) of zero. */
  if (*retptr != '\0' || new_size == 0) {
   ret = -EINVAL;
   goto out_finish;
  }
}

if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
  ret = -EPERM;
  goto out_finish;
}

old_size = btrfs_device_get_total_bytes(device);

/* Turn a relative request into an absolute size, guarding against wrap. */
if (mod < 0) {
  if (new_size > old_size) {
   ret = -EINVAL;
   goto out_finish;
  }
  new_size = old_size - new_size;
} else if (mod > 0) {
  if (new_size > ULLONG_MAX - old_size) {
   ret = -ERANGE;
   goto out_finish;
  }
  new_size = old_size + new_size;
}

if (new_size < SZ_256M) {
  ret = -EINVAL;
  goto out_finish;
}
if (new_size > bdev_nr_bytes(device->bdev)) {
  ret = -EFBIG;
  goto out_finish;
}

new_size = round_down(new_size, fs_info->sectorsize);

if (new_size > old_size) {
  struct btrfs_trans_handle *trans;

  trans = btrfs_start_transaction(root, 0);
  if (IS_ERR(trans)) {
   ret = PTR_ERR(trans);
   goto out_finish;
  }
  ret = btrfs_grow_device(trans, device, new_size);
  /* NOTE(review): the result of the commit is not checked here. */
  btrfs_commit_transaction(trans);
} else if (new_size < old_size) {
  ret = btrfs_shrink_device(device, new_size);
} /* equal, nothing need to do */

if (ret == 0 && new_size != old_size)
  btrfs_info(fs_info,
   "resize device %s (devid %llu) from %llu to %llu",
   btrfs_dev_name(device), device->devid,
   old_size, new_size);
out_finish:
btrfs_exclop_finish(fs_info);
out_free:
kfree(vol_args);
out_drop:
mnt_drop_write_file(file);
return ret;
}

/*
 * Common back end of the subvolume/snapshot creation ioctls.
 *
 * @file:     open directory in which the new entry is created
 * @idmap:    idmap of the mount the request came through
 * @name:     NUL-terminated name of the new entry
 * @fd:       file descriptor of the snapshot source; only used if !@subvol
 * @subvol:   true to create an empty subvolume, false to create a snapshot
 * @readonly: forwarded to the creation helpers
 * @inherit:  optional qgroup inheritance description, may be NULL
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int __btrfs_ioctl_snap_create(struct file *file,
				struct mnt_idmap *idmap,
				const char *name, unsigned long fd, bool subvol,
				bool readonly,
				struct btrfs_qgroup_inherit *inherit)
{
	struct qstr qname = QSTR_INIT(name, strlen(name));
	int ret;

	if (!S_ISDIR(file_inode(file)->i_mode))
		return -ENOTDIR;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	/* The name must be a single path component. */
	if (strchr(name, '/')) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	/* "." and ".." always exist already. */
	if (qname.name[0] == '.') {
		if (qname.len == 1 ||
		    (qname.len == 2 && qname.name[1] == '.')) {
			ret = -EEXIST;
			goto out_drop_write;
		}
	}

	if (subvol) {
		ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL,
				     readonly, inherit);
	} else {
		/* Scoped fd reference; it is dropped when this block is left. */
		CLASS(fd, src)(fd);
		struct inode *src_inode;

		if (fd_empty(src)) {
			ret = -EINVAL;
			goto out_drop_write;
		}

		src_inode = file_inode(fd_file(src));

		if (src_inode->i_sb != file_inode(file)->i_sb) {
			btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
				   "Snapshot src from another FS");
			ret = -EXDEV;
			goto out_drop_write;
		}

		/*
		 * Subvolume creation is not restricted, but snapshots
		 * are limited to own subvolumes only
		 */
		if (!inode_owner_or_capable(idmap, src_inode)) {
			ret = -EPERM;
			goto out_drop_write;
		}

		/*
		 * Snapshots must be made with the src_inode referring
		 * to the subvolume inode, otherwise the permission
		 * checking above is useless because we may have
		 * permission on a lower directory but not the subvol
		 * itself.
		 */
		if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
			ret = -EINVAL;
			goto out_drop_write;
		}

		ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname,
				       BTRFS_I(src_inode)->root,
				       readonly, inherit);
	}
out_drop_write:
	mnt_drop_write_file(file);
	return ret;
}

/*
 * v1 subvolume/snapshot creation ioctl.
 *
 * Copies a struct btrfs_ioctl_vol_args from @arg, validates it with
 * btrfs_check_ioctl_vol_args_path() and hands name and fd over to
 * __btrfs_ioctl_snap_create(). The v1 interface always creates a writable
 * entry without qgroup inheritance.
 *
 * Returns -ENOTDIR if @file is not a directory, the error from copying or
 * validating the arguments, or the result of __btrfs_ioctl_snap_create().
 */
static noinline int btrfs_ioctl_snap_create(struct file *file,
					    void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args *vol_args;
	int ret;

	if (!S_ISDIR(file_inode(file)->i_mode))
		return -ENOTDIR;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);

	ret = btrfs_check_ioctl_vol_args_path(vol_args);
	if (ret >= 0)
		ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
						vol_args->name, vol_args->fd,
						subvol, false, NULL);

	kfree(vol_args);
	return ret;
}

/*
 * v2 subvolume/snapshot creation ioctl.
 *
 * Copies a struct btrfs_ioctl_vol_args_v2 from @arg, validates the name and
 * the flags, optionally copies and checks the qgroup inheritance description
 * and then calls __btrfs_ioctl_snap_create().
 *
 * Returns -ENOTDIR if @file is not a directory, -EOPNOTSUPP for flags outside
 * BTRFS_SUBVOL_CREATE_ARGS_MASK, -EINVAL for an inherit size that is smaller
 * than the structure or larger than a page, another negative errno from the
 * helpers, or the result of __btrfs_ioctl_snap_create().
 */
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
					       void __user *arg, bool subvol)
{
	struct btrfs_qgroup_inherit *inherit = NULL;
	struct btrfs_ioctl_vol_args_v2 *vol_args;
	bool readonly;
	int ret;

	if (!S_ISDIR(file_inode(file)->i_mode))
		return -ENOTDIR;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);

	ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
	if (ret < 0)
		goto free_args;

	if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
		ret = -EOPNOTSUPP;
		goto free_args;
	}

	readonly = !!(vol_args->flags & BTRFS_SUBVOL_RDONLY);

	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
		struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));

		/* The user supplied size must cover the header and fit a page. */
		if (vol_args->size < sizeof(*inherit) ||
		    vol_args->size > PAGE_SIZE) {
			ret = -EINVAL;
			goto free_args;
		}

		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
		if (IS_ERR(inherit)) {
			/* Nothing was allocated, so skip the kfree of inherit. */
			ret = PTR_ERR(inherit);
			goto free_args;
		}

		ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
		if (ret < 0)
			goto free_inherit;
	}

	ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
					vol_args->name, vol_args->fd, subvol,
					readonly, inherit);

free_inherit:
	kfree(inherit);
free_args:
	kfree(vol_args);
	return ret;
}

/*
 * Report the subvolume flags of the root that @inode belongs to.
 *
 * Only BTRFS_SUBVOL_RDONLY is reported; it is sampled under subvol_sem held
 * for reading and written to @arg as a u64.
 *
 * Returns 0 on success, -EINVAL if @inode's number is not
 * BTRFS_FIRST_FREE_OBJECTID, or -EFAULT if the copy to user space fails.
 */
static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
						void __user *arg)
{
	struct btrfs_root *subvol_root = inode->root;
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	bool is_readonly;
	u64 flags;

	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
		return -EINVAL;

	down_read(&fs_info->subvol_sem);
	is_readonly = btrfs_root_readonly(subvol_root);
	up_read(&fs_info->subvol_sem);

	flags = is_readonly ? BTRFS_SUBVOL_RDONLY : 0;
	if (copy_to_user(arg, &flags, sizeof(flags)))
		return -EFAULT;

	return 0;
}

/*
 * Set or clear the read-only flag of the subvolume that @file refers to.
 *
 * @file: open file; the caller must own the inode or be capable, and the
 *        inode number must be BTRFS_FIRST_FREE_OBJECTID
 * @arg:  user pointer to a u64 flag word; any bit other than
 *        BTRFS_SUBVOL_RDONLY is rejected with -EOPNOTSUPP
 *
 * The flag is changed in the in-memory root item first, then written with
 * btrfs_update_root() and made durable by committing the transaction. If any
 * of those steps fails the in-memory flags are restored to their old value.
 *
 * Returns 0 on success (also when the flag already has the requested state)
 * or a negative errno.
 */
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
           void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
u64 flags;
int ret = 0;

if (!inode_owner_or_capable(file_mnt_idmap(file), inode))
  return -EPERM;

ret = mnt_want_write_file(file);
if (ret)
  goto out;

if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
  ret = -EINVAL;
  goto out_drop_write;
}

if (copy_from_user(&flags, arg, sizeof(flags))) {
  ret = -EFAULT;
  goto out_drop_write;
}

if (flags & ~BTRFS_SUBVOL_RDONLY) {
  ret = -EOPNOTSUPP;
  goto out_drop_write;
}

/* subvol_sem is held for writing while the flag is inspected and changed. */
down_write(&fs_info->subvol_sem);

/* nothing to do */
if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
  goto out_drop_sem;

/* Remember the old flags so that a later failure can restore them. */
root_flags = btrfs_root_flags(&root->root_item);
if (flags & BTRFS_SUBVOL_RDONLY) {
  btrfs_set_root_flags(&root->root_item,
         root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
} else {
  /*
* Block RO -> RW transition if this subvolume is involved in
* send
*/
  spin_lock(&root->root_item_lock);
  if (root->send_in_progress == 0) {
   btrfs_set_root_flags(&root->root_item,
         root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
   spin_unlock(&root->root_item_lock);
  } else {
   spin_unlock(&root->root_item_lock);
   btrfs_warn(fs_info,
       "Attempt to set subvolume %llu read-write during send",
       btrfs_root_id(root));
   ret = -EPERM;
   goto out_drop_sem;
  }
}

trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
  ret = PTR_ERR(trans);
  goto out_reset;
}

/* Write the modified root item into the tree root. */
ret = btrfs_update_root(trans, fs_info->tree_root,
    &root->root_key, &root->root_item);
if (ret < 0) {
  btrfs_end_transaction(trans);
  goto out_reset;
}

ret = btrfs_commit_transaction(trans);

out_reset:
/* Undo the in-memory change if it could not be written or committed. */
if (ret)
  btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
up_write(&fs_info->subvol_sem);
out_drop_write:
mnt_drop_write_file(file);
out:
return ret;
}

/*
 * Check whether @key lies inside the key range described by @sk.
 *
 * The range is inclusive on both ends: from (min_objectid, min_type,
 * min_offset) up to (max_objectid, max_type, max_offset), compared with
 * btrfs_comp_cpu_keys().
 *
 * Returns true if @key is inside the range, false otherwise.
 */
static noinline bool key_in_sk(const struct btrfs_key *key,
			       const struct btrfs_ioctl_search_key *sk)
{
	struct btrfs_key lower = {
		.objectid = sk->min_objectid,
		.type = sk->min_type,
		.offset = sk->min_offset,
	};
	struct btrfs_key upper = {
		.objectid = sk->max_objectid,
		.type = sk->max_type,
		.offset = sk->max_offset,
	};

	/* The upper bound is only compared when the lower bound is satisfied. */
	return btrfs_comp_cpu_keys(key, &lower) >= 0 &&
	       btrfs_comp_cpu_keys(key, &upper) <= 0;
}

/*
 * Copy the items of one leaf that match a search key to the user buffer.
 *
 * @path:      path whose nodes[0]/slots[0] give the leaf and the first slot
 *             to look at
 * @key:       in/out; set to the key of each visited item and, once the leaf
 *             is exhausted, advanced to the next key to search for
 * @sk:        search key with the key range, the maximum transid and the
 *             maximum number of items
 * @buf_size:  in: size of the user buffer; out: on -EOVERFLOW the size that
 *             is needed for the item that did not fit
 * @ubuf:      start of the user buffer
 * @sk_offset: in/out; number of bytes of @ubuf that are already used
 * @num_found: in/out; number of items copied so far
 *
 * Every result is a struct btrfs_ioctl_search_header followed by the item
 * data. User memory is written with the nofault helpers only; if such a copy
 * fails the function returns 0 without counting the item. See the comment at
 * the "out" label for the meaning of the return values.
 */
static noinline int copy_to_sk(struct btrfs_path *path,
          struct btrfs_key *key,
          const struct btrfs_ioctl_search_key *sk,
          u64 *buf_size,
          char __user *ubuf,
          unsigned long *sk_offset,
          int *num_found)
{
u64 found_transid;
struct extent_buffer *leaf;
struct btrfs_ioctl_search_header sh;
struct btrfs_key test;
unsigned long item_off;
unsigned long item_len;
int nritems;
int i;
int slot;
int ret = 0;

leaf = path->nodes[0];
slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);

/* The leaf is newer than requested: skip it and only advance the key. */
if (btrfs_header_generation(leaf) > sk->max_transid) {
  i = nritems;
  goto advance_key;
}
found_transid = btrfs_header_generation(leaf);

for (i = slot; i < nritems; i++) {
  item_off = btrfs_item_ptr_offset(leaf, i);
  item_len = btrfs_item_size(leaf, i);

  btrfs_item_key_to_cpu(leaf, key, i);
  if (!key_in_sk(key, sk))
   continue;

  /* Header plus item is larger than the whole user buffer. */
  if (sizeof(sh) + item_len > *buf_size) {
   if (*num_found) {
    ret = 1;
    goto out;
   }

   /*
* return one empty item back for v1, which does not
* handle -EOVERFLOW
*/

   *buf_size = sizeof(sh) + item_len;
   item_len = 0;
   ret = -EOVERFLOW;
  }

  /* Not enough room left behind the results copied so far. */
  if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
   ret = 1;
   goto out;
  }

  sh.objectid = key->objectid;
  sh.type = key->type;
  sh.offset = key->offset;
  sh.len = item_len;
  sh.transid = found_transid;

  /*
* Copy search result header. If we fault then loop again so we
* can fault in the pages and -EFAULT there if there's a
* problem. Otherwise we'll fault and then copy the buffer in
* properly this next time through
*/
  if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
   ret = 0;
   goto out;
  }

  *sk_offset += sizeof(sh);

  if (item_len) {
   char __user *up = ubuf + *sk_offset;
   /*
* Copy the item, same behavior as above, but rewind
* *sk_offset so that the header is copied again as well.
*/
   if (read_extent_buffer_to_user_nofault(leaf, up,
      item_off, item_len)) {
    ret = 0;
    *sk_offset -= sizeof(sh);
    goto out;
   }

   *sk_offset += item_len;
  }
  (*num_found)++;

  if (ret) /* -EOVERFLOW from above */
   goto out;

  if (*num_found >= sk->nr_items) {
   ret = 1;
   goto out;
  }
}
advance_key:
/*
 * Stop if the key already reached the upper bound of the search range,
 * otherwise step it to the next possible key: bump the offset, on overflow
 * the type, on overflow the objectid.
 */
ret = 0;
test.objectid = sk->max_objectid;
test.type = sk->max_type;
test.offset = sk->max_offset;
if (btrfs_comp_cpu_keys(key, &test) >= 0)
  ret = 1;
else if (key->offset < (u64)-1)
  key->offset++;
else if (key->type < (u8)-1) {
  key->offset = 0;
  key->type++;
} else if (key->objectid < (u64)-1) {
  key->offset = 0;
  key->type = 0;
  key->objectid++;
} else
  ret = 1;
out:
/*
*  0: all items from this leaf copied, continue with next; also returned
*     when a nofault copy to user space failed, so that search_ioctl() can
*     fault the pages in and retry
*  1: * more items can be copied, but unused buffer is too small
*     * all items were found
*     Either way, it stops the loop which iterates to the next
*     leaf
*  -EOVERFLOW: item was too large for buffer
*  NOTE(review): -EFAULT is not produced here; both copy failure paths
*  above set ret to 0.
*/
return ret;
}

/*
 * Walk a tree and copy all items matching @sk into the user buffer.
 *
 * @root:     root to search when sk->tree_id is 0, otherwise only used to
 *            reach the fs_info for looking up the root named by sk->tree_id
 * @sk:       search key; nr_items is overwritten with the number of items
 *            that were copied
 * @buf_size: in: size of @ubuf; out: required size when -EOVERFLOW is
 *            returned
 * @ubuf:     user buffer receiving the search headers and item data
 *
 * Returns 0 on success, -EOVERFLOW if the buffer is too small, -EFAULT if
 * the user buffer cannot be faulted in, -ENOMEM, or another negative errno
 * from the root lookup or the tree search.
 */
static noinline int search_ioctl(struct btrfs_root *root,
				 struct btrfs_ioctl_search_key *sk,
				 u64 *buf_size,
				 char __user *ubuf)
{
	struct btrfs_fs_info *info = root->fs_info;
	unsigned long sk_offset = 0;
	struct btrfs_path *path;
	struct btrfs_key key;
	int num_found = 0;
	int ret;

	/* The buffer must be able to hold at least one search header. */
	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
		*buf_size = sizeof(struct btrfs_ioctl_search_header);
		return -EOVERFLOW;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (sk->tree_id != 0) {
		/* Look up the root from the arguments. */
		root = btrfs_get_fs_root(info, sk->tree_id, true);
		if (IS_ERR(root)) {
			btrfs_free_path(path);
			return PTR_ERR(root);
		}
	} else {
		/* Search the root that we got passed. */
		root = btrfs_grab_root(root);
	}

	key.objectid = sk->min_objectid;
	key.type = sk->min_type;
	key.offset = sk->min_offset;

	do {
		/*
		 * Ensure that the whole user buffer is faulted in at sub-page
		 * granularity, otherwise the loop may live-lock.
		 */
		if (fault_in_subpage_writeable(ubuf + sk_offset,
					       *buf_size - sk_offset)) {
			ret = -EFAULT;
			break;
		}

		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
		if (ret)
			break;

		ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
				 &sk_offset, &num_found);
		btrfs_release_path(path);
	} while (ret == 0);

	/* Positive values only mean "stop searching", they are not errors. */
	if (ret > 0)
		ret = 0;

	sk->nr_items = num_found;
	btrfs_put_root(root);
	btrfs_free_path(path);
	return ret;
}

/*
 * v1 tree search ioctl, restricted to CAP_SYS_ADMIN.
 *
 * The search key is read from and written back to the key member of the user
 * supplied struct btrfs_ioctl_search_args; results go into its fixed size
 * buf member.
 *
 * Returns 0 on success, -EPERM, -EFAULT, or a negative errno from
 * search_ioctl(). -EOVERFLOW from search_ioctl() is reported as success.
 */
static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root,
					    void __user *argp)
{
	struct btrfs_ioctl_search_args __user *uargs = argp;
	struct btrfs_ioctl_search_key sk;
	u64 buf_size = sizeof(uargs->buf);
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
		return -EFAULT;

	ret = search_ioctl(root, &sk, &buf_size, uargs->buf);

	/*
	 * In the original implementation an overflow is handled by returning
	 * a search header with a len of zero, so it is not treated as an
	 * error here.
	 */
	if (ret != 0 && ret != -EOVERFLOW)
		return ret;

	if (copy_to_user(&uargs->key, &sk, sizeof(sk)))
		return -EFAULT;

	return 0;
}

/*
 * v2 tree search ioctl with a caller sized buffer, restricted to
 * CAP_SYS_ADMIN.
 *
 * The buffer size given by user space is capped at 16MiB. On success the
 * search key is copied back; on -EOVERFLOW the required buffer size is
 * written to the buf_size member instead.
 *
 * Returns 0 on success, -EPERM, -EFAULT, -EOVERFLOW, or another negative
 * errno from search_ioctl().
 */
static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root,
					       void __user *argp)
{
	struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
	struct btrfs_ioctl_search_args_v2 args;
	const u64 buf_limit = SZ_16M;
	u64 buf_size;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Copy search header and buffer size. */
	if (copy_from_user(&args, uarg, sizeof(args)))
		return -EFAULT;

	/* Limit result size to 16MB. */
	buf_size = (args.buf_size > buf_limit) ? buf_limit : args.buf_size;

	ret = search_ioctl(root, &args.key, &buf_size,
			   (char __user *)(&uarg->buf[0]));
	if (ret == 0) {
		if (copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
			ret = -EFAULT;
	} else if (ret == -EOVERFLOW) {
		if (copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
			ret = -EFAULT;
	}

	return ret;
}

/*
 * Build the path name of directory 'dirid' inside tree 'tree_id' by following
 * INODE_REF items upwards and store it in 'name'.
 *
 * @info:    filesystem the tree belongs to
 * @tree_id: id of the root to search in
 * @dirid:   inode number of the directory to resolve
 * @name:    output buffer; indexed up to BTRFS_INO_LOOKUP_PATH_MAX - 1 here,
 *           so it must be at least BTRFS_INO_LOOKUP_PATH_MAX bytes large
 *
 * Each path component is followed by '/'. For BTRFS_FIRST_FREE_OBJECTID the
 * result is the empty string.
 *
 * Returns 0 on success, -ENOMEM, -ENOENT if an INODE_REF is missing,
 * -ENAMETOOLONG if the path does not fit, or another negative errno from the
 * root lookup or the tree search.
 */
static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
    u64 tree_id, u64 dirid, char *name)
{
struct btrfs_root *root;
struct btrfs_key key;
char *ptr;
int ret = -1;
int slot;
int len;
int total_len = 0;
struct btrfs_inode_ref *iref;
struct extent_buffer *l;
struct btrfs_path *path;

if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
  name[0]='\0';
  return 0;
}

path = btrfs_alloc_path();
if (!path)
  return -ENOMEM;

/* The path is assembled backwards, starting at the end of the buffer. */
ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];

root = btrfs_get_fs_root(info, tree_id, true);
if (IS_ERR(root)) {
  ret = PTR_ERR(root);
  root = NULL;
  goto out;
}

key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;

while (1) {
  ret = btrfs_search_backwards(root, &key, path);
  if (ret < 0)
   goto out;
  else if (ret > 0) {
   ret = -ENOENT;
   goto out;
  }

  l = path->nodes[0];
  slot = path->slots[0];

  iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
  len = btrfs_inode_ref_name_len(l, iref);
  /* Reserve room for this component and its trailing '/'. */
  ptr -= len + 1;
  total_len += len + 1;
  if (ptr < name) {
   ret = -ENAMETOOLONG;
   goto out;
  }

  *(ptr + len) = '/';
  /* The name bytes are read from directly behind the inode ref. */
  read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);

  /*
   * key.offset is presumably the parent directory taken from the
   * INODE_REF key found by btrfs_search_backwards() - TODO confirm.
   * Stop once the parent is BTRFS_FIRST_FREE_OBJECTID.
   */
  if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
   break;

  /* Continue with the parent directory. */
  btrfs_release_path(path);
  key.objectid = key.offset;
  key.offset = (u64)-1;
  dirid = key.objectid;
}
/* Move the assembled path to the start of the buffer and terminate it. */
memmove(name, ptr, total_len);
name[total_len] = '\0';
ret = 0;
out:
btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}

/*
 * Permission checked path lookup used by the unprivileged ino_lookup ioctl.
 *
 * @idmap: idmap used for the inode_permission() checks
 * @inode: inode the ioctl was called on; its inode number is the upper limit
 *         of the path walk and its root is the tree that is searched
 * @args:  in: dirid and treeid; out: path and name
 *
 * If args->dirid differs from @inode's number, the path from @inode down to
 * args->dirid is built from INODE_REF items into args->path, requiring
 * read+exec permission on every directory visited. Afterwards the ROOT_REF
 * item keyed by (root id of @inode, args->treeid) is looked up in the tree
 * root and its name is copied to args->name.
 *
 * Returns 0 on success, -ENOMEM, -ENOENT, -ENAMETOOLONG, -EACCES, -EINVAL if
 * the dirid stored in the ROOT_REF does not match args->dirid, or another
 * negative errno from the helpers.
 */
static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
    struct inode *inode,
    struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 upper_limit = btrfs_ino(BTRFS_I(inode));
u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
u64 dirid = args->dirid;
unsigned long item_off;
unsigned long item_len;
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct btrfs_root *root = NULL;
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
char *ptr;
int slot;
int len;
int total_len = 0;
int ret;

path = btrfs_alloc_path();
if (!path)
  return -ENOMEM;

/*
* If the bottom subvolume does not exist directly under upper_limit,
* construct the path in from the bottom up.
*/
if (dirid != upper_limit) {
  /* The path is assembled backwards from the end of args->path. */
  ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];

  root = btrfs_get_fs_root(fs_info, treeid, true);
  if (IS_ERR(root)) {
   ret = PTR_ERR(root);
   goto out;
  }

  key.objectid = dirid;
  key.type = BTRFS_INODE_REF_KEY;
  key.offset = (u64)-1;
  while (1) {
   struct btrfs_inode *temp_inode;

   ret = btrfs_search_backwards(root, &key, path);
   if (ret < 0)
    goto out_put;
   else if (ret > 0) {
    ret = -ENOENT;
    goto out_put;
   }

   leaf = path->nodes[0];
   slot = path->slots[0];

   iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
   len = btrfs_inode_ref_name_len(leaf, iref);
   /* Reserve room for this component and its trailing '/'. */
   ptr -= len + 1;
   total_len += len + 1;
   if (ptr < args->path) {
    ret = -ENAMETOOLONG;
    goto out_put;
   }

   *(ptr + len) = '/';
   read_extent_buffer(leaf, ptr,
     (unsigned long)(iref + 1), len);

   /* Check the read+exec permission of this directory */
   ret = btrfs_previous_item(root, path, dirid,
        BTRFS_INODE_ITEM_KEY);
   if (ret < 0) {
    goto out_put;
   } else if (ret > 0) {
    ret = -ENOENT;
    goto out_put;
   }

   leaf = path->nodes[0];
   slot = path->slots[0];
   btrfs_item_key_to_cpu(leaf, &key2, slot);
   if (key2.objectid != dirid) {
    ret = -ENOENT;
    goto out_put;
   }

   /*
* We don't need the path anymore, so release it and
* avoid deadlocks and lockdep warnings in case
* btrfs_iget() needs to lookup the inode from its root
* btree and lock the same leaf.
*/
   btrfs_release_path(path);
   temp_inode = btrfs_iget(key2.objectid, root);
   if (IS_ERR(temp_inode)) {
    ret = PTR_ERR(temp_inode);
    goto out_put;
   }
   ret = inode_permission(idmap, &temp_inode->vfs_inode,
            MAY_READ | MAY_EXEC);
   iput(&temp_inode->vfs_inode);
   /* Any permission failure is reported as -EACCES. */
   if (ret) {
    ret = -EACCES;
    goto out_put;
   }

   /* Reached the directory the ioctl was called on: path is complete. */
   if (key.offset == upper_limit)
    break;
   /* Reached BTRFS_FIRST_FREE_OBJECTID without meeting upper_limit. */
   if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
    ret = -EACCES;
    goto out_put;
   }

   /* Continue with the parent directory. */
   key.objectid = key.offset;
   key.offset = (u64)-1;
   dirid = key.objectid;
  }

  /* Move the assembled path to the start of the buffer and terminate it. */
  memmove(args->path, ptr, total_len);
  args->path[total_len] = '\0';
  btrfs_put_root(root);
  root = NULL;
  btrfs_release_path(path);
}

/* Get the bottom subvolume's name from ROOT_REF */
key.objectid = treeid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
  goto out;
} else if (ret > 0) {
  ret = -ENOENT;
  goto out;
}

leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);

item_off = btrfs_item_ptr_offset(leaf, slot);
item_len = btrfs_item_size(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
  ret = -EINVAL;
  goto out;
}

/* Copy subvolume's name */
/* The name is stored in the item directly behind struct btrfs_root_ref. */
item_off += sizeof(struct btrfs_root_ref);
item_len -= sizeof(struct btrfs_root_ref);
read_extent_buffer(leaf, args->name, item_off, item_len);
args->name[item_len] = 0;

out_put:
/* root is NULL here unless the path walk above bailed out early. */
btrfs_put_root(root);
out:
btrfs_free_path(path);
return ret;
}

/*
 * Resolve an inode number to its path inside a tree.
 *
 * A treeid of 0 is replaced by the id of @root. Asking for
 * BTRFS_FIRST_FREE_OBJECTID yields an empty name and needs no privileges;
 * every other lookup requires CAP_SYS_ADMIN and is done by
 * btrfs_search_path_in_tree().
 *
 * Returns 0 on success, -EPERM, -EFAULT, or a negative errno from copying the
 * arguments or from the path lookup.
 */
static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
					   void __user *argp)
{
	struct btrfs_ioctl_ino_lookup_args *args;
	int ret;

	args = memdup_user(argp, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	/*
	 * Unprivileged query to obtain the containing subvolume root id. The
	 * path is reset so it's consistent with btrfs_search_path_in_tree.
	 */
	if (args->treeid == 0)
		args->treeid = btrfs_root_id(root);

	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
		args->name[0] = 0;
		ret = 0;
	} else if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
	} else {
		ret = btrfs_search_path_in_tree(root->fs_info, args->treeid,
						args->objectid, args->name);
	}

	/* The arguments are only copied back when the lookup succeeded. */
	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
		ret = -EFAULT;

	kfree(args);
	return ret;
}

/*
 * Unprivileged version of the ino_lookup ioctl.
 *
 * Differences from the ino_lookup ioctl:
 *
 *   1. Read + exec permission is checked with inode_permission() while the
 *      path is constructed; -EACCES is returned if a check fails.
 *   2. Path construction stops at the inode of the fd this ioctl is called
 *      on. If the constructed path does not exist under that inode, -EACCES
 *      is returned.
 *   3. The name of the bottom subvolume is looked up and filled in as well.
 *
 * Returns 0 on success, -EACCES, -EFAULT, or a negative errno from copying
 * the arguments or from btrfs_search_path_in_tree_user().
 */
static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_ioctl_ino_lookup_user_args *args;
	int ret;

	args = memdup_user(argp, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	/*
	 * The subvolume does not exist under the fd with which this is
	 * called.
	 */
	if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
	    btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EACCES;
		goto out_free;
	}

	ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args);
	if (!ret && copy_to_user(argp, args, sizeof(*args)))
		ret = -EFAULT;

out_free:
	kfree(args);
	return ret;
}

/*
 * Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF
 *
 * @inode: inode inside the subvolume to report on
 * @argp:  user pointer receiving a struct btrfs_ioctl_get_subvol_info_args
 *
 * Ids, flags, uuids, transids and timestamps are taken from the in-memory
 * root item of @inode's root. For every root other than
 * BTRFS_FS_TREE_OBJECTID the parent id, the dirid and the name are read from
 * the ROOT_BACKREF item in the tree root.
 *
 * Returns 0 on success, -ENOMEM, -ENOENT if no ROOT_BACKREF is found,
 * -EUCLEAN if the tree root ends before one is found, -EFAULT if the copy to
 * user space fails, or another negative errno from the helpers.
 */
static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
struct btrfs_ioctl_get_subvol_info_args *subvol_info;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_root_item *root_item;
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
unsigned long item_off;
unsigned long item_len;
int slot;
int ret = 0;

path = btrfs_alloc_path();
if (!path)
  return -ENOMEM;

/* Zeroed, so fields that are not filled in below are reported as 0. */
subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
if (!subvol_info) {
  btrfs_free_path(path);
  return -ENOMEM;
}

fs_info = BTRFS_I(inode)->root->fs_info;

/* Get root_item of inode's subvolume */
key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
root = btrfs_get_fs_root(fs_info, key.objectid, true);
if (IS_ERR(root)) {
  ret = PTR_ERR(root);
  goto out_free;
}
root_item = &root->root_item;

subvol_info->treeid = key.objectid;

subvol_info->generation = btrfs_root_generation(root_item);
subvol_info->flags = btrfs_root_flags(root_item);

memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
          BTRFS_UUID_SIZE);
memcpy(subvol_info->received_uuid, root_item->received_uuid,
          BTRFS_UUID_SIZE);

subvol_info->ctransid = btrfs_root_ctransid(root_item);
subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);

subvol_info->otransid = btrfs_root_otransid(root_item);
subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);

subvol_info->stransid = btrfs_root_stransid(root_item);
subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);

subvol_info->rtransid = btrfs_root_rtransid(root_item);
subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);

if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
  /* Search root tree for ROOT_BACKREF of this subvolume */
  key.type = BTRFS_ROOT_BACKREF_KEY;
  key.offset = 0;
  ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
  if (ret < 0) {
   goto out;
  } else if (path->slots[0] >=
      btrfs_header_nritems(path->nodes[0])) {
   /* The slot is past the end of this leaf, move to the next one. */
   ret = btrfs_next_leaf(fs_info->tree_root, path);
   if (ret < 0) {
    goto out;
   } else if (ret > 0) {
    ret = -EUCLEAN;
    goto out;
   }
  }

  leaf = path->nodes[0];
  slot = path->slots[0];
  btrfs_item_key_to_cpu(leaf, &key, slot);
  if (key.objectid == subvol_info->treeid &&
      key.type == BTRFS_ROOT_BACKREF_KEY) {
   /* The offset of the ROOT_BACKREF key is reported as the parent id. */
   subvol_info->parent_id = key.offset;

   rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
   subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);

   /* The name is stored in the item behind struct btrfs_root_ref. */
   item_off = btrfs_item_ptr_offset(leaf, slot)
     + sizeof(struct btrfs_root_ref);
   item_len = btrfs_item_size(leaf, slot)
     - sizeof(struct btrfs_root_ref);
   read_extent_buffer(leaf, subvol_info->name,
        item_off, item_len);
  } else {
   ret = -ENOENT;
   goto out;
  }
}

/* Free the path before copying the result to user space. */
btrfs_free_path(path);
path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
  ret = -EFAULT;

out:
btrfs_put_root(root);
out_free:
btrfs_free_path(path);
kfree(subvol_info);
return ret;
}

/*
 * Return ROOT_REF information of the subvolume containing this inode
 * except the subvolume name.
 *
 * @root: root whose ROOT_REF items are listed
 * @argp: user pointer to a struct btrfs_ioctl_get_subvol_rootref_args;
 *        min_treeid is read as the first key offset to look at and is updated
 *        for a follow-up call
 *
 * At most BTRFS_MAX_ROOTREF_BUFFER_NUM entries are returned per call. When
 * more exist, -EOVERFLOW is returned and the entries collected so far are
 * still copied to user space.
 *
 * Returns 0 on success, -EOVERFLOW, -ENOMEM, -EFAULT, -EUCLEAN if the tree
 * root ends while iterating, or another negative errno from the helpers.
 */
static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
       void __user *argp)
{
struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
struct btrfs_root_ref *rref;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
u64 objectid;
int slot;
int ret;
u8 found;

path = btrfs_alloc_path();
if (!path)
  return -ENOMEM;

rootrefs = memdup_user(argp, sizeof(*rootrefs));
if (IS_ERR(rootrefs)) {
  btrfs_free_path(path);
  return PTR_ERR(rootrefs);
}

objectid = btrfs_root_id(root);
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
found = 0;

/* From here on "root" is the tree root that stores the ROOT_REF items. */
root = root->fs_info->tree_root;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
  goto out;
} else if (path->slots[0] >=
     btrfs_header_nritems(path->nodes[0])) {
  /* The slot is past the end of this leaf, move to the next one. */
  ret = btrfs_next_leaf(root, path);
  if (ret < 0) {
   goto out;
  } else if (ret > 0) {
   ret = -EUCLEAN;
   goto out;
  }
}
while (1) {
  leaf = path->nodes[0];
  slot = path->slots[0];

  btrfs_item_key_to_cpu(leaf, &key, slot);
  /* First key that is not a ROOT_REF of this root: the list is complete. */
  if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
   ret = 0;
   goto out;
  }

  /* Another entry exists but the result array is already full. */
  if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
   ret = -EOVERFLOW;
   goto out;
  }

  rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
  rootrefs->rootref[found].treeid = key.offset;
  rootrefs->rootref[found].dirid =
      btrfs_root_ref_dirid(leaf, rref);
  found++;

  ret = btrfs_next_item(root, path);
  if (ret < 0) {
   goto out;
  } else if (ret > 0) {
   ret = -EUCLEAN;
   goto out;
  }
}

out:
btrfs_free_path(path);

/* Partial results are handed back on -EOVERFLOW as well. */
if (!ret || ret == -EOVERFLOW) {
  rootrefs->num_items = found;
  /* update min_treeid for next search */
  if (found)
   rootrefs->min_treeid =
    rootrefs->rootref[found - 1].treeid + 1;
  if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
   ret = -EFAULT;
}

kfree(rootrefs);

return ret;
}

/*
 * Delete a subvolume or snapshot, for both the v1 and the v2 ioctl.
 *
 * @file:       open file of the directory the ioctl was issued on; by default
 *              its dentry is the parent in which the name is looked up
 * @arg:        user pointer to a struct btrfs_ioctl_vol_args (v1) or a
 *              struct btrfs_ioctl_vol_args_v2 (v2)
 * @destroy_v2: true when called for the v2 ioctl
 *
 * With v2 and BTRFS_SUBVOL_SPEC_BY_ID the subvolume is identified by
 * subvolid; parent dentry and name are then derived from that id. In all
 * other cases the subvolume is identified by name inside @file's directory.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
          void __user *arg,
          bool destroy_v2)
{
struct dentry *parent = file->f_path.dentry;
struct dentry *dentry;
struct inode *dir = d_inode(parent);
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args = NULL;
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
int ret = 0;
bool destroy_parent = false;

/* We don't support snapshots with extent tree v2 yet. */
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
  btrfs_err(fs_info,
     "extent tree v2 doesn't support snapshot deletion yet");
  return -EOPNOTSUPP;
}

if (destroy_v2) {
  vol_args2 = memdup_user(arg, sizeof(*vol_args2));
  if (IS_ERR(vol_args2))
   return PTR_ERR(vol_args2);

  if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
   ret = -EOPNOTSUPP;
   goto out;
  }

  /*
* If SPEC_BY_ID is not set, we are looking for the subvolume by
* name, same as v1 currently does.
*/
  if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
   ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
   if (ret < 0)
    goto out;
   subvol_name = vol_args2->name;

   ret = mnt_want_write_file(file);
   if (ret)
    goto out;
  } else {
   struct inode *old_dir;

   if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
    ret = -EINVAL;
    goto out;
   }

   ret = mnt_want_write_file(file);
   if (ret)
    goto out;

   dentry = btrfs_get_dentry(fs_info->sb,
     BTRFS_FIRST_FREE_OBJECTID,
     vol_args2->subvolid, 0);
   if (IS_ERR(dentry)) {
    ret = PTR_ERR(dentry);
    goto out_drop_write;
   }

   /*
* Change the default parent since the subvolume being
* deleted can be outside of the current mount point.
*/
   parent = btrfs_get_parent(dentry);

   /*
* At this point dentry->d_name can point to '/' if the
* subvolume we want to destroy is outside of the
* current mount point, so we need to release the
* current dentry and execute the lookup to return a new
* one with ->d_name pointing to the
* <mount point>/subvol_name.
*/
   dput(dentry);
   if (IS_ERR(parent)) {
    ret = PTR_ERR(parent);
    goto out_drop_write;
   }
   old_dir = dir;
   dir = d_inode(parent);

   /*
* If v2 was used with SPEC_BY_ID, a new parent was
* allocated since the subvolume can be outside of the
* current mount point. Later on we need to release this
* new parent dentry.
*/
   destroy_parent = true;

   /*
* On idmapped mounts, deletion via subvolid is
* restricted to subvolumes that are immediate
* ancestors of the inode referenced by the file
* descriptor in the ioctl. Otherwise the idmapping
* could potentially be abused to delete subvolumes
* anywhere in the filesystem the user wouldn't be able
* to delete without an idmapped mount.
*/
   if (old_dir != dir && idmap != &nop_mnt_idmap) {
    ret = -EOPNOTSUPP;
    goto free_parent;
   }

   subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
      fs_info, vol_args2->subvolid);
   if (IS_ERR(subvol_name_ptr)) {
    ret = PTR_ERR(subvol_name_ptr);
    goto free_parent;
   }
   /* subvol_name_ptr is already nul terminated */
   subvol_name = (char *)kbasename(subvol_name_ptr);
  }
} else {
  vol_args = memdup_user(arg, sizeof(*vol_args));
  if (IS_ERR(vol_args))
   return PTR_ERR(vol_args);

  ret = btrfs_check_ioctl_vol_args_path(vol_args);
  if (ret < 0)
   goto out;

  subvol_name = vol_args->name;

  ret = mnt_want_write_file(file);
  if (ret)
   goto out;
}

/* The name must be a single component and must not be "..". */
if (strchr(subvol_name, '/') ||
     strcmp(subvol_name, "..") == 0) {
  ret = -EINVAL;
  goto free_subvol_name;
}

if (!S_ISDIR(dir->i_mode)) {
  ret = -ENOTDIR;
  goto free_subvol_name;
}

/* Lock the parent directory for the lookup and the deletion. */
ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (ret == -EINTR)
  goto free_subvol_name;
dentry = lookup_one(idmap, &QSTR(subvol_name), parent);
if (IS_ERR(dentry)) {
  ret = PTR_ERR(dentry);
  goto out_unlock_dir;
}

if (d_really_is_negative(dentry)) {
  ret = -ENOENT;
  goto out_dput;
}

inode = d_inode(dentry);
dest = BTRFS_I(inode)->root;
if (!capable(CAP_SYS_ADMIN)) {
  /*
* Regular user.  Only allow this with a special mount
* option, when the user has write+exec access to the
* subvol root, and when rmdir(2) would have been
* allowed.
*
* Note that this is _not_ a check that the subvol is
* empty or doesn't contain data that we wouldn't
* otherwise be able to delete.
*
* Users who want to delete empty subvols should try
* rmdir(2).
*/
  ret = -EPERM;
  if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
   goto out_dput;

  /*
* Do not allow deletion if the parent dir is the same
* as the dir to be deleted.  That means the ioctl
* must be called on the dentry referencing the root
* of the subvol, not a random directory contained
* within it.
*/
  ret = -EINVAL;
  if (root == dest)
   goto out_dput;

  ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
  if (ret)
   goto out_dput;
}

/* check if subvolume may be deleted by a user */
ret = btrfs_may_delete(idmap, dir, dentry, 1);
if (ret)
  goto out_dput;

/* Only the top-level inode of a subvolume can be destroyed this way. */
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
  ret = -EINVAL;
  goto out_dput;
}

btrfs_inode_lock(BTRFS_I(inode), 0);
ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
btrfs_inode_unlock(BTRFS_I(inode), 0);
if (!ret)
  d_delete_notify(dir, dentry);

/* Resources are released in the reverse order of their acquisition. */
out_dput:
dput(dentry);
out_unlock_dir:
btrfs_inode_unlock(BTRFS_I(dir), 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
if (destroy_parent)
  dput(parent);
out_drop_write:
mnt_drop_write_file(file);
out:
kfree(vol_args2);
kfree(vol_args);
return ret;
}

static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_defrag_range_args range = {0};
int ret;

ret = mnt_want_write_file(file);
if (ret)
  return ret;

if (btrfs_root_readonly(root)) {
  ret = -EROFS;
  goto out;
}

switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
  if (!capable(CAP_SYS_ADMIN)) {
   ret = -EPERM;
   goto out;
  }
  ret = btrfs_defrag_root(root);
  break;
case S_IFREG:
  /*
* Note that this does not check the file descriptor for write
* access. This prevents defragmenting executables that are
* running and allows defrag on files open in read-only mode.
*/
  if (!capable(CAP_SYS_ADMIN) &&
      inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) {
   ret = -EPERM;
   goto out;
  }

  /*
* Don't allow defrag on pre-content watched files, as it could
* populate the page cache with 0's via readahead.
*/
  if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
   ret = -EINVAL;
   goto out;
  }

  if (argp) {
   if (copy_from_user(&range, argp, sizeof(range))) {
    ret = -EFAULT;
    goto out;
   }
   if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
    ret = -EOPNOTSUPP;
    goto out;
   }
   if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
       (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
    ret = -EINVAL;
    goto out;
   }
   /* Compression or no-compression require to start the IO. */
   if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
       (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
    range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
    range.extent_thresh = (u32)-1;
   }
  } else {
   /* the rest are all set to zero by kzalloc */
   range.len = (u64)-1;
  }
  ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra,
     &range, BTRFS_OLDEST_GENERATION, 0);
  if (ret > 0)
   ret = 0;
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.23 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.