/* must be called with qgroup_ioctl_lock held */ staticstruct btrfs_qgroup *find_qgroup_rb(conststruct btrfs_fs_info *fs_info,
u64 qgroupid)
{ struct rb_node *node;
/* * Add qgroup to the filesystem's qgroup tree. * * Must be called with qgroup_lock held and @prealloc preallocated. * * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc.
*/ staticstruct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc,
u64 qgroupid)
{ struct rb_node *node;
/* Caller must have pre-allocated @prealloc. */
ASSERT(prealloc);
list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group);
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
}
while (!list_empty(&qgroup->members)) {
list = list_first_entry(&qgroup->members, struct btrfs_qgroup_list, next_member);
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
}
}
/* must be called with qgroup_lock held */ staticint del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{ struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
/* * Add relation specified by two qgroups. * * Must be called with qgroup_lock held, the ownership of @prealloc is * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors
*/ staticint __add_relation_rb(struct btrfs_qgroup_list *prealloc, struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{ if (!member || !parent) {
kfree(prealloc); return -ENOENT;
}
/* * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. * * Return: 0 on success * -ENOENT if one of the ids does not exist * <0 other errors
*/ staticint add_relation_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *prealloc,
u64 memberid, u64 parentid)
{ struct btrfs_qgroup *member; struct btrfs_qgroup *parent;
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
/* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded
*/ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{ struct btrfs_key key; struct btrfs_key found_key; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path = NULL; struct extent_buffer *l; int slot; int ret = 0;
u64 flags = 0;
u64 rescan_progress = 0;
if (!fs_info->quota_root) return 0;
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto out;
}
ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; /* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
/* * pass 1: read status, all qgroup infos and limits
*/
key.objectid = 0;
key.type = 0;
key.offset = 0;
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); if (ret) goto out;
while (1) { struct btrfs_qgroup *qgroup;
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { struct btrfs_qgroup_status_item *ptr;
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) {
ret = -ENOMEM; goto out;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * * We don't need to lock because this is only called * during mount before we start doing things like creating * subvolumes.
*/ if (btrfs_is_fstree(qgroup->qgroupid) &&
qgroup->qgroupid > tree_root->free_objectid) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, * as it will get checked on the next call to * btrfs_get_free_objectid.
*/
tree_root->free_objectid = qgroup->qgroupid + 1;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out;
switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr;
/* * pass 2: read all qgroup relations
*/
key.objectid = 0;
key.type = BTRFS_QGROUP_RELATION_KEY;
key.offset = 0;
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); if (ret) goto out; while (1) { struct btrfs_qgroup_list *list = NULL;
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.type != BTRFS_QGROUP_RELATION_KEY) goto next2;
if (found_key.objectid > found_key.offset) { /* parent <- member, not needed to build config */ /* FIXME should we omit the key completely? */ goto next2;
}
list = kzalloc(sizeof(*list), GFP_KERNEL); if (!list) {
ret = -ENOMEM; goto out;
}
ret = add_relation_rb(fs_info, list, found_key.objectid,
found_key.offset);
list = NULL; if (ret == -ENOENT) {
btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx",
found_key.objectid, found_key.offset);
ret = 0; /* ignore the error */
} if (ret) goto out;
next2:
ret = btrfs_next_item(quota_root, path); if (ret < 0) goto out; if (ret) break;
}
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags; if (ret >= 0) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
} else {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/* * Called in close_ctree() when quota is still enabled. This verifies we don't * leak some reserved space. * * Return false if no reserved space is left. * Return true if some reserved space is leaked.
*/ bool btrfs_check_quota_leak(conststruct btrfs_fs_info *fs_info)
{ struct rb_node *node; bool ret = false;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup * lock. And here we don't go post-order to provide a more user * friendly sorted result.
*/ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { struct btrfs_qgroup *qgroup; int i;
qgroup = rb_entry(node, struct btrfs_qgroup, node); for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { if (qgroup->rsv.values[i]) {
ret = true;
btrfs_warn(fs_info, "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
btrfs_qgroup_level(qgroup->qgroupid),
btrfs_qgroup_subvolid(qgroup->qgroupid),
i, qgroup->rsv.values[i]);
}
}
} return ret;
}
/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
 * first two are in single-threaded paths.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_qgroup *qgroup;

	/*
	 * btrfs_quota_disable() can be called concurrently with
	 * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
	 * lock.
	 */
	spin_lock(&fs_info->qgroup_lock);
	while ((node = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		rb_erase(node, &fs_info->qgroup_tree);
		__del_qgroup_rb(qgroup);
		/*
		 * Drop the spinlock around the sysfs removal and kfree():
		 * btrfs_sysfs_del_one_qgroup() may sleep, so it must not run
		 * with the qgroup lock held.
		 */
		spin_unlock(&fs_info->qgroup_lock);
		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
		kfree(qgroup);
		spin_lock(&fs_info->qgroup_lock);
	}
	spin_unlock(&fs_info->qgroup_lock);
}
/*
 * Delete every item in the quota tree, one leaf at a time, since the whole
 * tree is about to be removed.
 *
 * Called with qgroup_lock held.
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nritems = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Start from the very first possible key. */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (!nritems)
			break;
		/*
		 * Delete the items one leaf at a time, since the whole tree
		 * is going to be deleted anyway.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nritems);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
/* * We need to have subvol_sem write locked, to prevent races between * concurrent tasks trying to enable quotas, because we will unlock * and relock qgroup_ioctl_lock before setting fs_info->quota_root * and before setting BTRFS_FS_QUOTA_ENABLED.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
btrfs_err(fs_info, "qgroups are currently unsupported in extent tree v2"); return -EINVAL;
}
mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out;
ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out;
/* * Unlock qgroup_ioctl_lock before starting the transaction. This is to * avoid lock acquisition inversion problems (reported by lockdep) between * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we * start a transaction. * After we started the transaction lock qgroup_ioctl_lock again and * check if someone else created the quota root in the meanwhile. If so, * just return success and release the transaction handle. * * Also we don't need to worry about someone else calling * btrfs_sysfs_add_qgroups() after we unlock and getting an error because * that function returns 0 (success) when the sysfs entries already exist.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item * * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items * per subvolume. However those are not currently reserved since it * would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL; goto out;
}
if (fs_info->quota_root) goto out;
/* * initially create the quota tree
*/
quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) {
ret = PTR_ERR(quota_root);
btrfs_abort_transaction(trans, ret); goto out;
}
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret); goto out_free_root;
}
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
/* We should not have a stray @prealloc pointer. */
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
ret = add_qgroup_item(trans, quota_root,
found_key.offset); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
} if (ret > 0) { /* * Shouldn't happen, but in case it does we * don't need to do the btrfs_next_item, just * continue.
*/ continue;
}
}
ret = btrfs_next_item(tree_root, path); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
} if (ret) break;
}
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) {
ret = -ENOMEM; goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
fs_info->qgroup_enable_gen = trans->transid;
mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid * a deadlock with tasks concurrently doing other qgroup operations, such * adding/removing qgroups or adding/deleting qgroup relations for example, * because all qgroup operations first start or join a transaction and then * lock the qgroup_ioctl_lock mutex. * We are safe from a concurrent task trying to enable quotas, by calling * this function, since we are serialized by fs_info->subvol_sem.
*/
ret = btrfs_commit_transaction(trans);
trans = NULL;
mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path;
/* * Set quota enabled flag after committing the transaction, to avoid * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot * creation.
*/
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
/* Skip rescan for simple qgroups. */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) goto out_free_path;
ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
} else { /* * We have set both BTRFS_FS_QUOTA_ENABLED and * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with * -EINPROGRESS. That can happen because someone started the * rescan worker by calling quota rescan ioctl before we * attempted to initialize the rescan worker. Failure due to * quotas disabled in the meanwhile is not possible, because * we are holding a write lock on fs_info->subvol_sem, which * is also acquired when disabling quotas. * Ignore such error, and any other error would need to undo * everything we did in the transaction we just committed.
*/
ASSERT(ret == -EINPROGRESS);
ret = 0;
}
out_free_path:
btrfs_free_path(path);
out_free_root: if (ret)
btrfs_put_root(quota_root);
out: if (ret)
btrfs_sysfs_del_qgroups(fs_info);
mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans)
btrfs_end_transaction(trans); elseif (trans)
ret = btrfs_end_transaction(trans);
kfree(prealloc); return ret;
}
/* * It is possible to have outstanding ordered extents which reserved bytes * before we disabled. We need to fully flush delalloc, ordered extents, and a * commit to ensure that we don't leak such reservations, only to have them * come back if we re-enable. * * - enable simple quotas * - reserve space * - release it, store rsv_bytes in OE * - disable quotas * - enable simple quotas (qgroup rsv are all 0) * - OE finishes * - run delayed refs * - free rsv_bytes, resulting in miscounting or even underflow
*/ staticint flush_reservations(struct btrfs_fs_info *fs_info)
{ int ret;
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret;
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{ struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0;
/* * We need to have subvol_sem write locked to prevent races with * snapshot creation.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
/* * Relocation will mess with backrefs, so make sure we have the * cleaner_mutex held to protect us from relocate.
*/
lockdep_assert_held(&fs_info->cleaner_mutex);
mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out;
/* * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs * to lock that mutex while holding a transaction handle and the rescan * worker needs to commit a transaction.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may * deadlock with transaction by the qgroup rescan worker.
*/
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
/* * We have nothing held here and no trans handle, just return the error * if there is one and set back the quota enabled bit since we didn't * actually disable quotas.
*/
ret = flush_reservations(fs_info); if (ret) {
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); return ret;
}
/* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in * btrfs_clean_quota_tree but this is not done. * * Also, we must always start a transaction without holding the mutex * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out;
}
/* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. * * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to * parent. * Or when child tries to release reservation space, parent will underflow its * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock.
*/ staticint __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign)
{ struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out;
/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 for need to full rescan
 * and mark INCONSISTENT flag.
 * Return < 0 for other error.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   u64 src, u64 dst, int sign)
{
	struct btrfs_qgroup *src_qgroup;
	int ret = 1;

	src_qgroup = find_qgroup_rb(fs_info, src);
	/* Quick path is only valid when all refs of @src are exclusive. */
	if (src_qgroup && src_qgroup->excl == src_qgroup->rfer) {
		ret = __qgroup_excl_accounting(fs_info, dst, src_qgroup, sign);
		if (ret >= 0)
			ret = 0;
	}
	/* Either a hard error or a full rescan is needed. */
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
/* * Add relation between @src and @dst qgroup. The @prealloc is allocated by the * callers and transferred here (either used or freed on error).
*/ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, struct btrfs_qgroup_list *prealloc)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; int ret = 0;
ASSERT(prealloc);
/* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
kfree(prealloc); return -EINVAL;
}
mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) {
ret = -ENOTCONN; goto out;
}
member = find_qgroup_rb(fs_info, src);
parent = find_qgroup_rb(fs_info, dst); if (!member || !parent) {
ret = -EINVAL; goto out;
}
/* check if such qgroup relation exist firstly */
list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) {
ret = -EEXIST; goto out;
}
}
ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out;
ret = add_qgroup_relation_item(trans, dst, src); if (ret) {
del_qgroup_relation_item(trans, src, dst); goto out;
}
/*
 * Remove the relation between @src and @dst, both from the on-disk relation
 * items and from the in-memory rb tree (when the in-memory pair exists).
 *
 * Caller must hold fs_info->qgroup_ioctl_lock.
 */
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *entry;
	bool found = false;
	int ret = 0;
	int ret2;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * The parent/member pair doesn't exist, then try to delete the dead
	 * relation items only.
	 */
	if (!member || !parent)
		goto delete_item;

	/* Check whether such a qgroup relation exists first. */
	list_for_each_entry(entry, &member->groups, next_group) {
		if (entry->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	/* Both directions of the relation are stored as separate items. */
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	return ret;
}
/*
 * Public wrapper around __del_qgroup_relation() that takes the
 * qgroup_ioctl_lock around the relation removal.
 */
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}
/*
 * Create a new qgroup with the given id: insert the QGROUP_INFO/LIMIT items
 * into the quota tree and add the qgroup to the in-memory rb tree.
 *
 * Return 0 on success, -ENOTCONN if quotas are not enabled, -EEXIST if the
 * qgroup already exists, or another negative errno on failure.
 */
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *prealloc = NULL;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	/*
	 * BUG FIX: the previous code passed @qgroup (still NULL after the
	 * -EEXIST check above) to btrfs_sysfs_add_one_qgroup() and never
	 * consumed @prealloc.  Insert the preallocated qgroup into the rb
	 * tree first; add_qgroup_rb() takes ownership of @prealloc, so clear
	 * our pointer to avoid the kfree() below freeing it.
	 */
	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
	prealloc = NULL;

	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	kfree(prealloc);
	return ret;
}
/*
 * Return 0 if we can not delete the qgroup (not empty or has children etc).
 * Return >0 if we can delete the qgroup.
 * Return <0 for other errors during tree search.
 */
static int can_delete_qgroup(struct btrfs_fs_info *fs_info,
			     struct btrfs_qgroup *qgroup)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	int ret;

	/*
	 * Squota would never be inconsistent, but there can still be case
	 * where a dropped subvolume still has qgroup numbers, and squota
	 * relies on such qgroup for future accounting.
	 *
	 * So for squota, do not allow dropping any non-zero qgroup.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
	    (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr ||
	     qgroup->rfer_cmpr))
		return 0;

	/* For higher level qgroup, we can only delete it if it has no child. */
	if (btrfs_qgroup_level(qgroup->qgroupid))
		return list_empty(&qgroup->members) ? 1 : 0;

	/*
	 * For level-0 qgroups, we can only delete it if it has no subvolume
	 * for it.
	 * This means even a subvolume is unlinked but not yet fully dropped,
	 * we can not delete the qgroup.
	 */
	key.objectid = qgroup->qgroupid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = -1ULL;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
	btrfs_free_path(path);
	/*
	 * The @ret from btrfs_find_root() exactly matches our definition for
	 * the return value, thus can be returned directly.
	 */
	return ret;
}
/*
 * Remove the qgroup with the given id: delete its on-disk items and all of
 * its relations, then drop it from the in-memory rb tree and sysfs.
 *
 * Return -ENOTCONN if quotas are not enabled, -ENOENT if the qgroup does not
 * exist, -EBUSY if it can not be deleted (still in use or has children).
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *entry;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	ret = can_delete_qgroup(fs_info, qgroup);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		ret = -EBUSY;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	/* -ENOENT is tolerated: the on-disk item may already be gone. */
	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	/* Tear down every relation this qgroup still participates in. */
	while (!list_empty(&qgroup->groups)) {
		entry = list_first_entry(&qgroup->groups,
					 struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    entry->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	/*
	 * Warn on reserved space.  The subvolume should have no child nor
	 * corresponding subvolume.
	 * Thus its reserved space should all be zero, no matter if qgroup
	 * is consistent or the mode.
	 */
	if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
		DEBUG_WARN();
		btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
			      btrfs_qgroup_level(qgroup->qgroupid),
			      btrfs_qgroup_subvolid(qgroup->qgroupid),
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
	}
	/*
	 * The same for rfer/excl numbers, but that's only if our qgroup is
	 * consistent and if it's in regular qgroup mode.
	 * For simple mode it's not as accurate thus we can hit non-zero values
	 * very frequently.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
		if (qgroup->rfer || qgroup->excl ||
		    qgroup->rfer_cmpr || qgroup->excl_cmpr) {
			DEBUG_WARN();
			qgroup_mark_inconsistent(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
				btrfs_qgroup_level(qgroup->qgroupid),
				btrfs_qgroup_subvolid(qgroup->qgroupid),
				qgroup->rfer, qgroup->rfer_cmpr,
				qgroup->excl, qgroup->excl_cmpr);
		}
	}
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
/*
 * Remove the level-0 qgroup belonging to a fully dropped subvolume.
 *
 * Returns 0 when there is nothing to do (quotas off, not a subvolume id, or
 * the qgroup is gone/still needed), otherwise a negative errno.
 */
int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
{
	struct btrfs_trans_handle *trans;
	int ret;

	if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) ||
	    !fs_info->quota_root)
		return 0;

	/*
	 * Commit current transaction to make sure all the rfer/excl numbers
	 * get updated.
	 */
	ret = btrfs_commit_current_transaction(fs_info->quota_root);
	if (ret < 0)
		return ret;

	/* Start new trans to delete the qgroup info and limit items. */
	trans = btrfs_start_transaction(fs_info->quota_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_remove_qgroup(trans, subvolid);
	btrfs_end_transaction(trans);
	/*
	 * It's squota and the subvolume still has numbers needed for future
	 * accounting, in this case we can not delete it.  Just skip it.
	 *
	 * Or the qgroup is already removed by a qgroup rescan. For both cases
	 * we're safe to ignore them.
	 */
	if (ret == -EBUSY || ret == -ENOENT)
		ret = 0;
	return ret;
}
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. * To meet this requirement, we treat the -1 as a special value * which tell kernel to clear the limit on this qgroup.
*/ const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) {
ret = -ENOTCONN; goto out;
}
qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) {
ret = -ENOENT; goto out;
}
/* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. * * No lock version, caller must acquire delayed ref lock and allocated memory, * then call btrfs_qgroup_trace_extent_post() after exiting lock context. * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. * Return <0 for insertion failure, caller can free @record safely.
*/ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record,
u64 bytenr)
{ struct btrfs_qgroup_extent_record *existing, *ret; constunsignedlong index = (bytenr >> fs_info->sectorsize_bits);
if (!btrfs_qgroup_full_accounting(fs_info)) return 1;
#if BITS_PER_LONG == 32 if (bytenr >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info, "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
bytenr);
btrfs_err_32bit_limit(fs_info); return -EOVERFLOW;
} #endif
ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) {
qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret)); return xa_err(ret);
}
return 0;
}
/* * Post handler after qgroup_trace_extent_nolock(). * * NOTE: Current qgroup does the expensive backref walk at transaction * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming * new transaction. * This is designed to allow btrfs_find_all_roots() to get correct new_roots * result. * * However for old_roots there is no need to do backref walk at that time, * since we search commit roots to walk backref and result will always be * correct. * * Due to the nature of no lock version, we can't do backref there. * So we must call btrfs_qgroup_trace_extent_post() after exiting * spinlock context. * * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result * using current root, then we can move all expensive backref walk out of * transaction committing, but not now as qgroup accounting will be wrong again.
*/ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_backref_walk_ctx ctx = {
.bytenr = bytenr,
.fs_info = fs_info,
}; int ret;
if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a * transaction handle. Often we are called when adding a data delayed * reference from btrfs_truncate_inode_items() (truncating or unlinking), * in which case we will be holding a write lock on extent buffer from a * subvolume tree. In this case we can't allow btrfs_find_all_roots() to * acquire fs_info->commit_root_sem, because that is a higher level lock * that must be acquired before locking any extent buffers. * * So we want btrfs_find_all_roots() to not acquire the commit_root_sem * but we can't pass it a non-NULL transaction handle, because otherwise * it would not use commit roots and would lock extent buffers, causing * a deadlock if it ends up trying to read lock the same extent buffer * that was previously write locked at btrfs_truncate_inode_items(). * * So pass a NULL transaction handle to btrfs_find_all_roots() and * explicitly tell it to not acquire the commit_root_sem - if we are * holding a transaction handle we don't need its protection.
*/
ASSERT(trans != NULL);
if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0;
ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) {
qgroup_mark_inconsistent(fs_info, "error accounting new delayed refs extent: %d", ret); return 0;
}
/* * Here we don't need to get the lock of * trans->transaction->delayed_refs, since inserted qrecord won't * be deleted, only qrecord->node may be modified (new qrecord insert) * * So modifying qrecord->old_roots is safe here
*/
qrecord->old_roots = ctx.roots; return 0;
}
/* * Inform qgroup to trace one dirty extent, specified by @bytenr and * @num_bytes. * So qgroup can account it at commit trans time. * * Better encapsulated version, with memory allocation and backref walk for * commit roots. * So this can sleep. * * Return 0 if the operation is done. * Return <0 for error, like memory allocation failure or invalid parameter * (NULL trans)
*/ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; constunsignedlong index = (bytenr >> fs_info->sectorsize_bits); int ret;
if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0;
record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM;
if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
kfree(record); return -ENOMEM;
}
record->num_bytes = num_bytes;
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { /* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, index);
kfree(record); return 0;
} return btrfs_qgroup_trace_extent_post(trans, record, bytenr);
}
/* * Inform qgroup to trace all leaf items of data * * Return 0 for success * Return <0 for error(ENOMEM)
*/ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb)
{ struct btrfs_fs_info *fs_info = trans->fs_info; int nr = btrfs_header_nritems(eb); int i, extent_type, ret; struct btrfs_key key; struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
/* We can be called directly from walk_up_proc() */ if (!btrfs_qgroup_full_accounting(fs_info)) return 0;
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY) continue;
fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); /* filter out non qgroup-accountable extents */
extent_type = btrfs_file_extent_type(eb, fi);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) continue;
bytenr = btrfs_file_extent_disk_bytenr(eb, fi); if (!bytenr) continue;
ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret;
}
cond_resched(); return 0;
}
/* * Walk up the tree from the bottom, freeing leaves and any interior * nodes which have had all slots visited. If a node (leaf or * interior) is freed, the node above it will have it's slot * incremented. The root node will never be freed. * * At the end of this function, we should have a path which has all * slots incremented to the next position for a search. If we need to * read a new node it will be NULL and the node above it will have the * correct slot selected for a later read. * * If we increment the root nodes slot counter past the number of * elements, 1 is returned to signal completion of the search.
*/ staticint adjust_slots_upwards(struct btrfs_path *path, int root_level)
{ int level = 0; int nr, slot; struct extent_buffer *eb;
if (root_level == 0) return 1;
while (level <= root_level) {
eb = path->nodes[level];
nr = btrfs_header_nritems(eb);
path->slots[level]++;
slot = path->slots[level]; if (slot >= nr || level == 0) { /* * Don't free the root - we will detect this * condition after our loop and return a * positive value for caller to stop walking the tree.
*/ if (level != root_level) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
free_extent_buffer(eb);
path->nodes[level] = NULL;
path->slots[level] = 0;
}
} else { /* * We have a valid slot to walk back down * from. Stop here so caller can process these * new nodes.
*/ break;
}
level++;
}
eb = path->nodes[root_level]; if (path->slots[root_level] >= btrfs_header_nritems(eb)) return 1;
return 0;
}
/* * Helper function to trace a subtree tree block swap. * * The swap will happen in highest tree block, but there may be a lot of * tree blocks involved. * * For example: * OO = Old tree blocks * NN = New tree blocks allocated during balance * * File tree (257) Reloc tree for 257 * L2 OO NN * / \ / \ * L1 OO OO (a) OO NN (a) * / \ / \ / \ / \ * L0 OO OO OO OO OO OO NN NN * (b) (c) (b) (c) * * When calling qgroup_trace_extent_swap(), we will pass: * @src_eb = OO(a) * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] * @dst_level = 0 * @root_level = 1 * * In that case, qgroup_trace_extent_swap() will search from OO(a) to * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. * * The main work of qgroup_trace_extent_swap() can be split into 3 parts: * * 1) Tree search from @src_eb * It should acts as a simplified btrfs_search_slot(). * The key for search can be extracted from @dst_path->nodes[dst_level] * (first key). * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree.
*/ staticint qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int dst_level, int root_level, bool trace_leaf)
{ struct btrfs_key key; struct btrfs_path *src_path; struct btrfs_fs_info *fs_info = trans->fs_info;
u32 nodesize = fs_info->nodesize; int cur_level = root_level; int ret;
/*
 * NOTE(review): this chunk appears truncated — @src_path is declared but
 * never allocated or populated before being dereferenced and freed below,
 * and @key/@cur_level are unused here. Part 1 of the algorithm described
 * above (the simplified search from @src_eb that builds src_path) seems to
 * be missing from this excerpt; confirm against the full file before
 * changing anything in this function.
 */
/* * Now both @dst_path and @src_path have been populated, record the tree * blocks for qgroup accounting.
*/
/* Trace the old (source) tree block at @dst_level. */
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
nodesize); if (ret < 0) goto out;
/* Trace the new (destination) tree block at @dst_level. */
ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
nodesize); if (ret < 0) goto out;
/* Record leaf file extents */ if (dst_level == 0 && trace_leaf) {
ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) goto out;
ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
}
out:
btrfs_free_path(src_path); return ret;
}
/*
 * NOTE: the original file continues beyond this point but was truncated
 * during extraction ("maximum size reached"); non-source residue from the
 * extraction tool has been removed and the dangling comment closed.
 */