/* must be called with qgroup_ioctl_lock held */ staticstruct btrfs_qgroup *find_qgroup_rb(conststruct btrfs_fs_info *fs_info,
u64 qgroupid)
{ struct rb_node *node;
/* * Add qgroup to the filesystem's qgroup tree. * * Must be called with qgroup_lock held and @prealloc preallocated. * * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc.
*/ staticstruct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc,
u64 qgroupid)
{ struct rb_node *node;
/* Caller must have pre-allocated @prealloc. */
ASSERT(prealloc);
list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group);
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
}
while (!list_empty(&qgroup->members)) {
list = list_first_entry(&qgroup->members, struct btrfs_qgroup_list, next_member);
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
}
}
/* must be called with qgroup_lock held */ staticint del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{ struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
/* * Add relation specified by two qgroups. * * Must be called with qgroup_lock held, the ownership of @prealloc is * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors
*/ staticint __add_relation_rb(struct btrfs_qgroup_list *prealloc, struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{ if (!member || !parent) {
kfree(prealloc); return -ENOENT;
}
/* * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. * * Return: 0 on success * -ENOENT if one of the ids does not exist * <0 other errors
*/ staticint add_relation_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *prealloc,
u64 memberid, u64 parentid)
{ struct btrfs_qgroup *member; struct btrfs_qgroup *parent;
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
/* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded
 *
 * Returns 0 right away when fs_info->quota_root is not set.  Otherwise the
 * result is 0 on success or a negative errno on failure; the cleanup done on
 * each path is described at the "out" label.
*/ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{ struct btrfs_key key; struct btrfs_key found_key; struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_path *path = NULL; struct extent_buffer *l; int slot; int ret = 0;
u64 flags = 0;
u64 rescan_progress = 0;
if (!fs_info->quota_root) return 0;
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto out;
}
ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; /* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
/* * pass 1: read status, all qgroup infos and limits
*/
/* An all-zero key makes the search start at the first item of the quota tree. */
key.objectid = 0;
key.type = 0;
key.offset = 0;
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); if (ret) goto out;
while (1) { struct btrfs_qgroup *qgroup;
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { struct btrfs_qgroup_status_item *ptr;
/*
 * NOTE(review): prealloc and tree_root are used below but are not declared
 * in the visible part of this function, and the braces of this loop do not
 * balance - confirm against the complete source before relying on the
 * statement order shown here.
 */
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) {
ret = -ENOMEM; goto out;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. * * We don't need to lock because this is only called * during mount before we start doing things like creating * subvolumes.
*/ if (btrfs_is_fstree(qgroup->qgroupid) &&
qgroup->qgroupid > tree_root->free_objectid) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, * as it will get checked on the next call to * btrfs_get_free_objectid.
*/
tree_root->free_objectid = qgroup->qgroupid + 1;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) goto out;
switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr;
/* * pass 2: read all qgroup relations
*/
key.objectid = 0;
key.type = BTRFS_QGROUP_RELATION_KEY;
key.offset = 0;
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); if (ret) goto out; while (1) { struct btrfs_qgroup_list *list = NULL;
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.type != BTRFS_QGROUP_RELATION_KEY) goto next2;
if (found_key.objectid > found_key.offset) { /* parent <- member, not needed to build config */ /* FIXME should we omit the key completely? */ goto next2;
}
list = kzalloc(sizeof(*list), GFP_KERNEL); if (!list) {
ret = -ENOMEM; goto out;
}
/*
 * The local pointer is cleared right after the call and never freed here,
 * whatever add_relation_rb() returns.
 */
ret = add_relation_rb(fs_info, list, found_key.objectid,
found_key.offset);
list = NULL; if (ret == -ENOENT) {
btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx",
found_key.objectid, found_key.offset);
ret = 0; /* ignore the error */
} if (ret) goto out;
next2:
ret = btrfs_next_item(quota_root, path); if (ret < 0) goto out; if (ret) break;
}
/*
 * Common exit.  The path is always freed and the local flags value is OR-ed
 * into fs_info->qgroup_flags.  With ret >= 0 the BTRFS_FS_QUOTA_ENABLED bit is
 * set when the status flags have BTRFS_QGROUP_STATUS_FLAG_ON, and
 * qgroup_rescan_init() is called when BTRFS_QGROUP_STATUS_FLAG_RESCAN is set.
 * With ret < 0 the rescan flag is cleared and the sysfs qgroup entries are
 * removed.
 */
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags; if (ret >= 0) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
} else {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
/* Positive values (e.g. "no more items") are folded into success. */
return ret < 0 ? ret : 0;
}
/* * Called in close_ctree() when quota is still enabled. This verifies we don't * leak some reserved space. * * Return false if no reserved space is left. * Return true if some reserved space is leaked.
*/ bool btrfs_check_quota_leak(conststruct btrfs_fs_info *fs_info)
{ struct rb_node *node; bool ret = false;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup * lock. And here we don't go post-order to provide a more user * friendly sorted result.
*/ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { struct btrfs_qgroup *qgroup; int i;
qgroup = rb_entry(node, struct btrfs_qgroup, node); for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { if (qgroup->rsv.values[i]) {
ret = true;
btrfs_warn(fs_info, "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
btrfs_qgroup_level(qgroup->qgroupid),
btrfs_qgroup_subvolid(qgroup->qgroupid),
i, qgroup->rsv.values[i]);
}
}
} return ret;
}
/* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.
*/ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{ struct rb_node *n; struct btrfs_qgroup *qgroup;
/* * btrfs_quota_disable() can be called concurrently with * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the * lock.
*/
spin_lock(&fs_info->qgroup_lock); while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
spin_unlock(&fs_info->qgroup_lock);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
spin_lock(&fs_info->qgroup_lock);
}
spin_unlock(&fs_info->qgroup_lock);
/*
 * Delete every item of the given quota tree.
 *
 * called with qgroup_lock held
 *
 * The tree is searched from the smallest possible key with deletion intent;
 * each iteration removes all items of the leaf the search lands on, until a
 * search lands on an empty leaf.
 *
 * Return 0 on success, -ENOMEM if the path cannot be allocated, or the error
 * returned by btrfs_search_slot() / btrfs_del_items().
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = 0;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;

		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;

		/*
		 * delete the leaf one by one
		 * since the whole tree is going
		 * to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			goto out;

		btrfs_release_path(path);
	}
	/* A positive "key not found" result from the last search is not an error. */
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
/* * We need to have subvol_sem write locked, to prevent races between * concurrent tasks trying to enable quotas, because we will unlock * and relock qgroup_ioctl_lock before setting fs_info->quota_root * and before setting BTRFS_FS_QUOTA_ENABLED.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
btrfs_err(fs_info, "qgroups are currently unsupported in extent tree v2"); return -EINVAL;
}
mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out;
ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out;
/* * Unlock qgroup_ioctl_lock before starting the transaction. This is to * avoid lock acquisition inversion problems (reported by lockdep) between * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we * start a transaction. * After we started the transaction lock qgroup_ioctl_lock again and * check if someone else created the quota root in the meanwhile. If so, * just return success and release the transaction handle. * * Also we don't need to worry about someone else calling * btrfs_sysfs_add_qgroups() after we unlock and getting an error because * that function returns 0 (success) when the sysfs entries already exist.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item * * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items * per subvolume. However those are not currently reserved since it * would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL; goto out;
}
if (fs_info->quota_root) goto out;
/* * initially create the quota tree
*/
quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) {
ret = PTR_ERR(quota_root);
btrfs_abort_transaction(trans, ret); goto out;
}
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret); goto out_free_root;
}
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
/* We should not have a stray @prealloc pointer. */
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
ret = add_qgroup_item(trans, quota_root,
found_key.offset); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
} if (ret > 0) { /* * Shouldn't happen, but in case it does we * don't need to do the btrfs_next_item, just * continue.
*/ continue;
}
}
ret = btrfs_next_item(tree_root, path); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
} if (ret) break;
}
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); if (ret) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) {
ret = -ENOMEM; goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) {
btrfs_abort_transaction(trans, ret); goto out_free_path;
}
fs_info->qgroup_enable_gen = trans->transid;
mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid * a deadlock with tasks concurrently doing other qgroup operations, such * adding/removing qgroups or adding/deleting qgroup relations for example, * because all qgroup operations first start or join a transaction and then * lock the qgroup_ioctl_lock mutex. * We are safe from a concurrent task trying to enable quotas, by calling * this function, since we are serialized by fs_info->subvol_sem.
*/
ret = btrfs_commit_transaction(trans);
trans = NULL;
mutex_lock(&fs_info->qgroup_ioctl_lock); if (ret) goto out_free_path;
/* * Set quota enabled flag after committing the transaction, to avoid * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot * creation.
*/
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
/* Skip rescan for simple qgroups. */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) goto out_free_path;
ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
} else { /* * We have set both BTRFS_FS_QUOTA_ENABLED and * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with * -EINPROGRESS. That can happen because someone started the * rescan worker by calling quota rescan ioctl before we * attempted to initialize the rescan worker. Failure due to * quotas disabled in the meanwhile is not possible, because * we are holding a write lock on fs_info->subvol_sem, which * is also acquired when disabling quotas. * Ignore such error, and any other error would need to undo * everything we did in the transaction we just committed.
*/
ASSERT(ret == -EINPROGRESS);
ret = 0;
}
out_free_path:
btrfs_free_path(path);
out_free_root: if (ret)
btrfs_put_root(quota_root);
out: if (ret)
btrfs_sysfs_del_qgroups(fs_info);
mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans)
btrfs_end_transaction(trans); elseif (trans)
ret = btrfs_end_transaction(trans);
kfree(prealloc); return ret;
}
/* * It is possible to have outstanding ordered extents which reserved bytes * before we disabled. We need to fully flush delalloc, ordered extents, and a * commit to ensure that we don't leak such reservations, only to have them * come back if we re-enable. * * - enable simple quotas * - reserve space * - release it, store rsv_bytes in OE * - disable quotas * - enable simple quotas (qgroup rsv are all 0) * - OE finishes * - run delayed refs * - free rsv_bytes, resulting in miscounting or even underflow
*/ staticint flush_reservations(struct btrfs_fs_info *fs_info)
{ int ret;
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret;
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{ struct btrfs_root *quota_root = NULL; struct btrfs_trans_handle *trans = NULL; int ret = 0;
/* * We need to have subvol_sem write locked to prevent races with * snapshot creation.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
/* * Relocation will mess with backrefs, so make sure we have the * cleaner_mutex held to protect us from relocate.
*/
lockdep_assert_held(&fs_info->cleaner_mutex);
mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out;
/* * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs * to lock that mutex while holding a transaction handle and the rescan * worker needs to commit a transaction.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/* * Request qgroup rescan worker to complete and wait for it. This wait * must be done before transaction start for quota disable since it may * deadlock with transaction by the qgroup rescan worker.
*/
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
/* * We have nothing held here and no trans handle, just return the error * if there is one and set back the quota enabled bit since we didn't * actually disable quotas.
*/
ret = flush_reservations(fs_info); if (ret) {
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); return ret;
}
/* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in * btrfs_clean_quota_tree but this is not done. * * Also, we must always start a transaction without holding the mutex * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out;
}
/* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. * * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to * parent. * Or when child tries to release reservation space, parent will underflow its * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock.
*/ staticint __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign)
{ struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out;
/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 for need to full rescan
 * and mark INCONSISTENT flag.
 * Return < 0 for other error.  The INCONSISTENT flag is set for every
 * non-zero return value, including the negative ones.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   u64 src, u64 dst, int sign)
{
	struct btrfs_qgroup *qgroup;
	/* Default to "full rescan needed" until the quick path succeeds. */
	int ret = 1;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;

	/* Only a qgroup whose referenced bytes are all exclusive qualifies. */
	if (qgroup->excl == qgroup->rfer) {
		ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
		if (ret < 0)
			goto out;
		ret = 0;
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
/* * Add relation between @src and @dst qgroup. The @prealloc is allocated by the * callers and transferred here (either used or freed on error).
*/ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst, struct btrfs_qgroup_list *prealloc)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; int ret = 0;
ASSERT(prealloc);
/* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
kfree(prealloc); return -EINVAL;
}
mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) {
ret = -ENOTCONN; goto out;
}
member = find_qgroup_rb(fs_info, src);
parent = find_qgroup_rb(fs_info, dst); if (!member || !parent) {
ret = -EINVAL; goto out;
}
/* check if such qgroup relation exist firstly */
list_for_each_entry(list, &member->groups, next_group) { if (list->group == parent) {
ret = -EEXIST; goto out;
}
}
ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out;
ret = add_qgroup_relation_item(trans, dst, src); if (ret) {
del_qgroup_relation_item(trans, src, dst); goto out;
}
/*
 * Delete the relation between member qgroup @src and parent qgroup @dst.
 *
 * Both on-disk relation items (src->dst and dst->src) are removed; a missing
 * item (-ENOENT) on one side is tolerated as long as the other deletion
 * succeeded.  When the relation is also present in memory it is dropped
 * from the in-memory lists and the accounting is updated through
 * quick_update_accounting().
 *
 * This function takes no mutex itself; the wrapper and btrfs_remove_qgroup()
 * call it with fs_info->qgroup_ioctl_lock held.
 *
 * Return: -ENOTCONN if there is no quota root,
 *         a negative errno if deleting either relation item failed,
 *         -ENOENT if neither relation item existed,
 *         otherwise 0, or the value of quick_update_accounting() when the
 *         in-memory relation was found (which may be > 0).
 */
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	bool found = false;
	int ret = 0;
	int ret2;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * The parent/member pair doesn't exist, then try to delete the dead
	 * relation items only.
	 */
	if (!member || !parent)
		goto delete_item;

	/* check if such qgroup relation exist firstly */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT) {
		/*
		 * Report the hard failure of the second deletion instead of
		 * returning the (possibly successful) result of the first.
		 */
		ret = ret2;
		goto out;
	}

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	return ret;
}
/*
 * Locked wrapper around __del_qgroup_relation(): takes
 * fs_info->qgroup_ioctl_lock for the duration of the call and hands back
 * whatever the helper returned.
 */
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *info = trans->fs_info;
	int result;

	mutex_lock(&info->qgroup_ioctl_lock);
	result = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&info->qgroup_ioctl_lock);

	return result;
}
/*
 * Create the qgroup @qgroupid: add its item(s) to the quota tree, insert it
 * into the in-memory qgroup tree and register it in sysfs.
 *
 * Takes fs_info->qgroup_ioctl_lock for the whole operation.
 *
 * Return: 0 on success,
 *         -ENOTCONN if there is no quota root,
 *         -EEXIST if the qgroup already exists in memory,
 *         -ENOMEM if the in-memory qgroup cannot be allocated,
 *         or the error from add_qgroup_item() / btrfs_sysfs_add_one_qgroup().
 */
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *prealloc = NULL;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	/* Allocate before touching the tree so insertion itself cannot fail. */
	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	/*
	 * Insert the new qgroup into the in-memory tree.  add_qgroup_rb() must
	 * be called with qgroup_lock held and takes over @prealloc, so the
	 * local pointer is cleared to keep the kfree() below from touching it.
	 */
	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
	prealloc = NULL;

	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	/* Only non-NULL when we bailed out before the ownership transfer. */
	kfree(prealloc);
	return ret;
}
/*
 * Decide whether @qgroup may be deleted.
 *
 * Return 0 if we can not delete the qgroup (not empty or has children etc).
 * Return >0 if we can delete the qgroup.
 * Return <0 for other errors during tree search.
 */
static int can_delete_qgroup(struct btrfs_fs_info *fs_info,
			     struct btrfs_qgroup *qgroup)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	int ret;

	/*
	 * Squota would never be inconsistent, but there can still be case
	 * where a dropped subvolume still has qgroup numbers, and squota
	 * relies on such qgroup for future accounting.
	 *
	 * So for squota, do not allow dropping any non-zero qgroup.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
	    (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
		return 0;

	/* For higher level qgroup, we can only delete it if it has no child. */
	if (btrfs_qgroup_level(qgroup->qgroupid)) {
		if (!list_empty(&qgroup->members))
			return 0;
		return 1;
	}

	/*
	 * For level-0 qgroups, we can only delete it if it has no subvolume
	 * for it.
	 * This means even a subvolume is unlinked but not yet fully dropped,
	 * we can not delete the qgroup.
	 */
	key.objectid = qgroup->qgroupid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = -1ULL;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
	btrfs_free_path(path);
	/*
	 * The @ret from btrfs_find_root() exactly matches our definition for
	 * the return value, thus can be returned directly.
	 */
	return ret;
}
/*
 * Remove the qgroup @qgroupid.
 *
 * The whole operation runs under fs_info->qgroup_ioctl_lock.  After the
 * deletability checks, the qgroup item is deleted, every relation to a
 * parent qgroup is dropped, the qgroup is taken out of the in-memory tree
 * under qgroup_lock, and finally its sysfs entry and memory are released.
 *
 * Return: 0 on success,
 *         -ENOTCONN if there is no quota root,
 *         -ENOENT if the qgroup does not exist in memory,
 *         -EBUSY if the qgroup must not be deleted or still has members,
 *         or another non-zero value propagated from the helpers.
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *relation;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	/* Negative is an error, zero means "may not be deleted". */
	ret = can_delete_qgroup(fs_info, qgroup);
	if (ret <= 0) {
		if (ret == 0)
			ret = -EBUSY;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	/* Detach from every parent; each pass is expected to shrink the list. */
	while (!list_empty(&qgroup->groups)) {
		relation = list_first_entry(&qgroup->groups,
					    struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    relation->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	/*
	 * Warn on reserved space.  The qgroup should have no child nor a
	 * corresponding subvolume.
	 * Thus its reserved space should all be zero, no matter if qgroup
	 * is consistent or the mode.
	 */
	if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
		DEBUG_WARN();
		btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
			      btrfs_qgroup_level(qgroup->qgroupid),
			      btrfs_qgroup_subvolid(qgroup->qgroupid),
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
	}
	/*
	 * The same for rfer/excl numbers, but that's only if our qgroup is
	 * consistent and if it's in regular qgroup mode.
	 * For simple mode it's not as accurate thus we can hit non-zero values
	 * very frequently.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) &&
	    (qgroup->rfer || qgroup->excl ||
	     qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
		DEBUG_WARN();
		qgroup_mark_inconsistent(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
					 btrfs_qgroup_level(qgroup->qgroupid),
					 btrfs_qgroup_subvolid(qgroup->qgroupid),
					 qgroup->rfer, qgroup->rfer_cmpr,
					 qgroup->excl, qgroup->excl_cmpr);
	}
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
/*
 * Remove the qgroup that belongs to the dropped subvolume @subvolid.
 *
 * Does nothing (returns 0) when @subvolid is not an fs tree id, qgroups are
 * not enabled, or there is no quota root.  Otherwise the current transaction
 * is committed, a new one is started and the qgroup is removed inside it.
 *
 * Return 0 on success or when the removal reports -EBUSY / -ENOENT, a
 * negative errno otherwise.
 */
int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
{
	struct btrfs_trans_handle *trans;
	int err;

	if (!btrfs_is_fstree(subvolid))
		return 0;
	if (!btrfs_qgroup_enabled(fs_info))
		return 0;
	if (!fs_info->quota_root)
		return 0;

	/*
	 * Commit current transaction to make sure all the rfer/excl numbers
	 * get updated.
	 */
	err = btrfs_commit_current_transaction(fs_info->quota_root);
	if (err < 0)
		return err;

	/* Start new trans to delete the qgroup info and limit items. */
	trans = btrfs_start_transaction(fs_info->quota_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_remove_qgroup(trans, subvolid);
	btrfs_end_transaction(trans);

	/*
	 * It's squota and the subvolume still has numbers needed for future
	 * accounting, in this case we can not delete it.  Just skip it.
	 *
	 * Or the qgroup is already removed by a qgroup rescan.  For both cases
	 * we're safe to ignore them.
	 */
	return (err == -EBUSY || err == -ENOENT) ? 0 : err;
}
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. * To meet this requirement, we treat the -1 as a special value * which tell kernel to clear the limit on this qgroup.
*/ const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) {
ret = -ENOTCONN; goto out;
}
qgroup = find_qgroup_rb(fs_info, qgroupid); if (!qgroup) {
ret = -ENOENT; goto out;
}
/*
 * Inform qgroup to trace one dirty extent, its info is recorded in @record.
 * So qgroup can account it at transaction committing time.
 *
 * No lock version, caller must acquire delayed ref lock and allocated memory,
 * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
 *
 * The record is stored in @delayed_refs->dirty_extents, indexed by @bytenr
 * shifted right by fs_info->sectorsize_bits.  The lookup and the store are
 * both done under the xarray's own lock.
 *
 * Return 0 for success insert
 * Return >0 for existing record, caller can free @record safely.
 * Return <0 for insertion failure, caller can free @record safely.
 */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				     struct btrfs_delayed_ref_root *delayed_refs,
				     struct btrfs_qgroup_extent_record *record,
				     u64 bytenr)
{
	struct btrfs_qgroup_extent_record *existing, *ret;
	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);

	/* Nothing to trace unless full accounting is active. */
	if (!btrfs_qgroup_full_accounting(fs_info))
		return 1;

#if BITS_PER_LONG == 32
	if (bytenr >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
			     bytenr);
		btrfs_err_32bit_limit(fs_info);
		return -EOVERFLOW;
	}
#endif

	xa_lock(&delayed_refs->dirty_extents);
	/*
	 * Keep an already tracked extent: overwriting it with __xa_store()
	 * would silently drop the stored record.
	 *
	 * NOTE(review): if @record can carry information the stored record
	 * lacks, it may have to be merged here before returning - confirm
	 * against the record structure and the callers.
	 */
	existing = xa_load(&delayed_refs->dirty_extents, index);
	if (existing) {
		xa_unlock(&delayed_refs->dirty_extents);
		return 1;
	}

	ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
	xa_unlock(&delayed_refs->dirty_extents);
	if (xa_is_err(ret)) {
		qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret));
		return xa_err(ret);
	}

	return 0;
}
/*
 * Post handler after qgroup_trace_extent_nolock().
 *
 * Fills in @qrecord->old_roots with the result of a commit-root backref walk
 * for @bytenr.  This cannot be done in the nolock variant, because the walk
 * must not run in spinlock context, so it has to be called after leaving it.
 *
 * NOTE: Current qgroup does the expensive backref walk at transaction
 * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
 * new transaction.
 * This is designed to allow btrfs_find_all_roots() to get correct new_roots
 * result.
 *
 * However for old_roots there is no need to do backref walk at that time,
 * since we search commit roots to walk backref and result will always be
 * correct.
 *
 * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
 * using current root, then we can move all expensive backref walk out of
 * transaction committing, but not now as qgroup accounting will be wrong again.
 *
 * Always returns 0; a failed backref walk only marks qgroups inconsistent.
 */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup_extent_record *qrecord,
				   u64 bytenr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_backref_walk_ctx walk_ctx = { 0 };
	int err;

	walk_ctx.bytenr = bytenr;
	walk_ctx.fs_info = fs_info;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	/*
	 * We are always called in a context where we are already holding a
	 * transaction handle. Often we are called when adding a data delayed
	 * reference from btrfs_truncate_inode_items() (truncating or
	 * unlinking), in which case we will be holding a write lock on extent
	 * buffer from a subvolume tree. In this case we can't allow
	 * btrfs_find_all_roots() to acquire fs_info->commit_root_sem, because
	 * that is a higher level lock that must be acquired before locking any
	 * extent buffers.
	 *
	 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
	 * but we can't pass it a non-NULL transaction handle, because
	 * otherwise it would not use commit roots and would lock extent
	 * buffers, causing a deadlock if it ends up trying to read lock the
	 * same extent buffer that was previously write locked at
	 * btrfs_truncate_inode_items().
	 *
	 * So pass a NULL transaction handle to btrfs_find_all_roots() and
	 * explicitly tell it to not acquire the commit_root_sem - if we are
	 * holding a transaction handle we don't need its protection.
	 */
	ASSERT(trans != NULL);

	if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
		return 0;

	err = btrfs_find_all_roots(&walk_ctx, true);
	if (err >= 0) {
		/*
		 * Here we don't need to get the lock of
		 * trans->transaction->delayed_refs, since inserted qrecord
		 * won't be deleted, only qrecord->node may be modified (new
		 * qrecord insert).
		 *
		 * So modifying qrecord->old_roots is safe here.
		 */
		qrecord->old_roots = walk_ctx.roots;
	} else {
		qgroup_mark_inconsistent(fs_info,
				"error accounting new delayed refs extent: %d", err);
	}

	return 0;
}
/*
 * Inform qgroup to trace one dirty extent, specified by @bytenr and
 * @num_bytes.
 * So qgroup can account it at commit trans time.
 *
 * Better encapsulated version, with memory allocation and backref walk for
 * commit roots.
 * So this can sleep.
 *
 * Nothing is done (and 0 is returned) when full qgroup accounting is off or
 * when @bytenr or @num_bytes is 0.
 *
 * Return 0 if the operation is done, which includes the case where the
 * record could not be inserted or already existed.
 * Return -ENOMEM if the record or its xarray slot could not be allocated.
 */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs;
	const unsigned long xa_index = (bytenr >> fs_info->sectorsize_bits);
	struct btrfs_qgroup_extent_record *rec;
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;
	if (bytenr == 0 || num_bytes == 0)
		return 0;

	rec = kzalloc(sizeof(*rec), GFP_NOFS);
	if (!rec)
		return -ENOMEM;

	/* Reserve the slot up front, the insertion itself cannot sleep. */
	if (xa_reserve(&delayed_refs->dirty_extents, xa_index, GFP_NOFS)) {
		kfree(rec);
		return -ENOMEM;
	}

	rec->num_bytes = num_bytes;

	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, rec, bytenr);
	if (ret == 0)
		return btrfs_qgroup_trace_extent_post(trans, rec, bytenr);

	/* Clean up if insertion fails or item exists. */
	xa_release(&delayed_refs->dirty_extents, xa_index);
	kfree(rec);
	return 0;
}
/*
 * Inform qgroup to trace all leaf items of data
 *
 * Walks every item of leaf @eb and calls btrfs_qgroup_trace_extent() for
 * each non-inline file extent item whose disk bytenr is not zero.
 * Does nothing when full qgroup accounting is not active.
 *
 * Return 0 for success
 * Return <0 for error(ENOMEM)
 */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non qgroup-accountable extents */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		/*
		 * The on-disk size must be read from the item before it is
		 * handed to the tracer; it was previously passed on without
		 * ever being assigned.
		 */
		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
		if (ret)
			return ret;
	}
	cond_resched();
	return 0;
}
/*
 * Walk up the tree from the bottom, freeing leaves and any interior
 * nodes which have had all slots visited. If a node (leaf or
 * interior) is freed, the node above it will have its slot
 * incremented. The root node will never be freed.
 *
 * At the end of this function, we should have a path which has all
 * slots incremented to the next position for a search. If we need to
 * read a new node it will be NULL and the node above it will have the
 * correct slot selected for a later read.
 *
 * If we increment the root nodes slot counter past the number of
 * elements, 1 is returned to signal completion of the search.
 * 1 is also returned right away when @root_level is 0.
 */
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
	struct extent_buffer *eb;
	int level;
	int nr, slot;

	if (root_level == 0)
		return 1;

	for (level = 0; level <= root_level; level++) {
		eb = path->nodes[level];
		nr = btrfs_header_nritems(eb);
		slot = ++path->slots[level];

		/*
		 * We have a valid slot to walk back down from. Stop here so
		 * caller can process these new nodes.
		 */
		if (slot < nr && level != 0)
			break;

		/*
		 * Don't free the root - we will detect this condition after
		 * our loop and return a positive value for caller to stop
		 * walking the tree.
		 */
		if (level == root_level)
			continue;

		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;

		free_extent_buffer(eb);
		path->nodes[level] = NULL;
		path->slots[level] = 0;
	}

	eb = path->nodes[root_level];
	return (path->slots[root_level] >= btrfs_header_nritems(eb)) ? 1 : 0;
}
/*
 * Helper function to trace a subtree tree block swap.
 *
 * The swap will happen in highest tree block, but there may be a lot of
 * tree blocks involved.
 *
 * For example:
 *  OO = Old tree blocks
 *  NN = New tree blocks allocated during balance
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * When calling qgroup_trace_extent_swap(), we will pass:
 * @src_eb = OO(a)
 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level = 0
 * @root_level = 1
 *
 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
 *
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It should act as a simplified btrfs_search_slot().
 *    The key for search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out new file extents only.
 *    So we still follow the old method by scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in reloc tree.
*/ staticint qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int dst_level, int root_level, bool trace_leaf)
{ struct btrfs_key key; struct btrfs_path *src_path; struct btrfs_fs_info *fs_info = trans->fs_info;
u32 nodesize = fs_info->nodesize; int cur_level = root_level; int ret;
/* * Now both @dst_path and @src_path have been populated, record the tree * blocks for qgroup accounting.
*/
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
nodesize); if (ret < 0) goto out;
ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
nodesize); if (ret < 0) goto out;
/* Record leaf file extents */ if (dst_level == 0 && trace_leaf) {
ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) goto out;
ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
}
out:
btrfs_free_path(src_path); return ret;
}
/*
 * Helper function to do recursive generation-aware depth-first search, to
 * locate all new tree blocks in a subtree of reloc tree.
 *
 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
 *         reloc tree
 * L2         NN (a)
 *          /    \
 * L1    OO        NN (b)
 *      /  \      /  \
 * L0  OO   OO   OO   NN
 *               (c)  (d)
 * If we pass:
 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
 * @cur_level = 1
 * @root_level = 1
 *
 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
 * above tree blocks along with their counterparts in file tree.
 * While during search, old tree blocks OO(c) will be skipped as tree block swap
 * won't affect OO(c).
*/ staticint qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, struct extent_buffer *src_eb, struct btrfs_path *dst_path, int cur_level, int root_level,
u64 last_snapshot, bool trace_leaf)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *eb; bool need_cleanup = false; int ret = 0; int i;
/* Read the tree block if needed */ if (dst_path->nodes[cur_level] == NULL) { int parent_slot;
u64 child_gen;
/* * dst_path->nodes[root_level] must be initialized before * calling this function.
*/ if (cur_level == root_level) {
btrfs_err_rl(fs_info, "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
__func__, root_level, root_level, cur_level); return -EUCLEAN;
}
/* * We need to get child blockptr/gen from parent before we can * read it.
*/
eb = dst_path->nodes[cur_level + 1];
parent_slot = dst_path->slots[cur_level + 1];
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
/* This node is old, no need to trace */ if (child_gen < last_snapshot) goto out;
eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) {
ret = PTR_ERR(eb); goto out;
}
/* Now record this tree block and its counter part for qgroups */
ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
root_level, trace_leaf); if (ret < 0) goto cleanup;
eb = dst_path->nodes[cur_level];
if (cur_level > 0) { /* Iterate all child tree blocks */ for (i = 0; i < btrfs_header_nritems(eb); i++) { /* Skip old tree blocks as they won't be swapped */ if (btrfs_node_ptr_generation(eb, i) < last_snapshot) continue;
dst_path->slots[cur_level] = i;
/* Recursive call (at most 7 times) */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
dst_path, cur_level - 1, root_level,
last_snapshot, trace_leaf); if (ret < 0) goto cleanup;
}
}
/* * Inform qgroup to trace a whole subtree, including all its child tree * blocks and data. * The root tree block is specified by @root_eb. * * Normally used by relocation(tree block swap) and subvolume deletion. * * Return 0 for success * Return <0 for error(ENOMEM or tree search error)
*/ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb,
u64 root_gen, int root_level)
{ struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; int level;
u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL;
/* * This function only gets called for snapshot drop, if we hit a high * node here, it means we are going to change ownership for quite a lot * of extents, which will greatly slow down btrfs_commit_transaction(). * * So here if we find a high tree here, we just skip the accounting and * mark qgroup inconsistent.
*/ if (root_level >= drop_subptree_thres) {
qgroup_mark_inconsistent(fs_info, "subtree level reached threshold"); return 0;
}
ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out;
}
if (root_level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); goto out;
}
path = btrfs_alloc_path(); if (!path) return -ENOMEM;
/* * Walk down the tree. Missing extent blocks are filled in as * we go. Metadata is accounted every time we read a new * extent block. * * When we reach a leaf, we account for file extent items in it, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process.
*/
refcount_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
walk_down:
level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { int parent_slot;
u64 child_bytenr;
/* * We need to get child blockptr from parent before we * can read it.
*/
eb = path->nodes[level + 1];
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) {
ret = PTR_ERR(eb); goto out;
}
#define UPDATE_NEW 0 #define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts.
*/ staticvoid qgroup_update_refcnt(struct btrfs_fs_info *fs_info, struct ulist *roots, struct list_head *qgroups,
u64 seq, int update_old)
{ struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg;
if (!roots) return;
ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) {
LIST_HEAD(tmp);
qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue;
/*
 * Update qgroup rfer/excl counters.
 * Rfer update is easy, codes can explain themselves.
 *
 * Excl update is tricky, the update is split into 2 parts.
 * Part 1: Possible exclusive <-> sharing detect:
 *	|	A	|	!A	|
 *  -------------------------------------
 *  B	|	*	|	-	|
 *  -------------------------------------
 *  !B	|	+	|	**	|
 *  -------------------------------------
 *
 * Conditions:
 * A:	cur_old_roots < nr_old_roots	(not exclusive before)
 * !A:	cur_old_roots == nr_old_roots	(possible exclusive before)
 * B:	cur_new_roots < nr_new_roots	(not exclusive now)
 * !B:	cur_new_roots == nr_new_roots	(possible exclusive now)
 *
 * Results:
 * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
 * *: Definitely not changed.		**: Possible unchanged.
 *
 * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
 *
 * To make the logic clear, we first use condition A and B to split
 * combination into 4 results.
 *
 * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
 * only one variant may be 0.
 *
 * Lastly, check result **, since there are 2 variants that may be 0, split
 * them again (2x2).
 * But this time we don't need to consider other things, the codes and logic
 * is easy to understand now.
*/ staticvoid qgroup_update_counters(struct btrfs_fs_info *fs_info, struct list_head *qgroups, u64 nr_old_roots,
u64 nr_new_roots, u64 num_bytes, u64 seq)
{ struct btrfs_qgroup *qg;
/* * Check if the @roots potentially is a list of fs tree roots * * Return 0 for definitely not a fs/subvol tree roots ulist * Return 1 for possible fs/subvol tree roots in the list (considering an empty * one as well)
*/ staticint maybe_fs_roots(struct ulist *roots)
{ struct ulist_node *unode; struct ulist_iterator uiter;
/* Empty one, still possible for fs roots */ if (!roots || roots->nnodes == 0) return 1;
ULIST_ITER_INIT(&uiter);
unode = ulist_next(roots, &uiter); if (!unode) return 1;
/* * If it contains fs tree roots, then it must belong to fs/subvol * trees. * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
*/ return btrfs_is_fstree(unode->val);
}
/* * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here.
*/ if (!btrfs_qgroup_full_accounting(fs_info) ||
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free;
if (new_roots) { if (!maybe_fs_roots(new_roots)) goto out_free;
nr_new_roots = new_roots->nnodes;
} if (old_roots) { if (!maybe_fs_roots(old_roots)) goto out_free;
nr_old_roots = old_roots->nnodes;
}
/* Quick exit, either not fs tree roots, or won't affect any qgroup */ if (nr_old_roots == 0 && nr_new_roots == 0) goto out_free;
/* * We're done using the iterator, release all its qgroups while holding * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() * and trigger use-after-free accesses to qgroups.
*/
qgroup_iterator_nested_clean(&qgroups);
/* * Old roots should be searched when inserting qgroup * extent record. * * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, * we may have some record inserted during * NO_ACCOUNTING (thus no old_roots populated), but * later we start rescan, which clears NO_ACCOUNTING, * leaving some inserted records without old_roots * populated. * * Those cases are rare and should not cause too much * time spent during commit_transaction().
*/ if (!record->old_roots) { /* Search commit root to find old_roots */
ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup;
record->old_roots = ctx.roots;
ctx.roots = NULL;
}
/* * Use BTRFS_SEQ_LAST as time_seq to do special search, * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction().
*/
ctx.trans = trans;
ctx.time_seq = BTRFS_SEQ_LAST;
ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup;
new_roots = ctx.roots; if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
ulist_del(record->old_roots, qgroup_to_skip,
0);
}
ret = btrfs_qgroup_account_extent(trans, bytenr,
record->num_bytes,
record->old_roots,
new_roots);
record->old_roots = NULL;
new_roots = NULL;
} /* Free the reserved data space */
btrfs_qgroup_free_refroot(fs_info,
record->data_rsv_refroot,
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
cleanup:
ulist_free(record->old_roots);
ulist_free(new_roots);
new_roots = NULL;
xa_erase(&delayed_refs->dirty_extents, index);
kfree(record);
/*
 * Writes all changed qgroups to disk.
 * Called by the transaction commit path and the qgroup assign ioctl.
 *
 * Every qgroup on fs_info->dirty_qgroups gets its info and limit items
 * updated, then the status item is updated.  A failed item update marks the
 * qgroups inconsistent but does not stop the loop.
 *
 * Returns 0 when there is no quota root, otherwise the result of the status
 * item update.
 */
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int err = 0;

	/*
	 * In case we are called from the qgroup assign ioctl, assert that we
	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
	 * disable operation (ioctl) and access a freed quota root.
	 */
	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
		lockdep_assert_held(&fs_info->qgroup_ioctl_lock);

	if (!fs_info->quota_root)
		return err;

	spin_lock(&fs_info->qgroup_lock);
	for (;;) {
		struct btrfs_qgroup *dirty;

		if (list_empty(&fs_info->dirty_qgroups))
			break;

		dirty = list_first_entry(&fs_info->dirty_qgroups,
					 struct btrfs_qgroup, dirty);
		list_del_init(&dirty->dirty);

		/* The item updates are done with the spinlock dropped. */
		spin_unlock(&fs_info->qgroup_lock);

		err = update_qgroup_info_item(trans, dirty);
		if (err)
			qgroup_mark_inconsistent(fs_info,
					"qgroup info item update error %d", err);

		err = update_qgroup_limit_item(trans, dirty);
		if (err)
			qgroup_mark_inconsistent(fs_info,
					"qgroup limit item update error %d", err);

		spin_lock(&fs_info->qgroup_lock);
	}

	if (!btrfs_qgroup_enabled(fs_info))
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	else
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	err = update_qgroup_status_item(trans);
	if (err)
		qgroup_mark_inconsistent(fs_info,
				"qgroup status item update error %d", err);

	return err;
}
int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit,
size_t size)
{ if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) return -EINVAL;
/* * In the past we allowed btrfs_qgroup_inherit to specify to copy * rfer/excl numbers directly from other qgroups. This behavior has * been disabled in userspace for a very long time, but here we should * also disable it in kernel, as this behavior is known to mark qgroup * inconsistent, and a rescan would wipe out the changes anyway. * * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies.
*/ if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) return -EINVAL;
if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) return -EINVAL;
/* * Skip the inherit source qgroups check if qgroup is not enabled. * Qgroup can still be later enabled causing problems, but in that case * btrfs_qgroup_inherit() would just ignore those invalid ones.
*/ if (!btrfs_qgroup_enabled(fs_info)) return 0;
/* * Now check all the remaining qgroups, they should all: * * - Exist * - Be higher level qgroups.
*/ for (int i = 0; i < inherit->num_qgroups; i++) { struct btrfs_qgroup *qgroup;
u64 qgroupid = inherit->qgroups[i];
if (btrfs_qgroup_level(qgroupid) == 0) return -EINVAL;
/* * Check if we can skip rescan when inheriting qgroups. If @src has a single * @parent, and that @parent is owning all its bytes exclusively, we can skip * the full rescan, by just adding nodesize to the @parent's excl/rfer. * * Return <0 for fatal errors (like srcid/parentid has no qgroup). * Return 0 if a quick inherit is done. * Return >0 if a quick inherit is not possible, and a full rescan is needed.
*/ staticint qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
u64 srcid, u64 parentid)
{ struct btrfs_qgroup *src; struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; int nr_parents = 0;
src = find_qgroup_rb(fs_info, srcid); if (!src) return -ENOENT;
parent = find_qgroup_rb(fs_info, parentid); if (!parent) return -ENOENT;
/* * Source has no parent qgroup, but our new qgroup would have one. * Qgroup numbers would become inconsistent.
*/ if (list_empty(&src->groups)) return 1;
list_for_each_entry(list, &src->groups, next_group) { /* The parent is not the same, quick update is not possible. */ if (list->group->qgroupid != parentid) return 1;
nr_parents++; /* * More than one parent qgroup, we can't be sure about accounting * consistency.
*/ if (nr_parents > 1) return 1;
}
/* * The parent is not exclusively owning all its bytes. We're not sure * if the source has any bytes not fully owned by the parent.
*/ if (parent->excl != parent->rfer) return 1;
/* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome.
*/ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u64 objectid, u64 inode_rootid, struct btrfs_qgroup_inherit *inherit)
{ int ret = 0;
u64 *i_qgroups; bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; struct btrfs_qgroup *prealloc; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false;
u32 level_size = 0;
u64 nums;
if (!btrfs_qgroup_enabled(fs_info)) return 0;
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) return -ENOMEM;
/* * There are only two callers of this function. * * One in create_subvol() in the ioctl context, which needs to hold * the qgroup_ioctl_lock. * * The other one in create_pending_snapshot() where no other qgroup * code can modify the fs as they all need to either start a new trans * or hold a trans handler, thus we don't need to hold * qgroup_ioctl_lock. * This would avoid long and complex lock chain and make lockdep happy.
*/
spin_lock(&fs_info->trans_lock); if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
committing = true;
spin_unlock(&fs_info->trans_lock);
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root; if (!quota_root) {
ret = -EINVAL; goto out;
}
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); if (ret) goto out;
free_inherit = true;
}
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2 * inherit->num_excl_copies; for (int i = 0; i < nums; i++) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
/* * Zero out invalid groups so we can ignore * them later.
*/ if (!srcgroup ||
((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
*i_qgroups = 0ULL;
++i_qgroups;
}
}
/* * create a tracking group for the subvol itself
*/
ret = add_qgroup_item(trans, quota_root, objectid); if (ret) goto out;
/* * add qgroup to all inherited groups
*/ if (inherit) {
i_qgroups = (u64 *)(inherit + 1); for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) { if (*i_qgroups == 0) continue;
ret = add_qgroup_relation_item(trans, objectid,
*i_qgroups); if (ret && ret != -EEXIST) goto out;
ret = add_qgroup_relation_item(trans, *i_qgroups,
objectid); if (ret && ret != -EEXIST) goto out;
}
ret = 0;
qlist_prealloc = kcalloc(inherit->num_qgroups, sizeof(struct btrfs_qgroup_list *),
GFP_NOFS); if (!qlist_prealloc) {
ret = -ENOMEM; goto out;
} for (int i = 0; i < inherit->num_qgroups; i++) {
qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
GFP_NOFS); if (!qlist_prealloc[i]) {
ret = -ENOMEM; goto out;
}
}
}
if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock;
/* * We call inherit after we clone the root in order to make sure * our counts don't go crazy, so at this point the only * difference between the two roots should be the root node.
*/
level_size = fs_info->nodesize;
dstgroup->rfer = srcgroup->rfer;
dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
dstgroup->excl = level_size;
dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
/* * If the source qgroup has parent but the new one doesn't, * we need a full rescan.
*/ if (!inherit && !list_empty(&srcgroup->groups))
need_rescan = true;
}
if (!inherit) goto unlock;
i_qgroups = (u64 *)(inherit + 1); for (int i = 0; i < inherit->num_qgroups; i++) { if (*i_qgroups) {
ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
*i_qgroups);
qlist_prealloc[i] = NULL; if (ret) goto unlock;
} if (srcid) { /* Check if we can do a quick inherit. */
ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); if (ret < 0) goto unlock; if (ret > 0)
need_rescan = true;
ret = 0;
}
++i_qgroups;
}
for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst;
unlock:
spin_unlock(&fs_info->qgroup_lock); if (!ret)
ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out: if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan)
qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan"); if (qlist_prealloc) { for (int i = 0; i < inherit->num_qgroups; i++)
kfree(qlist_prealloc[i]);
kfree(qlist_prealloc);
} if (free_inherit)
kfree(inherit);
kfree(prealloc); return ret;
}
ret = 0; /* * no limits exceeded, now record the reservation into all qgroups
*/
list_for_each_entry(qgroup, &qgroup_list, iterator)
qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
/* * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 * qgroup). * * Will handle all higher level qgroup too. * * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. * This special case is only used for META_PERTRANS type.
*/ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type)
{ struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
if (!btrfs_is_fstree(ref_root)) return;
if (num_bytes == 0) return;
if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
WARN(1, "%s: Invalid type to free", __func__); return;
}
spin_lock(&fs_info->qgroup_lock);
if (!fs_info->quota_root) goto out;
qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out;
if (num_bytes == (u64)-1) /* * We're freeing all pertrans rsv, get reserved value from * level 0 qgroup as real num_bytes to free.
*/
num_bytes = qgroup->rsv.values[type];
/* * Check if the leaf is the last leaf. Which means all node pointers * are at their last position.
*/ staticbool is_last_leaf(struct btrfs_path *path)
{ int i;
for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) returnfalse;
} returntrue;
}
/* * returns < 0 on error, 0 when more leafs are to be scanned. * returns 1 when done.
*/ staticint qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path)
{ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL;
u64 num_bytes; bool done; int slot; int ret;
if (!btrfs_qgroup_full_accounting(fs_info)) return 1;
if (ret) { /* * The rescan is about to end, we will not be scanning any * further blocks. We cannot unset the RESCAN flag here, because * we want to commit the transaction if everything went well. * To make the live accounting work in this phase, we set our * scan progress pointer such that every real extent objectid * will be smaller.
*/
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock); return ret;
}
done = is_last_leaf(path);
ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */
ret = btrfs_qgroup_account_extent(trans, found.objectid,
num_bytes, NULL, ctx.roots); if (ret < 0) goto out;
}
out: if (scratch_leaf)
free_extent_buffer(scratch_leaf);
if (done && !ret) {
ret = 1;
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
} return ret;
}
staticbool rescan_should_stop(struct btrfs_fs_info *fs_info)
{ if (btrfs_fs_closing(fs_info)) returntrue; if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) returntrue; if (!btrfs_qgroup_enabled(fs_info)) returntrue; if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) returntrue; returnfalse;
}
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return;
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM; goto out;
} /* * Rescan should only search for commit root, and any later difference * should be recorded by qgroup
*/
path->search_commit_root = 1;
path->skip_locking = 1;
while (!ret && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) {
ret = PTR_ERR(trans); break;
}
ret = qgroup_rescan_leaf(trans, path);
did_leaf_rescans = true;
if (ret > 0)
btrfs_commit_transaction(trans); else
btrfs_end_transaction(trans);
}
/* * Only update status, since the previous part has already updated the * qgroup info, and only if we did any actual work. This also prevents * race with a concurrent quota disable, which has already set * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at * btrfs_quota_disable().
*/ if (did_leaf_rescans) {
trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
btrfs_err(fs_info, "fail to start transaction for status update: %d",
ret);
}
} else {
trans = NULL;
}
mutex_lock(&fs_info->qgroup_rescan_lock); if (!stopped ||
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { int ret2 = update_qgroup_status_item(trans);
if (ret2 < 0) {
ret = ret2;
btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
}
}
fs_info->qgroup_rescan_running = false;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
complete_all(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
/* * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all * memory required for the rescan context.
*/ staticint
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags)
{ int ret = 0;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); return -EINVAL;
}
if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
btrfs_debug(fs_info, "qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} elseif (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled");
ret = -ENOTCONN;
}
if (ret) return ret;
}
mutex_lock(&fs_info->qgroup_rescan_lock);
if (init_flags) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
ret = -EINPROGRESS;
} elseif (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_debug(fs_info, "qgroup rescan init failed, qgroup is not enabled");
ret = -ENOTCONN;
} elseif (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */
ret = -EBUSY;
}
spin_lock(&fs_info->qgroup_lock); /* clear all current qgroup tracking information */ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
qgroup->rfer = 0;
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
/*
 * Start a qgroup rescan from the beginning.
 *
 * Initializes the rescan state, commits the current transaction, zeroes the
 * tracked numbers and queues the rescan work.
 *
 * Returns 0 when the work was queued, -ENOTCONN when full qgroup accounting
 * is not active at queue time, or the error from the rescan init or from the
 * transaction commit.
 */
int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (ret)
		return ret;

	/*
	 * We have set the rescan_progress to 0, which means no more
	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
	 * However, btrfs_qgroup_account_ref may be right after its call
	 * to btrfs_find_all_roots, in which case it would still do the
	 * accounting.
	 * To solve this, we're committing the transaction, which will
	 * ensure we run all delayed refs and only after that, we are
	 * going to clear all tracking information for a clean start.
	 */
	ret = btrfs_commit_current_transaction(fs_info->fs_root);
	if (ret) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		return ret;
	}

	qgroup_rescan_zero_tracking(fs_info);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	/*
	 * The rescan worker is only for full accounting qgroups, check if it's
	 * enabled as it is pointless to queue it otherwise. A concurrent quota
	 * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
	 */
	if (!btrfs_qgroup_full_accounting(fs_info)) {
		ret = -ENOTCONN;
	} else {
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	return ret;
}
/*
 * Wait for a running qgroup rescan to finish.
 *
 * @fs_info:       filesystem whose rescan completion is waited on
 * @interruptible: when true, wait with wait_for_completion_interruptible()
 *
 * Return: 0 when no rescan is running or once the wait has finished;
 * otherwise whatever wait_for_completion_interruptible() returned.
 */
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible)
{
	int running;
	int ret = 0;

	/*
	 * Sample the running flag under qgroup_rescan_lock, the lock held
	 * where the flag is set before the worker gets queued.
	 */
	mutex_lock(&fs_info->qgroup_rescan_lock);
	running = fs_info->qgroup_rescan_running;
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	/* Nothing was queued, so there is no completion to wait for. */
	if (!running)
		return 0;

	if (interruptible)
		ret = wait_for_completion_interruptible(
					&fs_info->qgroup_rescan_completion);
	else
		wait_for_completion(&fs_info->qgroup_rescan_completion);

	return ret;
}
/*
 * Re-queue the rescan worker when the status flags say a rescan is still
 * pending.
 *
 * This is only called from open_ctree where we're still single threaded.
 * The flag test is done without a lock; setting the running flag and
 * queueing the work are done under qgroup_rescan_lock.
 */
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
		return;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	fs_info->qgroup_rescan_running = true;
	btrfs_queue_work(fs_info->qgroup_rescan_workers,
			 &fs_info->qgroup_rescan_work);
	mutex_unlock(&fs_info->qgroup_rescan_lock);
}
/*
 * Walk rb-tree nodes beginning at @start.
 *
 * The successor is fetched with rb_next() into @next before the loop body
 * runs, and the loop advances through @next rather than re-reading @node.
 * The walk stops when @node becomes NULL.
 */
#define rbtree_iterate_from_safe(node, next, start)			\
	for ((node) = (start);						\
	     (node) && ({ (next) = rb_next(node); 1; });		\
	     (node) = (next))
/*
 * Try to free some space for qgroup.
 *
 * For qgroup, there are only 3 ways to free qgroup space:
 * - Flush nodatacow write
 *   Any nodatacow write will free its reserved data space at
 *   run_delalloc_range().
 *   In theory, we should only flush nodatacow inodes, but it's not yet
 *   possible, so we need to flush the whole root.
 *
 * - Wait for ordered extents
 *   When ordered extents are finished, their reserved metadata is finally
 *   converted to per_trans status, which can be freed by later commit
 *   transaction.
 *
 * - Commit transaction
 *   This would free the meta_per_trans space.
 *   In theory this shouldn't provide much space, but it is done because
 *   any more qgroup space is needed.
 *
 * Return: 0 when the caller holds a transaction, when another flush was
 * already running (after waiting for it), or on success; otherwise the
 * error from starting delalloc or from committing the transaction.
 */
static int try_flush_qgroup(struct btrfs_root *root)
{
	int ret;

	/* Can't hold an open transaction or we run the risk of deadlocking. */
	ASSERT(current->journal_info == NULL);
	if (WARN_ON(current->journal_info))
		return 0;

	/*
	 * We don't want to run flush again and again, so if there is a running
	 * one, we won't try to start a new flush, but exit directly.
	 */
	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
		wait_event(root->qgroup_flush_wait,
			   !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
		return 0;
	}

	ret = btrfs_start_delalloc_snapshot(root, true);
	if (ret >= 0) {
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);

		/*
		 * After waiting for ordered extents run delayed iputs in order
		 * to free space from unlinked files before committing the
		 * current transaction, as ordered extents may have been holding
		 * the last reference of an inode and they add a delayed iput
		 * when they complete.
		 */
		btrfs_run_delayed_iputs(root->fs_info);
		btrfs_wait_on_delayed_iputs(root->fs_info);

		ret = btrfs_commit_current_transaction(root);
	}

	/* Success or failure, let the waiters of this flush go. */
	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
	wake_up(&root->qgroup_flush_wait);
	return ret;
}
/*
 * Reserve qgroup space for range [start, start + len).
 *
 * This function will either reserve space from related qgroups or do
 * nothing if the range is already reserved. A first attempt that returns
 * -EDQUOT (or a positive value) is followed by a qgroup flush and one retry.
 *
 * Return 0 for successful reservation
 * Return <0 for error (including -EDQUOT)
 *
 * NOTE: This function may sleep for memory allocation, dirty page flushing
 * and commit transaction. So caller should not hold any dirty page locked.
 */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
			      struct extent_changeset **reserved_ret, u64 start,
			      u64 len)
{
	int ret;

	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
	/* Done unless the result is -EDQUOT or positive. */
	if (ret != -EDQUOT && ret <= 0)
		return ret;

	ret = try_flush_qgroup(inode->root);
	if (ret < 0)
		return ret;

	return qgroup_reserve_data(inode, reserved_ret, start, len);
}
/*
 * NOTE(review): the function header and the declarations of uiter, unode,
 * changeset, freed, root and ret are not visible in this part of the file;
 * presumably this is the body of qgroup_free_reserved_data() - confirm.
 *
 * Walk every range recorded in reserved->range_changed and, for the part of
 * it that overlaps [start, start + len), clear EXTENT_QGROUP_RESERVED in the
 * inode io_tree while accumulating the cleared byte count in @freed.
 */
ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
u64 range_start = unode->val; /* unode->aux is the inclusive end */
u64 range_len = unode->aux - range_start + 1;
u64 free_start;
u64 free_len;
/* Release the changeset contents at the top of every iteration, before it is handed to the clear call again. */
extent_changeset_release(&changeset);
/* Only free range in range [start, start + len) */ if (range_start >= start + len ||
range_start + range_len <= start) continue;
/* Clamp the recorded range to its intersection with [start, start + len). */
free_start = max(range_start, start);
free_len = min(start + len, range_start + range_len) -
free_start; /* * TODO: To also modify reserved->ranges_reserved to reflect * the modification. * * However as long as we free qgroup reserved according to * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush.
*/
ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start,
free_start + free_len - 1,
EXTENT_QGROUP_RESERVED,
&changeset); if (ret < 0) goto out;
freed += changeset.bytes_changed;
}
/* Pass the accumulated byte count on as a data reservation free and report it through @freed_ret when the caller supplied one. */
btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
BTRFS_QGROUP_RSV_DATA); if (freed_ret)
*freed_ret = freed;
ret = 0;
/* Both the success and the error path release the changeset before returning. */
out:
extent_changeset_release(&changeset); return ret;
}
/*
 * Common helper behind btrfs_qgroup_free_data() and
 * btrfs_qgroup_release_data().
 *
 * Clears EXTENT_QGROUP_RESERVED for [start, start + len) in the inode
 * io_tree. When @free is set the cleared bytes are additionally handed to
 * btrfs_qgroup_free_refroot() as a data reservation free; when @free is set
 * and @reserved is given, the work is delegated to
 * qgroup_free_reserved_data() instead.
 *
 * @released, when not NULL, receives the number of bytes whose bit was
 * cleared (not written on the qgroup-disabled path or on error).
 *
 * Return: 0 or a negative error from the bit clearing / delegated helper.
 */
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
				       struct extent_changeset *reserved,
				       u64 start, u64 len,
				       u64 *released, int free)
{
	struct extent_changeset changeset;
	int trace_op;
	int ret;

	/* With qgroups disabled only the io_tree bit is cleared. */
	if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return btrfs_clear_record_extent_bits(&inode->io_tree, start,
						      start + len - 1,
						      EXTENT_QGROUP_RESERVED, NULL);

	/* In release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len, released);

	extent_changeset_init(&changeset);
	ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
					     EXTENT_QGROUP_RESERVED, &changeset);
	if (ret >= 0) {
		trace_op = free ? QGROUP_FREE : QGROUP_RELEASE;
		trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
						changeset.bytes_changed, trace_op);
		if (free)
			btrfs_qgroup_free_refroot(inode->root->fs_info,
						  btrfs_root_id(inode->root),
						  changeset.bytes_changed,
						  BTRFS_QGROUP_RSV_DATA);
		if (released)
			*released = changeset.bytes_changed;
	}

	extent_changeset_release(&changeset);
	return ret;
}
/*
 * Free a reserved space range from io_tree and related qgroups.
 *
 * Meant for a range of pages that gets invalidated before reaching disk,
 * or for the error cleanup case.
 * If @reserved is given, only the reserved range in [@start, @start + @len)
 * will be freed.
 *
 * For data written to disk, use btrfs_qgroup_release_data().
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
			   struct extent_changeset *reserved,
			   u64 start, u64 len, u64 *freed)
{
	/* Non-zero selects the "free" flavour of the shared helper. */
	const int do_free = 1;

	return __btrfs_qgroup_release_data(inode, reserved, start, len, freed,
					   do_free);
}
/*
 * Release a reserved space range from io_tree only.
 *
 * Meant for a range of pages that got written to disk, with the
 * corresponding FILE_EXTENT inserted into the corresponding root.
 *
 * Since the new qgroup accounting framework only updates qgroup numbers at
 * commit_transaction() time, the reserved space shouldn't be freed from the
 * related qgroups here.
 *
 * The range is still released from io_tree, to allow further writes to be
 * COWed.
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len,
			      u64 *released)
{
	/* Zero selects the "release" flavour; no @reserved set is passed. */
	const int do_free = 0;

	return __btrfs_qgroup_release_data(inode, NULL, start, len, released,
					   do_free);
}
/*
 * NOTE(review): this block does not look like a single function. After the
 * two early returns (type is neither META_PREALLOC nor META_PERTRANS, or
 * num_bytes is 0) the body uses fs_info, ret and enforce, none of which are
 * declared here, returns a value although the function is void, and calls
 * add_root_meta_rsv() itself. Presumably the remainder of this helper and
 * the header of a separate reserve function are missing in between -
 * confirm against the complete file before changing any code here.
 */
staticvoid add_root_meta_rsv(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type)
{ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
type != BTRFS_QGROUP_RSV_META_PERTRANS) return; if (num_bytes == 0) return;
/* The byte count has to be a multiple of fs_info->nodesize. */
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; /* * Record what we have reserved into root. * * To avoid quota disabled->enabled underflow. * In that case, we may try to free space we haven't reserved * (since quota was disabled), so record what we reserved into root. * And ensure later release won't underflow this number.
*/
add_root_meta_rsv(root, num_bytes, type); return ret;
}
/*
 * Reserve qgroup metadata space, optionally flushing and retrying once.
 *
 * The first attempt goes straight to btrfs_qgroup_reserve_meta(). With
 * @noflush set its result is returned as is. Otherwise a result of -EDQUOT
 * (or a positive value) triggers try_flush_qgroup() followed by one more
 * reservation attempt.
 *
 * Return: the result of the last reservation attempt, or the negative
 * error from the flush.
 */
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if (noflush)
		return ret;
	if (ret != -EDQUOT && ret <= 0)
		return ret;

	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;

	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
/*
 * Per-transaction meta reservation should be all freed at transaction
 * commit time.
 *
 * Does nothing when qgroups are disabled or @root is not a fs tree.
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return;
	if (!btrfs_is_fstree(btrfs_root_id(root)))
		return;

	/* TODO: Update trace point to handle such free */
	trace_btrfs_qgroup_meta_free_all_pertrans(root);

	/* Special value -1 means to free all reserved space */
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);
}
/*
 * NOTE(review): the function header and the declarations of fs_info, root,
 * num_bytes and type are not visible here; presumably this is the body of
 * the metadata free helper - confirm against the complete file.
 *
 * Nothing is done when qgroups are disabled or @root is not a fs tree.
 */
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!btrfs_is_fstree(btrfs_root_id(root))) return;
/* * reservation for META_PREALLOC can happen before quota is enabled, * which can lead to underflow. * Here ensure we will only free what we really have reserved.
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
/* The adjusted byte count has to be a multiple of fs_info->nodesize. */
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
/* The free is reported through the reserve tracepoint with a negated byte count. */
trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
/*
 * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
 *
 * This is called when preallocated meta reservation needs to be used.
 * Normally after btrfs_join_transaction() call.
 *
 * Does nothing when qgroups are disabled or @root is not a fs tree.
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return;
	if (!btrfs_is_fstree(btrfs_root_id(root)))
		return;

	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_btrfs_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);

	/* On a read-only super block the per-trans amount is not recorded. */
	if (sb_rdonly(fs_info->sb))
		return;
	add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
/* * Check qgroup reserved space leaking, normally at destroy inode * time
*/ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{ struct extent_changeset changeset; struct ulist_node *unode; struct ulist_iterator iter; int ret;
/* Clear EXTENT_QGROUP_RESERVED over the whole io_tree range [0, (u64)-1], recording what was cleared in @changeset. */
extent_changeset_init(&changeset);
ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
/* NOTE(review): the rest of this function (use of ret, unode and iter, and the closing brace) is not visible here - confirm against the complete file. */
/*
 * Put @swapped_blocks into its initial state: lock initialized, the
 * "swapped" flag cleared and every one of the BTRFS_MAX_LEVEL trees empty.
 */
void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int level;

	spin_lock_init(&swapped_blocks->lock);
	swapped_blocks->swapped = false;
	for (level = 0; level < BTRFS_MAX_LEVEL; level++)
		swapped_blocks->blocks[level] = RB_ROOT;
}
/* * Delete all swapped blocks record of @root. * Every record here means we skipped a full subtree scan for qgroup. * * Gets called when committing one transaction.
*/ void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{ struct btrfs_qgroup_swapped_blocks *swapped_blocks; int i;
swapped_blocks = &root->swapped_blocks;
/* Under the lock: jump to out when ->swapped is not set, otherwise visit the tree of every level. */
spin_lock(&swapped_blocks->lock); if (!swapped_blocks->swapped) goto out; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { struct rb_root *cur_root = &swapped_blocks->blocks[i]; struct btrfs_qgroup_swapped_block *entry; struct btrfs_qgroup_swapped_block *next;
/* NOTE(review): the loop body, the out label, the unlock and the closing braces are not visible here - confirm against the complete file. */
/*
 * Add subtree roots record into @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree get swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 *
 * Return: 0 on success, when full accounting is not enabled, or when a
 * matching record is already present; -ENOMEM when the record cannot be
 * allocated; -EEXIST when a record that compares equal under
 * qgroup_swapped_block_bytenr_cmp() exists with different contents.
 * Every negative return also marks the qgroups inconsistent.
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node *node;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before swap, while @block is going to
	 * record the bytenr after swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
	 * no one else can modify tree blocks thus we qgroup will not change
	 * no matter the value of trace_leaf.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
	spin_lock(&blocks->lock);
	node = rb_find_add(&block->node, &blocks->blocks[level],
			   qgroup_swapped_block_bytenr_cmp);
	if (node) {
		struct btrfs_qgroup_swapped_block *entry;

		/*
		 * A non-NULL @node is the record already in the tree; resolve
		 * it to its container before comparing it with @block.
		 */
		entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);

		if (entry->subvol_generation != block->subvol_generation ||
		    entry->reloc_bytenr != block->reloc_bytenr ||
		    entry->reloc_generation != block->reloc_generation) {
			/*
			 * Duplicated but mismatch entry found. Shouldn't happen.
			 * Marking qgroup inconsistent should be enough for end
			 * users.
			 */
			DEBUG_WARN("duplicated but mismatched entry found");
			ret = -EEXIST;
		}
		/* @block was not inserted, so it is still ours to free. */
		kfree(block);
		goto out_unlock;
	}
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
	return ret;
}
/* * Check if the tree block is a subtree root, and if so do the needed * delayed subtree trace for qgroup. * * This is called during btrfs_cow_block().
*/ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *subvol_eb)
{ struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; struct rb_node *node; bool swapped = false; int level = btrfs_header_level(subvol_eb); int ret = 0; int i;
if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0;
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]); for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
swapped = true; break;
}
}
blocks->swapped = swapped;
spin_unlock(&blocks->lock);
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.72Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.