From 141fb8cd206ace23c02cd2791c6da52c1d77d42a Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 19 Mar 2024 10:54:22 -0700 Subject: [PATCH 1/6] btrfs: qgroup: correctly model root qgroup rsv in convert We use add_root_meta_rsv and sub_root_meta_rsv to track prealloc and pertrans reservations for subvolumes when quotas are enabled. The convert function does not properly increment pertrans after decrementing prealloc, so the count is not accurate. Note: we check that the fs is not read-only to mirror the logic in qgroup_convert_meta, which checks that before adding to the pertrans rsv. Fixes: 8287475a2055 ("btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5f90f0605b12..cf8820ce7aa2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4495,6 +4495,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* From 74e97958121aa1f5854da6effba70143f051b0cd Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:02:04 -0700 Subject: [PATCH 2/6] btrfs: qgroup: fix qgroup prealloc rsv leak in subvolume operations Create subvolume, create snapshot and delete subvolume all use btrfs_subvolume_reserve_metadata() to reserve metadata for the changes done to the parent subvolume's fs tree, which cannot be mediated in the normal way via start_transaction. When quota groups (squota or qgroups) are enabled, this reserves qgroup metadata of type PREALLOC. Once the operation is associated to a transaction, we convert PREALLOC to PERTRANS, which gets cleared in bulk at the end of the transaction. However, the error paths of these three operations were not implementing this lifecycle correctly. They unconditionally converted the PREALLOC to PERTRANS in a generic cleanup step regardless of errors or whether the operation was fully associated to a transaction or not. This resulted in error paths occasionally converting this rsv to PERTRANS without calling record_root_in_trans successfully, which meant that unless that root got recorded in the transaction by some other thread, the end of the transaction would not free that root's PERTRANS, leaking it. Ultimately, this resulted in hitting a WARN in CONFIG_BTRFS_DEBUG builds at unmount for the leaked reservation. The fix is to ensure that every qgroup PREALLOC reservation observes the following properties: 1. any failure before record_root_in_trans is called successfully results in freeing the PREALLOC reservation. 2. after record_root_in_trans, we convert to PERTRANS, and now the transaction owns freeing the reservation. This patch enforces those properties on the three operations. Without it, generic/269 with squotas enabled at mkfs time would fail in ~5-10 runs on my system. With this patch, it ran successfully 1000 times in a row. Fixes: e85fde5162bf ("btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 ++++++++++++- fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++++--------- fs/btrfs/root-tree.c | 10 ---------- fs/btrfs/root-tree.h | 2 -- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37701531eeb1..1f9e74c4161d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4503,6 +4503,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; + u64 qgroup_reserved = 0; int ret; down_write(&fs_info->subvol_sem); @@ -4547,12 +4548,20 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -4611,7 +4620,9 @@ out_end_trans: ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv); + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_undead: if (ret) { spin_lock(&dest->root_item_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 38459a89b27c..5a507237c4fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,6 +613,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, int ret; dev_t anon_dev; u64 objectid; + u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -650,13 +651,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, trans_num_items, false); if (ret) goto out_new_inode_args; + qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv); - goto out_new_inode_args; + goto out_release_rsv; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) + goto out; + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; /* Tree log can't currently deal with an inode which is a new root. */ @@ -767,9 +773,11 @@ static noinline int create_subvol(struct mnt_idmap *idmap, out: trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv); - btrfs_end_transaction(trans); +out_release_rsv: + btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -791,6 +799,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; + struct btrfs_block_rsv *block_rsv; + u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ @@ -827,19 +837,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, - BTRFS_BLOCK_RSV_TEMP); + block_rsv = &pending_snapshot->block_rsv; + btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; - ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; + qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -852,6 +862,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, ret = PTR_ERR(trans); goto fail; } + ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); + if (ret) { + btrfs_end_transaction(trans); + goto fail; + } + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; @@ -881,7 +898,9 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); + if (qgroup_reserved) + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 4bb538a372ce..7007f9e0c972 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,13 +548,3 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } return ret; } - -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - u64 qgroup_to_release; - - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); - btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); -} diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index 6f929cf3bd49..8f5739e732b9 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -18,8 +18,6 @@ struct btrfs_trans_handle; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const struct fscrypt_str *name); From 71537e35c324ea6fbd68377a4f26bb93a831ae35 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:14:24 -0700 Subject: [PATCH 3/6] btrfs: record delayed inode root in transaction When running delayed inode updates, we do not record the inode's root in the transaction, but we do allocate PREALLOC and thus converted PERTRANS space for it. To be sure we free that PERTRANS meta rsv, we must ensure that we record the root in the transaction. Fixes: 4f5427ccce5d ("btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd6f566a383f..121ab890bd05 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1133,6 +1133,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } From 211de93367304ab395357f8cb12568a4d1e20701 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Mar 2024 10:18:39 -0700 Subject: [PATCH 4/6] btrfs: qgroup: convert PREALLOC to PERTRANS after record_root_in_trans The transaction is only able to free PERTRANS reservations for a root once that root has been recorded with the TRANS tag on the roots radix tree. Therefore, until we are sure that this root will get tagged, it isn't safe to convert. Generally, this is not an issue as *some* transaction will likely tag the root before long and this reservation will get freed in that transaction, but technically it could stick around until unmount and result in a warning about leaked metadata reservation space. This path is most exercised by running the generic/269 fstest with CONFIG_BTRFS_DEBUG. Fixes: a6496849671a ("btrfs: fix start transaction qgroup rsv double free") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46e8426adf4f..1f10082d35d9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -745,14 +745,6 @@ again: h->reloc_reserved = reloc_reserved; } - /* - * Now that we have found a transaction to be a part of, convert the - * qgroup reservation from prealloc to pertrans. A different transaction - * can't race in and free our pertrans out from under us. - */ - if (qgroup_reserved) - btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); - got_it: if (!current->journal_info) current->journal_info = h; @@ -786,8 +778,15 @@ got_it: * not just freed. */ btrfs_end_transaction(h); - return ERR_PTR(ret); + goto reserve_fail; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; From 3c6f0c5ecc8910d4ffb0dfe85609ebc0c91c8f34 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 26 Mar 2024 11:55:22 -0700 Subject: [PATCH 5/6] btrfs: make btrfs_clear_delalloc_extent() free delalloc reserve Currently, this call site in btrfs_clear_delalloc_extent() only converts the reservation. We are marking it not delalloc, so I don't think it makes sense to keep the rsv around. This is a path where we are not sure to join a transaction, so it leads to incorrect free-ing during umount. Helps with the pass rate of generic/269 and generic/475. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f9e74c4161d..c65fe5de4022 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2533,7 +2533,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len, false); + btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) From 6e68de0bb0ed59e0554a0c15ede7308c47351e2d Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 26 Mar 2024 12:01:28 -0700 Subject: [PATCH 6/6] btrfs: always clear PERTRANS metadata during commit It is possible to clear a root's IN_TRANS tag from the radix tree, but not clear its PERTRANS, if there is some error in between. Eliminate that possibility by moving the free up to where we clear the tag. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1f10082d35d9..85f359e0e0a7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1494,6 +1494,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); @@ -1518,7 +1519,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock);