btrfs: hold block group refcount during async discard

Async discard does not acquire the block group reference count while it holds a reference on the discard list. This is generally OK, as the paths which destroy block groups tend to try to synchronize on cancelling async discard work. However, relying on cancelling work requires careful analysis to be sure it is safe from races with unpinning scheduling more work. While I am unable to find a race with unpinning in the current code for either the unused bgs or relocation paths, I believe we have one in an older version of auto relocation in a Meta internal build. This suggests that this is in fact an error prone model, and could be fragile to future changes to these bg deletion paths. To make this ownership more clear, add a refcount for async discard. If work is queued for a block group, its refcount should be incremented, and when work is completed or canceled, it should be decremented. CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Boris Burkov <boris@bur.io> Signed-off-by: David Sterba <dsterba@suse.com>
author: Boris Burkov <boris@bur.io> 2023-01-12 16:05:11 -0800
committer: David Sterba <dsterba@suse.com> 2023-02-15 19:38:50 +0100
commit: 2b5463fcbdfb24e898916bcae2b1359042d26963 (patch)
tree: 3d789298e5d721a8c64579b18c6a49047318f3e7 /fs/btrfs/discard.c
parent: btrfs: send: cache utimes operations for directories if possible (diff)
download: linux-2b5463fcbdfb24e898916bcae2b1359042d26963.tar.gz
linux-2b5463fcbdfb24e898916bcae2b1359042d26963.tar.bz2
linux-2b5463fcbdfb24e898916bcae2b1359042d26963.zip
1 files changed, 38 insertions, 3 deletions
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index ff2e524d9937..317aeff6c1da 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
 static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
 				  struct btrfs_block_group *block_group)
 {
+	lockdep_assert_held(&discard_ctl->lock);
 	if (!btrfs_run_discard_work(discard_ctl))
 		return;
 
@@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
 						      BTRFS_DISCARD_DELAY);
 		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
 	}
+	if (list_empty(&block_group->discard_list))
+		btrfs_get_block_group(block_group);
 
 	list_move_tail(&block_group->discard_list,
 		       get_discard_list(discard_ctl, block_group));
@@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
 static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
 				       struct btrfs_block_group *block_group)
 {
+	bool queued;
+
 	spin_lock(&discard_ctl->lock);
 
+	queued = !list_empty(&block_group->discard_list);
+
 	if (!btrfs_run_discard_work(discard_ctl)) {
 		spin_unlock(&discard_ctl->lock);
 		return;
@@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
 	block_group->discard_eligible_time = (ktime_get_ns() +
 					      BTRFS_DISCARD_UNUSED_DELAY);
 	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+	if (!queued)
+		btrfs_get_block_group(block_group);
 	list_add_tail(&block_group->discard_list,
 		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
 
@@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
 				     struct btrfs_block_group *block_group)
 {
 	bool running = false;
+	bool queued = false;
 
 	spin_lock(&discard_ctl->lock);
 
@@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
 	}
 
 	block_group->discard_eligible_time = 0;
+	queued = !list_empty(&block_group->discard_list);
 	list_del_init(&block_group->discard_list);
+	/*
+	 * If the block group is currently running in the discard workfn, we
+	 * don't want to deref it, since it's still being used by the workfn.
+	 * The workfn will notice this case and deref the block group when it is
+	 * finished.
+	 */
+	if (queued && !running)
+		btrfs_put_block_group(block_group);
 
 	spin_unlock(&discard_ctl->lock);
 
@@ -214,10 +233,12 @@ again:
 	if (block_group && now >= block_group->discard_eligible_time) {
 		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
 		    block_group->used != 0) {
-			if (btrfs_is_block_group_data_only(block_group))
+			if (btrfs_is_block_group_data_only(block_group)) {
 				__add_to_discard_list(discard_ctl, block_group);
-			else
+			} else {
 				list_del_init(&block_group->discard_list);
+				btrfs_put_block_group(block_group);
+			}
 			goto again;
 		}
 		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
@@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work)
 	spin_lock(&discard_ctl->lock);
 	discard_ctl->prev_discard = trimmed;
 	discard_ctl->prev_discard_time = now;
+	/*
+	 * If the block group was removed from the discard list while it was
+	 * running in this workfn, then we didn't deref it, since this function
+	 * still owned that reference. But we set the discard_ctl->block_group
+	 * back to NULL, so we can use that condition to know that now we need
+	 * to deref the block_group.
+	 */
+	if (discard_ctl->block_group == NULL)
+		btrfs_put_block_group(block_group);
 	discard_ctl->block_group = NULL;
 	__btrfs_discard_schedule_work(discard_ctl, now, false);
 	spin_unlock(&discard_ctl->lock);
@@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
 	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
 				 bg_list) {
 		list_del_init(&block_group->bg_list);
-		btrfs_put_block_group(block_group);
 		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+		/*
+		 * This put is for the get done by btrfs_mark_bg_unused.
+		 * Queueing discard incremented it for discard's reference.
+		 */
+		btrfs_put_block_group(block_group);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 }
@@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
 			if (block_group->used == 0)
 				btrfs_mark_bg_unused(block_group);
 			spin_lock(&discard_ctl->lock);
+			btrfs_put_block_group(block_group);
 		}
 	}
 	spin_unlock(&discard_ctl->lock);
author	Boris Burkov <boris@bur.io>	2023-01-12 16:05:11 -0800
committer	David Sterba <dsterba@suse.com>	2023-02-15 19:38:50 +0100
commit	2b5463fcbdfb24e898916bcae2b1359042d26963 (patch)
tree	3d789298e5d721a8c64579b18c6a49047318f3e7 /fs/btrfs/discard.c
parent	btrfs: send: cache utimes operations for directories if possible (diff)
download	linux-2b5463fcbdfb24e898916bcae2b1359042d26963.tar.gz linux-2b5463fcbdfb24e898916bcae2b1359042d26963.tar.bz2 linux-2b5463fcbdfb24e898916bcae2b1359042d26963.zip