
Commit 42a2fbd

josefbacik authored and gregkh committed
btrfs: allocate scrub workqueues outside of locks
commit e89c4a9 upstream.

I got the following lockdep splat while testing:

  ======================================================
  WARNING: possible circular locking dependency detected
  5.8.0-rc7-00172-g021118712e59 #932 Not tainted
  ------------------------------------------------------
  btrfs/229626 is trying to acquire lock:
  ffffffff828513f0 (cpu_hotplug_lock){++++}-{0:0}, at: alloc_workqueue+0x378/0x450

  but task is already holding lock:
  ffff889dd3889518 (&fs_info->scrub_lock){+.+.}-{3:3}, at: btrfs_scrub_dev+0x11c/0x630

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #7 (&fs_info->scrub_lock){+.+.}-{3:3}:
         __mutex_lock+0x9f/0x930
         btrfs_scrub_dev+0x11c/0x630
         btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4
         btrfs_ioctl+0x2799/0x30a0
         ksys_ioctl+0x83/0xc0
         __x64_sys_ioctl+0x16/0x20
         do_syscall_64+0x50/0x90
         entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #6 (&fs_devs->device_list_mutex){+.+.}-{3:3}:
         __mutex_lock+0x9f/0x930
         btrfs_run_dev_stats+0x49/0x480
         commit_cowonly_roots+0xb5/0x2a0
         btrfs_commit_transaction+0x516/0xa60
         sync_filesystem+0x6b/0x90
         generic_shutdown_super+0x22/0x100
         kill_anon_super+0xe/0x30
         btrfs_kill_super+0x12/0x20
         deactivate_locked_super+0x29/0x60
         cleanup_mnt+0xb8/0x140
         task_work_run+0x6d/0xb0
         __prepare_exit_to_usermode+0x1cc/0x1e0
         do_syscall_64+0x5c/0x90
         entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #5 (&fs_info->tree_log_mutex){+.+.}-{3:3}:
         __mutex_lock+0x9f/0x930
         btrfs_commit_transaction+0x4bb/0xa60
         sync_filesystem+0x6b/0x90
         generic_shutdown_super+0x22/0x100
         kill_anon_super+0xe/0x30
         btrfs_kill_super+0x12/0x20
         deactivate_locked_super+0x29/0x60
         cleanup_mnt+0xb8/0x140
         task_work_run+0x6d/0xb0
         __prepare_exit_to_usermode+0x1cc/0x1e0
         do_syscall_64+0x5c/0x90
         entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #4 (&fs_info->reloc_mutex){+.+.}-{3:3}:
         __mutex_lock+0x9f/0x930
         btrfs_record_root_in_trans+0x43/0x70
         start_transaction+0xd1/0x5d0
         btrfs_dirty_inode+0x42/0xd0
         touch_atime+0xa1/0xd0
         btrfs_file_mmap+0x3f/0x60
         mmap_region+0x3a4/0x640
         do_mmap+0x376/0x580
         vm_mmap_pgoff+0xd5/0x120
         ksys_mmap_pgoff+0x193/0x230
         do_syscall_64+0x50/0x90
         entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #3 (&mm->mmap_lock#2){++++}-{3:3}:
         __might_fault+0x68/0x90
         _copy_to_user+0x1e/0x80
         perf_read+0x141/0x2c0
         vfs_read+0xad/0x1b0
         ksys_read+0x5f/0xe0
         do_syscall_64+0x50/0x90
         entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #2 (&cpuctx_mutex){+.+.}-{3:3}:
         __mutex_lock+0x9f/0x930
         perf_event_init_cpu+0x88/0x150
         perf_event_init+0x1db/0x20b
         start_kernel+0x3ae/0x53c
         secondary_startup_64+0xa4/0xb0

  -> #1 (pmus_lock){+.+.}-{3:3}:
         __mutex_lock+0x9f/0x930
         perf_event_init_cpu+0x4f/0x150
         cpuhp_invoke_callback+0xb1/0x900
         _cpu_up.constprop.26+0x9f/0x130
         cpu_up+0x7b/0xc0
         bringup_nonboot_cpus+0x4f/0x60
         smp_init+0x26/0x71
         kernel_init_freeable+0x110/0x258
         kernel_init+0xa/0x103
         ret_from_fork+0x1f/0x30

  -> #0 (cpu_hotplug_lock){++++}-{0:0}:
         __lock_acquire+0x1272/0x2310
         lock_acquire+0x9e/0x360
         cpus_read_lock+0x39/0xb0
         alloc_workqueue+0x378/0x450
         __btrfs_alloc_workqueue+0x15d/0x200
         btrfs_alloc_workqueue+0x51/0x160
         scrub_workers_get+0x5a/0x170
         btrfs_scrub_dev+0x18c/0x630
         btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4
         btrfs_ioctl+0x2799/0x30a0
         ksys_ioctl+0x83/0xc0
         __x64_sys_ioctl+0x16/0x20
         do_syscall_64+0x50/0x90
         entry_SYSCALL_64_after_hwframe+0x44/0xa9

  other info that might help us debug this:

  Chain exists of:
    cpu_hotplug_lock --> &fs_devs->device_list_mutex --> &fs_info->scrub_lock

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&fs_info->scrub_lock);
                                 lock(&fs_devs->device_list_mutex);
                                 lock(&fs_info->scrub_lock);
    lock(cpu_hotplug_lock);

   *** DEADLOCK ***

  2 locks held by btrfs/229626:
   #0: ffff88bfe8bb86e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: btrfs_scrub_dev+0xbd/0x630
   #1: ffff889dd3889518 (&fs_info->scrub_lock){+.+.}-{3:3}, at: btrfs_scrub_dev+0x11c/0x630

  stack backtrace:
  CPU: 15 PID: 229626 Comm: btrfs Kdump: loaded Not tainted 5.8.0-rc7-00172-g021118712e59 #932
  Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018
  Call Trace:
   dump_stack+0x78/0xa0
   check_noncircular+0x165/0x180
   __lock_acquire+0x1272/0x2310
   lock_acquire+0x9e/0x360
   ? alloc_workqueue+0x378/0x450
   cpus_read_lock+0x39/0xb0
   ? alloc_workqueue+0x378/0x450
   alloc_workqueue+0x378/0x450
   ? rcu_read_lock_sched_held+0x52/0x80
   __btrfs_alloc_workqueue+0x15d/0x200
   btrfs_alloc_workqueue+0x51/0x160
   scrub_workers_get+0x5a/0x170
   btrfs_scrub_dev+0x18c/0x630
   ? start_transaction+0xd1/0x5d0
   btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4
   btrfs_ioctl+0x2799/0x30a0
   ? do_sigaction+0x102/0x250
   ? lockdep_hardirqs_on_prepare+0xca/0x160
   ? _raw_spin_unlock_irq+0x24/0x30
   ? trace_hardirqs_on+0x1c/0xe0
   ? _raw_spin_unlock_irq+0x24/0x30
   ? do_sigaction+0x102/0x250
   ? ksys_ioctl+0x83/0xc0
   ksys_ioctl+0x83/0xc0
   __x64_sys_ioctl+0x16/0x20
   do_syscall_64+0x50/0x90
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

This happens because we're allocating the scrub workqueues under the scrub and device list mutex, which brings in a whole host of other dependencies.

Because the workqueue allocation is done with GFP_KERNEL, it can trigger reclaim, which can lead to a transaction commit, which in turn needs the device_list_mutex, so it can lead to a deadlock. That is a different problem for which this fix is also a solution.

Fix this by moving the actual allocation outside of the scrub lock, and then only take the lock once we're ready to actually assign them to the fs_info. We'll now have to clean up the workqueues in a few more places, so I've added a helper to do the refcount dance to safely free the workqueues.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
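The core of the fix is a common pattern: do the sleeping allocation with no locks held, publish the result under the lock only if no other thread got there first, and tear everything down through a refcounted put helper. Below is a minimal userspace sketch of that pattern, assuming pthreads and C11 atomics as stand-ins; the names (struct pool, pool_get, pool_put) are hypothetical and are not the kernel code, which is in the diff further down. The kernel side additionally uses refcount_dec_and_mutex_lock(), which only takes the mutex when the count actually drops to zero; the sketch simplifies that.

/*
 * Sketch only: "pool" stands in for the btrfs scrub workqueues, "lock" for
 * fs_info->scrub_lock.  Allocation and freeing happen with the lock dropped;
 * the lock is held just long enough to publish or unpublish the pointer.
 * Build with: cc -std=c11 -pthread
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct pool { int placeholder; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct pool *the_pool;
static atomic_int refcnt;

static int pool_get(void)
{
	struct pool *p;
	int old = atomic_load(&refcnt);

	/* Fast path, like refcount_inc_not_zero(): workers already exist. */
	while (old > 0)
		if (atomic_compare_exchange_weak(&refcnt, &old, old + 1))
			return 0;

	/* Slow path: allocate with no locks held (this may block). */
	p = calloc(1, sizeof(*p));
	if (!p)
		return -1;

	pthread_mutex_lock(&lock);
	if (atomic_load(&refcnt) == 0) {
		the_pool = p;                   /* we won the race: publish */
		atomic_store(&refcnt, 1);
		pthread_mutex_unlock(&lock);
		return 0;
	}
	atomic_fetch_add(&refcnt, 1);           /* another thread raced in */
	pthread_mutex_unlock(&lock);
	free(p);                                /* drop our unused copy, unlocked */
	return 0;
}

static void pool_put(void)
{
	struct pool *p = NULL;

	pthread_mutex_lock(&lock);
	if (atomic_fetch_sub(&refcnt, 1) == 1) {        /* last reference */
		p = the_pool;
		the_pool = NULL;
	}
	pthread_mutex_unlock(&lock);
	free(p);                                /* destruction also happens unlocked */
}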
1 parent 26cf893 commit 42a2fbd

1 file changed: +70 -52 lines changed


fs/btrfs/scrub.c

+70 -52
@@ -3783,50 +3783,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 	return 0;
 }
 
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+					&fs_info->scrub_lock)) {
+		struct btrfs_workqueue *scrub_workers = NULL;
+		struct btrfs_workqueue *scrub_wr_comp = NULL;
+		struct btrfs_workqueue *scrub_parity = NULL;
+
+		scrub_workers = fs_info->scrub_workers;
+		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+		scrub_parity = fs_info->scrub_parity_workers;
+
+		fs_info->scrub_workers = NULL;
+		fs_info->scrub_wr_completion_workers = NULL;
+		fs_info->scrub_parity_workers = NULL;
+		mutex_unlock(&fs_info->scrub_lock);
+
+		btrfs_destroy_workqueue(scrub_workers);
+		btrfs_destroy_workqueue(scrub_wr_comp);
+		btrfs_destroy_workqueue(scrub_parity);
+	}
+}
+
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
+	struct btrfs_workqueue *scrub_workers = NULL;
+	struct btrfs_workqueue *scrub_wr_comp = NULL;
+	struct btrfs_workqueue *scrub_parity = NULL;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
+	int ret = -ENOMEM;
 
-	lockdep_assert_held(&fs_info->scrub_lock);
+	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+		return 0;
 
-	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
-		ASSERT(fs_info->scrub_workers == NULL);
-		fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
-				flags, is_dev_replace ? 1 : max_active, 4);
-		if (!fs_info->scrub_workers)
-			goto fail_scrub_workers;
-
-		ASSERT(fs_info->scrub_wr_completion_workers == NULL);
-		fs_info->scrub_wr_completion_workers =
-			btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
-					      max_active, 2);
-		if (!fs_info->scrub_wr_completion_workers)
-			goto fail_scrub_wr_completion_workers;
+	scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
					      is_dev_replace ? 1 : max_active, 4);
+	if (!scrub_workers)
+		goto fail_scrub_workers;
 
-		ASSERT(fs_info->scrub_parity_workers == NULL);
-		fs_info->scrub_parity_workers =
-			btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+	scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
 					      max_active, 2);
-		if (!fs_info->scrub_parity_workers)
-			goto fail_scrub_parity_workers;
+	if (!scrub_wr_comp)
+		goto fail_scrub_wr_completion_workers;
 
+	scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+					     max_active, 2);
+	if (!scrub_parity)
+		goto fail_scrub_parity_workers;
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+		ASSERT(fs_info->scrub_workers == NULL &&
+		       fs_info->scrub_wr_completion_workers == NULL &&
+		       fs_info->scrub_parity_workers == NULL);
+		fs_info->scrub_workers = scrub_workers;
+		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+		fs_info->scrub_parity_workers = scrub_parity;
 		refcount_set(&fs_info->scrub_workers_refcnt, 1);
-	} else {
-		refcount_inc(&fs_info->scrub_workers_refcnt);
+		mutex_unlock(&fs_info->scrub_lock);
+		return 0;
 	}
-	return 0;
+	/* Other thread raced in and created the workers for us */
+	refcount_inc(&fs_info->scrub_workers_refcnt);
+	mutex_unlock(&fs_info->scrub_lock);
 
+	ret = 0;
+	btrfs_destroy_workqueue(scrub_parity);
 fail_scrub_parity_workers:
-	btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+	btrfs_destroy_workqueue(scrub_wr_comp);
 fail_scrub_wr_completion_workers:
-	btrfs_destroy_workqueue(fs_info->scrub_workers);
+	btrfs_destroy_workqueue(scrub_workers);
 fail_scrub_workers:
-	return -ENOMEM;
+	return ret;
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3837,9 +3871,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	int ret;
 	struct btrfs_device *dev;
 	unsigned int nofs_flag;
-	struct btrfs_workqueue *scrub_workers = NULL;
-	struct btrfs_workqueue *scrub_wr_comp = NULL;
-	struct btrfs_workqueue *scrub_parity = NULL;
 
 	if (btrfs_fs_closing(fs_info))
 		return -EAGAIN;
@@ -3886,13 +3917,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	if (IS_ERR(sctx))
 		return PTR_ERR(sctx);
 
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret)
+		goto out_free_ctx;
+
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
 		     !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -ENODEV;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	if (!is_dev_replace && !readonly &&
@@ -3901,7 +3936,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
 				 rcu_str_deref(dev->name));
 		ret = -EROFS;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
@@ -3910,7 +3945,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -EIO;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	down_read(&fs_info->dev_replace.rwsem);
@@ -3921,17 +3956,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -EINPROGRESS;
-		goto out_free_ctx;
+		goto out;
 	}
 	up_read(&fs_info->dev_replace.rwsem);
 
-	ret = scrub_workers_get(fs_info, is_dev_replace);
-	if (ret) {
-		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		goto out_free_ctx;
-	}
-
 	sctx->readonly = readonly;
 	dev->scrub_ctx = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3984,24 +4012,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_ctx = NULL;
-	if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
-		scrub_workers = fs_info->scrub_workers;
-		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
-		scrub_parity = fs_info->scrub_parity_workers;
-
-		fs_info->scrub_workers = NULL;
-		fs_info->scrub_wr_completion_workers = NULL;
-		fs_info->scrub_parity_workers = NULL;
-	}
 	mutex_unlock(&fs_info->scrub_lock);
 
-	btrfs_destroy_workqueue(scrub_workers);
-	btrfs_destroy_workqueue(scrub_wr_comp);
-	btrfs_destroy_workqueue(scrub_parity);
+	scrub_workers_put(fs_info);
 	scrub_put_ctx(sctx);
 
 	return ret;
-
+out:
+	scrub_workers_put(fs_info);
 out_free_ctx:
 	scrub_free_ctx(sctx);
40074025
