Commit 4361d23

Merge pull request torvalds#119 from sched-ext/htejun

scx: Fix locking order

2 parents 7592388 + 1225a90

2 files changed: +76 −42 lines changed

include/linux/sched/ext.h (+23 −19)
@@ -412,24 +412,6 @@ struct sched_ext_ops {
 	 */
 	void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
 
-	/**
-	 * cpu_online - A CPU became online
-	 * @cpu: CPU which just came up
-	 *
-	 * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
-	 * associated with other CPUs beforehand.
-	 */
-	void (*cpu_online)(s32 cpu);
-
-	/**
-	 * cpu_offline - A CPU is going offline
-	 * @cpu: CPU which is going offline
-	 *
-	 * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
-	 * associated with other CPUs afterwards.
-	 */
-	void (*cpu_offline)(s32 cpu);
-
 	/**
 	 * init_task - Initialize a task to run in a BPF scheduler
 	 * @p: task to initialize for BPF scheduling
@@ -550,7 +532,29 @@ struct sched_ext_ops {
 #endif	/* CONFIG_CGROUPS */
 
 	/*
-	 * All online ops must come before ops.init().
+	 * All online ops must come before ops.cpu_online().
+	 */
+
+	/**
+	 * cpu_online - A CPU became online
+	 * @cpu: CPU which just came up
+	 *
+	 * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
+	 * associated with other CPUs beforehand.
+	 */
+	void (*cpu_online)(s32 cpu);
+
+	/**
+	 * cpu_offline - A CPU is going offline
+	 * @cpu: CPU which is going offline
+	 *
+	 * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
+	 * associated with other CPUs afterwards.
+	 */
+	void (*cpu_offline)(s32 cpu);
+
+	/*
+	 * All CPU hotplug ops must come before ops.init().
 	 */
 
 	/**
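The reordering above groups the CPU hotplug callbacks after the cgroup ops, which is what lets kernel/sched/ext.c below carve the ops table into contiguous index ranges (SCX_OPI_NORMAL_* and SCX_OPI_CPU_HOTPLUG_*). The standalone sketch that follows uses hypothetical names (struct demo_ops, DEMO_OP_IDX) to illustrate the offsetof-based indexing trick that SCX_OP_IDX() relies on; it is an illustration under the assumption that all members are same-sized function pointers, not the kernel code itself.

/*
 * Minimal userspace sketch (hypothetical names, not the kernel code): a struct
 * of function pointers treated as an indexable table. The index ranges are
 * only contiguous because related callbacks are declared next to each other,
 * which is exactly why the header above moves cpu_online/cpu_offline.
 */
#include <stddef.h>
#include <stdio.h>

struct demo_ops {
	void (*enqueue)(int cpu);
	void (*dispatch)(int cpu);
	void (*cpu_online)(int cpu);	/* hotplug ops grouped together ... */
	void (*cpu_offline)(int cpu);	/* ... so their index range is contiguous */
	int  (*init)(void);
};

/* Member offset divided by pointer size gives the member's table index. */
#define DEMO_OP_IDX(op)	(offsetof(struct demo_ops, op) / sizeof(void (*)(void)))

enum {
	DEMO_OPI_NORMAL_BEGIN      = 0,
	DEMO_OPI_NORMAL_END        = DEMO_OP_IDX(cpu_online),
	DEMO_OPI_CPU_HOTPLUG_BEGIN = DEMO_OP_IDX(cpu_online),
	DEMO_OPI_CPU_HOTPLUG_END   = DEMO_OP_IDX(init),
};

int main(void)
{
	/* The ranges partition the table only if the declaration order matches. */
	printf("normal ops: [%d, %d), hotplug ops: [%d, %d)\n",
	       DEMO_OPI_NORMAL_BEGIN, DEMO_OPI_NORMAL_END,
	       DEMO_OPI_CPU_HOTPLUG_BEGIN, DEMO_OPI_CPU_HOTPLUG_END);
	return 0;
}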

kernel/sched/ext.c (+53 −23)
@@ -9,10 +9,15 @@
 #define SCX_OP_IDX(op)	(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
 
 enum scx_internal_consts {
-	SCX_NR_ONLINE_OPS	= SCX_OP_IDX(init),
-	SCX_DSP_DFL_MAX_BATCH	= 32,
-	SCX_DSP_MAX_LOOPS	= 32,
-	SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+	SCX_OPI_BEGIN		= 0,
+	SCX_OPI_NORMAL_BEGIN	= 0,
+	SCX_OPI_NORMAL_END	= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_END	= SCX_OP_IDX(init),
+	SCX_OPI_END		= SCX_OP_IDX(init),
+	SCX_DSP_DFL_MAX_BATCH	= 32,
+	SCX_DSP_MAX_LOOPS	= 32,
+	SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
 };
 
 enum scx_ops_enable_state {
@@ -104,8 +109,8 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
-struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] =
-	{ [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT };
+struct static_key_false scx_has_op[SCX_OPI_END] =
+	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
 
 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
 static struct scx_exit_info scx_exit_info;
@@ -3196,9 +3201,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	static_branch_disable(&__scx_switched_all);
 	WRITE_ONCE(scx_switching_all, false);
 
-	/* avoid racing against fork and cgroup changes */
-	cpus_read_lock();
+	/*
+	 * Avoid racing against fork and cgroup changes. See scx_ops_enable()
+	 * for explanation on the locking order.
+	 */
 	percpu_down_write(&scx_fork_rwsem);
+	cpus_read_lock();
 	scx_cgroup_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
@@ -3228,7 +3236,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable_cpuslocked(&__scx_ops_enabled);
-	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
+	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
 		static_branch_disable_cpuslocked(&scx_has_op[i]);
 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
@@ -3239,8 +3247,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	scx_cgroup_exit();
 
 	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	cpus_read_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3373,13 +3381,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			    scx_create_rt_helper("sched_ext_ops_helper"));
 		if (!scx_ops_helper) {
 			ret = -ENOMEM;
-			goto err_unlock;
+			goto err;
 		}
 	}
 
 	if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
 		ret = -EBUSY;
-		goto err_unlock;
+		goto err;
 	}
 
 	/*
@@ -3408,7 +3416,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		ret = SCX_CALL_OP_RET(SCX_KF_INIT, init);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
-			goto err_disable;
+			goto err_disable_unlock_cpus;
 		}
 
 		/*
@@ -3420,9 +3428,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		 * ops.exit() like other scx_bpf_error() invocations.
 		 */
 		if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE)
-			goto err_disable;
+			goto err_disable_unlock_cpus;
 	}
 
+	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+		if (((void (**)(void))ops)[i])
+			static_branch_enable_cpuslocked(&scx_has_op[i]);
+
+	cpus_read_unlock();
+
 	ret = validate_ops(ops);
 	if (ret)
 		goto err_disable;
@@ -3449,11 +3463,26 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
 	 * floodgate so that they don't wander into the operations prematurely.
+	 *
+	 * We don't need to keep the CPUs stable but static_branch_*() requires
+	 * cpus_read_lock() and scx_cgroup_rwsem must nest inside
+	 * cpu_hotplug_lock because of the following dependency chain:
+	 *
+	 *   cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
+	 *
+	 * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use
+	 * static_branch_*_cpuslocked().
+	 *
+	 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
+	 * following dependency chain:
+	 *
+	 *   scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
 	 */
 	percpu_down_write(&scx_fork_rwsem);
+	cpus_read_lock();
 	scx_cgroup_lock();
 
-	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
+	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
 
@@ -3478,7 +3507,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable_unlock;
+		goto err_disable_unlock_all;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3504,7 +3533,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			spin_unlock_irq(&scx_tasks_lock);
 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 			       ret, p->comm, p->pid);
-			goto err_disable_unlock;
+			goto err_disable_unlock_all;
 		}
 
 		put_task_struct(p);
@@ -3528,7 +3557,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable_unlock;
+		goto err_disable_unlock_all;
 	}
 
 	/*
@@ -3563,6 +3592,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
 	scx_cgroup_unlock();
+	cpus_read_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
@@ -3571,24 +3601,24 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	}
 
 	if (scx_switch_all_req)
-		static_branch_enable_cpuslocked(&__scx_switched_all);
+		static_branch_enable(&__scx_switched_all);
 
-	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
 
 	return 0;
 
-err_unlock:
+err:
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable_unlock:
+err_disable_unlock_all:
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
-err_disable:
+err_disable_unlock_cpus:
 	cpus_read_unlock();
+err_disable:
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
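For reference, the nesting order that both scx_ops_enable() and scx_ops_disable_workfn() now follow is scx_fork_rwsem, then cpu_hotplug_lock (via cpus_read_lock()), then scx_cgroup_rwsem, released in reverse. The userspace sketch below uses pthread rwlocks with stand-in names (fork_rwsem, hotplug_lock, cgroup_rwsem) purely to illustrate why every path that takes more than one of these locks must take them in the same order; it is not the kernel implementation.

/*
 * Hedged userspace analogue of the lock nesting fixed by this commit.
 * If one path took hotplug_lock before fork_rwsem while another did the
 * opposite, the two could deadlock ABBA-style; keeping a single global
 * order on both the enable and disable paths rules that out.
 */
#include <pthread.h>

static pthread_rwlock_t fork_rwsem   = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t cgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* Mirrors the order in scx_ops_enable(): outermost to innermost. */
static void enable_path(void)
{
	pthread_rwlock_wrlock(&fork_rwsem);	/* percpu_down_write(&scx_fork_rwsem) */
	pthread_rwlock_rdlock(&hotplug_lock);	/* cpus_read_lock() */
	pthread_rwlock_wrlock(&cgroup_rwsem);	/* scx_cgroup_lock() */

	/* ... enable static branches, init cgroups, switch tasks ... */

	pthread_rwlock_unlock(&cgroup_rwsem);	/* scx_cgroup_unlock() */
	pthread_rwlock_unlock(&hotplug_lock);	/* cpus_read_unlock() */
	pthread_rwlock_unlock(&fork_rwsem);	/* percpu_up_write(&scx_fork_rwsem) */
}

/* scx_ops_disable_workfn() uses the same order after this commit. */
static void disable_path(void)
{
	pthread_rwlock_wrlock(&fork_rwsem);
	pthread_rwlock_rdlock(&hotplug_lock);
	pthread_rwlock_wrlock(&cgroup_rwsem);

	/* ... disable static branches, exit cgroups ... */

	pthread_rwlock_unlock(&cgroup_rwsem);
	pthread_rwlock_unlock(&hotplug_lock);
	pthread_rwlock_unlock(&fork_rwsem);
}

int main(void)
{
	enable_path();
	disable_path();
	return 0;
}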
