@@ -9,10 +9,15 @@
 #define SCX_OP_IDX(op)	(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
 
 enum scx_internal_consts {
-	SCX_NR_ONLINE_OPS	= SCX_OP_IDX(init),
-	SCX_DSP_DFL_MAX_BATCH	= 32,
-	SCX_DSP_MAX_LOOPS	= 32,
-	SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+	SCX_OPI_BEGIN			= 0,
+	SCX_OPI_NORMAL_BEGIN		= 0,
+	SCX_OPI_NORMAL_END		= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_BEGIN	= SCX_OP_IDX(cpu_online),
+	SCX_OPI_CPU_HOTPLUG_END		= SCX_OP_IDX(init),
+	SCX_OPI_END			= SCX_OP_IDX(init),
+	SCX_DSP_DFL_MAX_BATCH		= 32,
+	SCX_DSP_MAX_LOOPS		= 32,
+	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
 };
 
 enum scx_ops_enable_state {
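Aside: SCX_OP_IDX() turns a callback's offset within struct sched_ext_ops into an array index by dividing by the size of one function pointer, so the new SCX_OPI_* constants simply name index ranges: [SCX_OPI_NORMAL_BEGIN, SCX_OPI_NORMAL_END) for the regular ops and [SCX_OPI_CPU_HOTPLUG_BEGIN, SCX_OPI_CPU_HOTPLUG_END) for the hotplug ops. A minimal userspace sketch of the same trick, using a hypothetical toy_ops layout rather than the real sched_ext_ops:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for sched_ext_ops: a table of same-sized function
 * pointers with "normal" ops first, hotplug ops next, init/exit last. */
struct toy_ops {
	void (*select_cpu)(void);
	void (*enqueue)(void);
	void (*cpu_online)(void);	/* hotplug range starts here */
	void (*cpu_offline)(void);
	void (*init)(void);		/* hotplug range ends here */
	void (*exit)(void);
};

#define TOY_OP_IDX(op)	(offsetof(struct toy_ops, op) / sizeof(void (*)(void)))

int main(void)
{
	/* Mirrors SCX_OPI_NORMAL_END and SCX_OPI_CPU_HOTPLUG_END above. */
	printf("normal ops:  [0, %zu)\n", TOY_OP_IDX(cpu_online));
	printf("hotplug ops: [%zu, %zu)\n",
	       TOY_OP_IDX(cpu_online), TOY_OP_IDX(init));
	return 0;
}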
@@ -104,8 +109,8 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
 DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
 
-struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] =
-	{ [0 ... SCX_NR_ONLINE_OPS - 1] = STATIC_KEY_FALSE_INIT };
+struct static_key_false scx_has_op[SCX_OPI_END] =
+	{ [0 ... SCX_OPI_END - 1] = STATIC_KEY_FALSE_INIT };
 
 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
 static struct scx_exit_info scx_exit_info;
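Aside: the `[0 ... SCX_OPI_END - 1] = STATIC_KEY_FALSE_INIT` line is a GCC/Clang range designated initializer, so every slot of scx_has_op starts out as a disabled static key. A small userspace sketch of the construct (the array name and values here are arbitrary placeholders):

#include <stdio.h>

#define NR_OPS 6

/* Range designated initializer (GNU C extension, also accepted by Clang):
 * initializes slots 0..NR_OPS-1 to the same value in one go. */
static int has_op[NR_OPS] = { [0 ... NR_OPS - 1] = 1 };

int main(void)
{
	for (int i = 0; i < NR_OPS; i++)
		printf("has_op[%d] = %d\n", i, has_op[i]);
	return 0;
}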
@@ -3196,9 +3201,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	static_branch_disable(&__scx_switched_all);
 	WRITE_ONCE(scx_switching_all, false);
 
-	/* avoid racing against fork and cgroup changes */
-	cpus_read_lock();
+	/*
+	 * Avoid racing against fork and cgroup changes. See scx_ops_enable()
+	 * for explanation on the locking order.
+	 */
 	percpu_down_write(&scx_fork_rwsem);
+	cpus_read_lock();
 	scx_cgroup_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
@@ -3228,7 +3236,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable_cpuslocked(&__scx_ops_enabled);
-	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
+	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
 		static_branch_disable_cpuslocked(&scx_has_op[i]);
 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
@@ -3239,8 +3247,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	scx_cgroup_exit();
 
 	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	cpus_read_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3373,13 +3381,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			   scx_create_rt_helper("sched_ext_ops_helper"));
 		if (!scx_ops_helper) {
 			ret = -ENOMEM;
-			goto err_unlock;
+			goto err;
 		}
 	}
 
 	if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
 		ret = -EBUSY;
-		goto err_unlock;
+		goto err;
 	}
 
 	/*
@@ -3408,7 +3416,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		ret = SCX_CALL_OP_RET(SCX_KF_INIT, init);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
-			goto err_disable;
+			goto err_disable_unlock_cpus;
 		}
 
 		/*
@@ -3420,9 +3428,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		 * ops.exit() like other scx_bpf_error() invocations.
 		 */
 		if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE)
-			goto err_disable;
+			goto err_disable_unlock_cpus;
 	}
 
+	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+		if (((void (**)(void))ops)[i])
+			static_branch_enable_cpuslocked(&scx_has_op[i]);
+
+	cpus_read_unlock();
+
 	ret = validate_ops(ops);
 	if (ret)
 		goto err_disable;
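Aside: the `((void (**)(void))ops)[i]` test above relies on struct sched_ext_ops being laid out as consecutive, same-sized function pointers, so the ops struct can be scanned as a flat array to find which callbacks the loaded scheduler implements; each implemented hotplug op gets its scx_has_op static branch enabled while the CPUs are read-locked. A standalone sketch of that scan under a hypothetical toy_ops layout (the kernel builds with -fno-strict-aliasing, which this kind of pointer punning leans on):

#include <stdio.h>
#include <stddef.h>

struct toy_ops {
	void (*enqueue)(void);
	void (*dispatch)(void);
	void (*cpu_online)(void);
	void (*init)(void);
};

#define TOY_OP_IDX(op)	(offsetof(struct toy_ops, op) / sizeof(void (*)(void)))

static void my_enqueue(void) { }

int main(void)
{
	struct toy_ops ops = { .enqueue = my_enqueue };	/* only one op set */

	/* Scan the struct as a flat table of function pointers, the way the
	 * loops over the SCX_OPI_* ranges walk the BPF scheduler's ops. */
	for (size_t i = 0; i < TOY_OP_IDX(init); i++)
		if (((void (**)(void))&ops)[i])
			printf("op %zu is implemented\n", i);
	return 0;
}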
@@ -3449,11 +3463,26 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
 	 * floodgate so that they don't wander into the operations prematurely.
+	 *
+	 * We don't need to keep the CPUs stable but static_branch_*() requires
+	 * cpus_read_lock() and scx_cgroup_rwsem must nest inside
+	 * cpu_hotplug_lock because of the following dependency chain:
+	 *
+	 *   cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
+	 *
+	 * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use
+	 * static_branch_*_cpuslocked().
+	 *
+	 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
+	 * following dependency chain:
+	 *
+	 *   scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
 	 */
 	percpu_down_write(&scx_fork_rwsem);
+	cpus_read_lock();
 	scx_cgroup_lock();
 
-	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
+	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
 
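Aside: the comment above pins down one global acquisition order, scx_fork_rwsem, then cpu_hotplug_lock (via cpus_read_lock()), then scx_cgroup_rwsem, and the disable path in scx_ops_disable_workfn() now takes them in the same order. A rough userspace analogue of that rule, with pthread rwlocks standing in for the kernel primitives (the names and types here are illustrative only, not the real locks):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t fork_rwsem   = PTHREAD_RWLOCK_INITIALIZER;	/* ~ scx_fork_rwsem */
static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;	/* ~ cpu_hotplug_lock */
static pthread_rwlock_t cgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;	/* ~ scx_cgroup_rwsem */

/* Every path that needs several of these locks takes them in the same
 * global order, which is what rules out ABBA deadlocks between the
 * enable and disable paths. */
static void with_all_locks(const char *who)
{
	pthread_rwlock_wrlock(&fork_rwsem);	/* percpu_down_write() */
	pthread_rwlock_rdlock(&hotplug_lock);	/* cpus_read_lock() */
	pthread_rwlock_wrlock(&cgroup_rwsem);	/* scx_cgroup_lock() */

	printf("%s: holding all three in the agreed order\n", who);

	pthread_rwlock_unlock(&cgroup_rwsem);	/* release in reverse order */
	pthread_rwlock_unlock(&hotplug_lock);
	pthread_rwlock_unlock(&fork_rwsem);
}

int main(void)
{
	with_all_locks("enable path");
	with_all_locks("disable path");
	return 0;
}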
@@ -3478,7 +3507,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable_unlock;
+		goto err_disable_unlock_all;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3504,7 +3533,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			spin_unlock_irq(&scx_tasks_lock);
 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 			       ret, p->comm, p->pid);
-			goto err_disable_unlock;
+			goto err_disable_unlock_all;
 		}
 
 		put_task_struct(p);
@@ -3528,7 +3557,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable_unlock;
+		goto err_disable_unlock_all;
 	}
 
 	/*
@@ -3563,6 +3592,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
 	scx_cgroup_unlock();
+	cpus_read_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
@@ -3571,24 +3601,24 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	}
 
 	if (scx_switch_all_req)
-		static_branch_enable_cpuslocked(&__scx_switched_all);
+		static_branch_enable(&__scx_switched_all);
 
-	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
 
 	return 0;
 
-err_unlock:
+err:
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable_unlock:
+err_disable_unlock_all:
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
-err_disable:
+err_disable_unlock_cpus:
 	cpus_read_unlock();
+err_disable:
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
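Aside: the reshuffled error labels keep the usual goto-unwind shape: the deeper into setup a failure occurs, the earlier the label it jumps to, and the labels fall through so each one releases one more level of state (err_disable_unlock_all drops the cgroup and fork locks, falls into err_disable_unlock_cpus which drops the CPU hotplug read lock, which falls into err_disable). A compact sketch of the idiom with placeholder steps, not the actual kernel locks:

#include <stdio.h>

/* fail_at picks which step "fails"; the prints stand in for real
 * lock/unlock and enable/disable work. Labels fall through so later
 * failures undo progressively more state, in reverse order. */
static int enable(int fail_at)
{
	int ret = 0;

	printf("take mutex\n");
	if (fail_at == 0) {
		ret = -1;
		goto err;		/* only the mutex to undo */
	}

	printf("cpus_read_lock\n");
	if (fail_at == 1) {
		ret = -1;
		goto err_unlock_cpus;
	}

	printf("take fork_rwsem and cgroup lock\n");
	if (fail_at == 2) {
		ret = -1;
		goto err_unlock_all;
	}

	printf("enabled\n");
	return 0;

err_unlock_all:
	printf("drop cgroup lock and fork_rwsem\n");
	/* fall through */
err_unlock_cpus:
	printf("cpus_read_unlock\n");
	/* fall through */
err:
	printf("drop mutex\n");
	return ret;
}

int main(void)
{
	enable(2);
	return 0;
}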