
Commit 7a726f7

sched_ext: Synchronize bypass state changes with rq lock
While the BPF scheduler is being unloaded, the following warning message sometimes triggers:

  NOHZ tick-stop error: local softirq work is pending, handler #80!!!

This is caused by the CPU entering idle while there are pending softirqs. The main culprit is the bypassing state assertion not being synchronized with rq operations. As the BPF scheduler cannot be trusted in the disable path, the first step is entering bypass mode, where the BPF scheduler is ignored and scheduling becomes global FIFO. This is implemented by turning scx_ops_bypassing() true. However, the transition isn't synchronized against anything, and it's possible for the enqueue and dispatch paths to have different ideas on whether bypass mode is on.

Make each rq track its own bypass state with SCX_RQ_BYPASSING, which is modified while the rq is locked. This removes most of the NOHZ tick-stop messages but not all of them. I believe the stragglers are from the sched core bug where pick_task_scx() can be called without a preceding balance_scx(). Once that bug is fixed, we should verify that all occurrences of this error message are gone too.

v2: The scx_enabled() test is moved inside the for_each_possible_cpu() loop so that the per-cpu states are always synchronized with the global state.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: David Vernet <void@manifault.com>
(cherry picked from commit 750a40d)
1 parent 939aaa9 commit 7a726f7
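
Below is a minimal userspace sketch of the locking pattern this change introduces, for readers who want to see it outside the kernel tree: the bypass bit lives in each per-CPU runqueue and is only flipped or read while that runqueue's lock is held, and the enabled test sits inside the per-CPU loop (the v2 change). This is an illustration under stated assumptions, not kernel code: a pthread mutex stands in for the rq lock, and every demo_* name and DEMO_* constant is made up for the example.

/*
 * Userspace model of the per-rq bypass flag pattern. Not kernel code:
 * the pthread mutex stands in for the rq lock, and the demo_* names
 * are illustrative only. The point is that the flag is only flipped
 * and read while the per-rq lock is held, so enqueue/dispatch-style
 * readers can never disagree with the writer about the bypass state.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_CPUS		4
#define DEMO_RQ_BYPASSING	(1 << 3)	/* mirrors SCX_RQ_BYPASSING */

struct demo_rq {
	pthread_mutex_t	lock;			/* stands in for the rq lock */
	unsigned int	flags;
};

static struct demo_rq demo_rqs[DEMO_NR_CPUS];
static bool demo_enabled = true;		/* stands in for scx_enabled() */

/* analogous to scx_rq_bypassing(); caller must hold rq->lock */
static bool demo_rq_bypassing(struct demo_rq *rq)
{
	return rq->flags & DEMO_RQ_BYPASSING;
}

/* analogous to the per-rq portion of scx_ops_bypass() */
static void demo_set_bypass(bool bypass)
{
	for (int cpu = 0; cpu < DEMO_NR_CPUS; cpu++) {
		struct demo_rq *rq = &demo_rqs[cpu];

		pthread_mutex_lock(&rq->lock);

		if (bypass)
			rq->flags |= DEMO_RQ_BYPASSING;
		else
			rq->flags &= ~DEMO_RQ_BYPASSING;

		/*
		 * Mirrors the v2 note: the enabled test sits inside the
		 * per-CPU loop, under the same lock, so the per-rq state
		 * stays synchronized with the global state.
		 */
		if (!demo_enabled) {
			pthread_mutex_unlock(&rq->lock);
			continue;
		}

		/* ... re-queue this rq's runnable tasks here ... */
		pthread_mutex_unlock(&rq->lock);
	}
}

int main(void)
{
	for (int cpu = 0; cpu < DEMO_NR_CPUS; cpu++)
		pthread_mutex_init(&demo_rqs[cpu].lock, NULL);

	demo_set_bypass(true);

	/* a reader takes the same per-rq lock and sees a consistent value */
	pthread_mutex_lock(&demo_rqs[0].lock);
	printf("cpu0 bypassing: %d\n", demo_rq_bypassing(&demo_rqs[0]));
	pthread_mutex_unlock(&demo_rqs[0].lock);

	demo_set_bypass(false);
	return 0;
}

Build with cc -pthread. The real scx_ops_bypass(), as the hunk below shows, additionally re-queues each rq's runnable tasks inside the same locked region so already-queued tasks follow the new bypass state.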

2 files changed, +36 -25 lines

kernel/sched/ext.c

+35 -25

@@ -1414,9 +1414,9 @@ static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
 	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
 }
 
-static bool scx_ops_bypassing(void)
+static bool scx_rq_bypassing(struct rq *rq)
 {
-	return unlikely(atomic_read(&scx_ops_bypass_depth));
+	return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
 }
 
 /**
@@ -1951,7 +1951,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (!scx_rq_online(rq))
 		goto local;
 
-	if (scx_ops_bypassing()) {
+	if (scx_rq_bypassing(rq)) {
 		if (enq_flags & SCX_ENQ_LAST)
 			goto local;
 		else
@@ -2629,7 +2629,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local)
 	 * same conditions later and pick @rq->curr accordingly.
 	 */
 	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-	    prev->scx.slice && !scx_ops_bypassing()) {
+	    prev->scx.slice && !scx_rq_bypassing(rq)) {
 		if (local)
 			prev->scx.flags |= SCX_TASK_BAL_KEEP;
 		goto has_tasks;
@@ -2643,7 +2643,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev, bool local)
 	if (consume_dispatch_q(rq, &scx_dsq_global))
 		goto has_tasks;
 
-	if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
+	if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
 		goto out;
 
 	dspc->rq = rq;
@@ -2884,7 +2884,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
 	 * scheduler class or core-sched forcing a different task. Leave
 	 * it at the head of the local DSQ.
 	 */
-	if (p->scx.slice && !scx_ops_bypassing()) {
+	if (p->scx.slice && !scx_rq_bypassing(rq)) {
 		dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
 		return;
 	}
@@ -2922,7 +2922,7 @@ static struct task_struct *pick_next_task_scx(struct rq *rq)
 	set_next_task_scx(rq, p, true);
 
 	if (unlikely(!p->scx.slice)) {
-		if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
+		if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
 			printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
 					p->comm, p->pid);
 			scx_warned_zero_slice = true;
@@ -2959,7 +2959,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 	 * calling ops.core_sched_before(). Accesses are controlled by the
 	 * verifier.
 	 */
-	if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
+	if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
 		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
 					      (struct task_struct *)a,
 					      (struct task_struct *)b);
@@ -3361,7 +3361,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 	 * While disabling, always resched and refresh core-sched timestamp as
 	 * we can't trust the slice management or ops.core_sched_before().
 	 */
-	if (scx_ops_bypassing()) {
+	if (scx_rq_bypassing(rq)) {
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(tick)) {
@@ -3700,7 +3700,7 @@ bool scx_can_stop_tick(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
 
-	if (scx_ops_bypassing())
+	if (scx_rq_bypassing(rq))
 		return false;
 
 	if (p->sched_class != &ext_sched_class)
@@ -4297,17 +4297,9 @@ static void scx_ops_bypass(bool bypass)
 		return;
 	}
 
-	/*
-	 * We need to guarantee that no tasks are on the BPF scheduler while
-	 * bypassing. Either we see enabled or the enable path sees the
-	 * increased bypass_depth before moving tasks to SCX.
-	 */
-	if (!scx_enabled())
-		return;
-
 	/*
 	 * No task property is changing. We just need to make sure all currently
-	 * queued tasks are re-queued according to the new scx_ops_bypassing()
+	 * queued tasks are re-queued according to the new scx_rq_bypassing()
	 * state. As an optimization, walk each rq's runnable_list instead of
 	 * the scx_tasks list.
 	 *
@@ -4321,6 +4313,24 @@ static void scx_ops_bypass(bool bypass)
 
 		rq_lock_irqsave(rq, &rf);
 
+		if (bypass) {
+			WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+			rq->scx.flags |= SCX_RQ_BYPASSING;
+		} else {
+			WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
+			rq->scx.flags &= ~SCX_RQ_BYPASSING;
+		}
+
+		/*
+		 * We need to guarantee that no tasks are on the BPF scheduler
+		 * while bypassing. Either we see enabled or the enable path
+		 * sees scx_rq_bypassing() before moving tasks to SCX.
+		 */
+		if (!scx_enabled()) {
+			rq_unlock_irqrestore(rq, &rf);
+			continue;
+		}
+
 		/*
 		 * The use of list_for_each_entry_safe_reverse() is required
 		 * because each task is going to be removed from and added back
@@ -6438,17 +6448,17 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
 	if (!ops_cpu_valid(cpu, NULL))
 		return;
 
+	local_irq_save(irq_flags);
+
+	this_rq = this_rq();
+
 	/*
 	 * While bypassing for PM ops, IRQ handling may not be online which can
 	 * lead to irq_work_queue() malfunction such as infinite busy wait for
 	 * IRQ status update. Suppress kicking.
 	 */
-	if (scx_ops_bypassing())
-		return;
-
-	local_irq_save(irq_flags);
-
-	this_rq = this_rq();
+	if (scx_rq_bypassing(this_rq))
+		goto out;
 
 	/*
 	 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting

kernel/sched/sched.h

+1 -0

@@ -750,6 +750,7 @@ enum scx_rq_flags {
 	 */
 	SCX_RQ_ONLINE		= 1 << 0,
 	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
+	SCX_RQ_BYPASSING	= 1 << 3,
 
 	SCX_RQ_IN_WAKEUP	= 1 << 16,
 	SCX_RQ_IN_BALANCE	= 1 << 17,
