sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
smp_cond_load_acquire() until the target CPU's kick_sync advances. However,
the irq_work runs in hardirq context, so the waiting CPU can never enter its
own scheduling path, which is the only place kick_sync is advanced, and its
own kick_sync therefore never moves. If multiple CPUs end up waiting on each
other, they form a cycle and all of them deadlock.
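For example, two CPUs kicking each other with SCX_KICK_WAIT at the same time
spin forever in hardirq (illustrative scenario):

  CPU0: kick_cpus_irq_workfn()           CPU1: kick_cpus_irq_workfn()
    smp_cond_load_acquire() on             smp_cond_load_acquire() on
    CPU1's rq->scx.kick_sync               CPU0's rq->scx.kick_sync
    (hardirq: CPU0 can't schedule,         (hardirq: CPU1 can't schedule,
     its kick_sync never moves)             its kick_sync never moves)

Neither wait condition can ever become true and both CPUs are stuck.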
Instead of busy-waiting in kick_cpus_irq_workfn(), record the CPUs to wait
for in cpus_to_sync, set kick_sync_pending and resched_curr() the local CPU
so that it passes through do_pick_task_scx(), which queues a balance
callback, kick_sync_wait_bal_cb(), to perform the wait. The balance callback
drops the rq lock and enables IRQs while waiting, following the
sched_core_balance() pattern, so the waiting CPU can still process IPIs. The
local CPU's kick_sync is advanced on entry to do_pick_task_scx() and kept
advancing throughout the wait, so any CPU that starts waiting on us sees the
advancement and cyclic dependencies cannot form.
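Roughly, the resulting flow on a kicking CPU that requested SCX_KICK_WAIT is:

  kick_cpus_irq_workfn()      hardirq: record targets in cpus_to_sync,
                              set kick_sync_pending, resched_curr()
  do_pick_task_scx()          rq lock held: advance own kick_sync,
                              queue kick_sync_wait_bal_cb()
  kick_sync_wait_bal_cb()     balance callback: drop rq lock, enable IRQs,
                              wait for each target's kick_sync to advance
                              while advancing our own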
Fixes: 90e55164da ("sched_ext: Implement SCX_KICK_WAIT")
Cc: stable@vger.kernel.org # v6.12+
Reported-by: Christian Loehle <christian.loehle@arm.com>
Link: https://lore.kernel.org/r/20260316100249.1651641-1-christian.loehle@arm.com
Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Christian Loehle <christian.loehle@arm.com>
commit 415cb193bb
parent db08b1940f
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 {
         struct scx_sched *sch = scx_root;
 
-        /* see kick_cpus_irq_workfn() */
+        /* see kick_sync_wait_bal_cb() */
         smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
         update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ switch_class:
         switch_class(rq, next);
 }
 
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+        struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+        unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+        bool waited;
+        s32 cpu;
+
+        /*
+         * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
+         * - a target CPU may be waiting for us to process an IPI (e.g. TLB
+         * flush) while we wait for its kick_sync to advance.
+         *
+         * Also, keep advancing our own kick_sync so that new kick_sync waits
+         * targeting us, which can start after we drop the lock, cannot form
+         * cyclic dependencies.
+         */
+retry:
+        waited = false;
+        for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+                /*
+                 * smp_load_acquire() pairs with smp_store_release() on
+                 * kick_sync updates on the target CPUs.
+                 */
+                if (cpu == cpu_of(rq) ||
+                    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+                        cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+                        continue;
+                }
+
+                raw_spin_rq_unlock_irq(rq);
+                while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+                        smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+                        cpu_relax();
+                }
+                raw_spin_rq_lock_irq(rq);
+                waited = true;
+        }
+
+        if (waited)
+                goto retry;
+}
+
 static struct task_struct *first_local_task(struct rq *rq)
 {
         return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
         bool keep_prev;
         struct task_struct *p;
 
-        /* see kick_cpus_irq_workfn() */
+        /* see kick_sync_wait_bal_cb() */
         smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
         rq_modified_begin(rq, &ext_sched_class);
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
         rq_repin_lock(rq, rf);
         maybe_queue_balance_callback(rq);
 
+        /*
+         * Defer to a balance callback which can drop rq lock and enable
+         * IRQs. Waiting directly in the pick path would deadlock against
+         * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+         */
+        if (unlikely(rq->scx.kick_sync_pending)) {
+                rq->scx.kick_sync_pending = false;
+                queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+                                       kick_sync_wait_bal_cb);
+        }
+
         /*
          * If any higher-priority sched class enqueued a runnable task on
          * this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
                 if (!cpumask_empty(rq->scx.cpus_to_wait))
                         dump_line(&ns, "  cpus_to_wait   : %*pb",
                                   cpumask_pr_args(rq->scx.cpus_to_wait));
+                if (!cpumask_empty(rq->scx.cpus_to_sync))
+                        dump_line(&ns, "  cpus_to_sync   : %*pb",
+                                  cpumask_pr_args(rq->scx.cpus_to_sync));
 
                 used = seq_buf_used(&ns);
                 if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
 
         if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
                 if (cur_class == &ext_sched_class) {
+                        cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
                         ksyncs[cpu] = rq->scx.kick_sync;
                         should_wait = true;
-                } else {
-                        cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
                 }
+                cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
         }
 
         resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
                 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
         }
 
-        if (!should_wait)
-                return;
-
-        for_each_cpu(cpu, this_scx->cpus_to_wait) {
-                unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
-                /*
-                 * Busy-wait until the task running at the time of kicking is no
-                 * longer running. This can be used to implement e.g. core
-                 * scheduling.
-                 *
-                 * smp_cond_load_acquire() pairs with store_releases in
-                 * pick_task_scx() and put_prev_task_scx(). The former breaks
-                 * the wait if SCX's scheduling path is entered even if the same
-                 * task is picked subsequently. The latter is necessary to break
-                 * the wait when $cpu is taken by a higher sched class.
-                 */
-                if (cpu != cpu_of(this_rq))
-                        smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
-                cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
-        }
+        /*
+         * Can't wait in hardirq - kick_sync can't advance, deadlocking if
+         * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
+         */
+        if (should_wait) {
+                raw_spin_rq_lock(this_rq);
+                this_scx->kick_sync_pending = true;
+                resched_curr(this_rq);
+                raw_spin_rq_unlock(this_rq);
+        }
 }
@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
                 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
                 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
                 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
                 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
                 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
@@ -805,9 +805,12 @@ struct scx_rq {
         cpumask_var_t           cpus_to_kick_if_idle;
         cpumask_var_t           cpus_to_preempt;
         cpumask_var_t           cpus_to_wait;
+        cpumask_var_t           cpus_to_sync;
+        bool                    kick_sync_pending;
         unsigned long           kick_sync;
         local_t                 reenq_local_deferred;
         struct balance_callback deferred_bal_cb;
+        struct balance_callback kick_sync_bal_cb;
         struct irq_work         deferred_irq_work;
         struct irq_work         kick_cpus_irq_work;
         struct scx_dispatch_q   bypass_dsq;