From fcbc423577ce28b5e470a2a47fed6724430cccdb Mon Sep 17 00:00:00 2001 From: Vishal Chourasia Date: Wed, 2 Oct 2024 12:38:06 +0530 Subject: [PATCH 1/7] sched_ext: Add __weak markers to BPF helper function decalarations Fix build errors by adding __weak markers to BPF helper function declarations in header files. This resolves static assertion failures in scx_qmap.bpf.c and scx_flatcg.bpf.c where functions like scx_bpf_dispatch_from_dsq_set_slice, scx_bpf_dispatch_from_dsq_set_vtime, and scx_bpf_task_cgroup were missing the __weak attribute. [1] https://lore.kernel.org/all/ZvvfUqRNM4-jYQzH@linux.ibm.com Signed-off-by: Vishal Chourasia Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 225f61f9bfca..27749c51c3ec 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; -void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; @@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; -struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; /* * Use the following as @it__iter when calling From cc9877fb76771b7cbce6c9ec239f13a1d7759876 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Oct 2024 10:33:37 -1000 Subject: [PATCH 2/7] sched_ext: Improve error reporting during loading When the BPF scheduler fails, ops.exit() allows rich error reporting through scx_exit_info. Use scx.exit() path consistently for all failures which can be caused by the BPF scheduler: - scx_ops_error() is called after ops.init() and ops.cgroup_init() failure to record error information. - ops.init_task() failure now uses scx_ops_error() instead of pr_err(). - The err_disable path updated to automatically trigger scx_ops_error() to cover cases that the error message hasn't already been generated and always return 0 indicating init success so that the error is reported through ops.exit(). Signed-off-by: Tejun Heo Cc: David Vernet Cc: Daniel Hodges Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3cd7c50a51c5..76f04f3ace61 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -625,6 +625,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); @@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); goto err_disable; } } @@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) spin_lock_irq(&scx_tasks_lock); scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", - ret, p->comm, p->pid); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } @@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_ops_bypass(false); - /* - * Returning an error code here would lose the recorded error - * information. Exit indicating success so that the error is notified - * through ops.exit() with all the details. - */ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; goto err_disable; } @@ -5241,10 +5241,18 @@ err_disable_unlock_all: scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); - /* must be fully disabled before returning */ - scx_ops_disable(SCX_EXIT_ERROR); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. + */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); - return ret; + return 0; } From ec010333ce7cf3270ae7193a6724794d5a179625 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Oct 2024 10:34:38 -1000 Subject: [PATCH 3/7] sched_ext: scx_cgroup_exit() may be called without successful scx_cgroup_init() 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()") assumed that scx_cgroup_exit() is only called after scx_cgroup_init() finished successfully. This isn't true. scx_cgroup_exit() can be called without scx_cgroup_init() being called at all or after scx_cgroup_init() failed in the middle. As init state is tracked per cgroup, scx_cgroup_exit() can be used safely to clean up in all cases. Remove the incorrect WARN_ON_ONCE(). Signed-off-by: Tejun Heo Fixes: 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()") --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 76f04f3ace61..1df2082d1352 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4052,7 +4052,6 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); - WARN_ON_ONCE(!scx_cgroup_enabled); scx_cgroup_enabled = false; /* From b62933eee41e2909422c2c3d7fdb56217913faf9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:11 -1000 Subject: [PATCH 4/7] sched/core: Make select_task_rq() take the pointer to wake_flags instead of value This will be used to allow select_task_rq() to indicate whether ->select_task_rq() was called by modifying *wake_flags. This makes try_to_wake_up() call all functions that take wake_flags with WF_TTWU set. Previously, only select_task_rq() was. Using the same flags is more consistent, and, as the flag is only tested by ->select_task_rq() implementations, it doesn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 43e453ab7e20..e70b57a5693e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3518,12 +3518,12 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -4120,6 +4120,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4252,7 +4254,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); @@ -4793,6 +4795,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); @@ -4807,7 +4810,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -4815,7 +4818,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - wakeup_preempt(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* From f207dc2dcdcf0e1e7d260b392784855ce8d84147 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:12 -1000 Subject: [PATCH 5/7] sched/core: Add ENQUEUE_RQ_SELECTED to indicate whether ->select_task_rq() was called During ttwu, ->select_task_rq() can be skipped if only one CPU is allowed or migration is disabled. sched_ext schedulers may perform operations such as direct dispatch from ->select_task_rq() path and it is useful for them to know whether ->select_task_rq() was skipped in the ->enqueue_task() path. Currently, sched_ext schedulers are using ENQUEUE_WAKEUP for this purpose and end up assuming incorrectly that ->select_task_rq() was called for tasks that are bound to a single CPU or migration disabled. Make select_task_rq() indicate whether ->select_task_rq() was called by setting WF_RQ_SELECTED in *wake_flags and make ttwu_do_activate() map that to ENQUEUE_RQ_SELECTED for ->enqueue_task(). This will be used by sched_ext to fix ->select_task_rq() skip detection. Signed-off-by: Tejun Heo Acked-by: David Vernet --- kernel/sched/core.c | 8 ++++++-- kernel/sched/sched.h | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e70b57a5693e..aeb595514461 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3522,10 +3522,12 @@ int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); - else + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b1c3588a8f00..6085ef50febf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2334,6 +2335,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ @@ -2360,6 +2362,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) From 9b671793c7d95f020791415cbbcc82b9c007d19c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 13:46:13 -1000 Subject: [PATCH 6/7] sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED scx_qmap and other schedulers in the SCX repo are using SCX_ENQ_WAKEUP to tell whether ops.select_cpu() was called. This is incorrect as ops.select_cpu() can be skipped in the wakeup path and leads to e.g. incorrectly skipping direct dispatch for tasks that are bound to a single CPU. sched core has been updated to specify ENQUEUE_RQ_SELECTED if ->select_task_rq() was called. Map it to SCX_ENQ_CPU_SELECTED and update scx_qmap to test it instead of SCX_ENQ_WAKEUP. Signed-off-by: Tejun Heo Acked-by: David Vernet Cc: Daniel Hodges Cc: Changwoo Min Cc: Andrea Righi Cc: Dan Schatzberg --- kernel/sched/ext.c | 1 + tools/sched_ext/scx_qmap.bpf.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1df2082d1352..410a4df8a121 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -696,6 +696,7 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 67e2a7968cc9..5d1f880d1149 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -230,8 +230,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_WAKEUP) && + /* if select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); From e0ed52154e866a1e9e9b97ded50b164698f0a222 Mon Sep 17 00:00:00 2001 From: Devaansh-Kumar Date: Tue, 8 Oct 2024 19:56:20 +0530 Subject: [PATCH 7/7] sched_ext: Documentation: Update instructions for running example schedulers Since the artifact paths for tools changed, we need to update the documentation to reflect that path. Signed-off-by: Devaansh-Kumar Signed-off-by: Tejun Heo --- Documentation/scheduler/sched-ext.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 6c0d70e2e27d..7b59bbd2e564 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS. .. code-block:: none # make -j16 -C tools/sched_ext - # tools/sched_ext/scx_simple + # tools/sched_ext/build/bin/scx_simple local=0 global=3 local=5 global=24 local=9 global=44