From fcbc423577ce28b5e470a2a47fed6724430cccdb Mon Sep 17 00:00:00 2001
From: Vishal Chourasia <vishalc@linux.ibm.com>
Date: Wed, 2 Oct 2024 12:38:06 +0530
Subject: [PATCH 1/7] sched_ext: Add __weak markers to BPF helper function
 decalarations

Fix build errors by adding __weak markers to BPF helper function
declarations in header files. This resolves static assertion failures
in scx_qmap.bpf.c and scx_flatcg.bpf.c where functions like
scx_bpf_dispatch_from_dsq_set_slice, scx_bpf_dispatch_from_dsq_set_vtime,
and scx_bpf_task_cgroup were missing the __weak attribute.

[1] https://lore.kernel.org/all/ZvvfUqRNM4-jYQzH@linux.ibm.com

Signed-off-by: Vishal Chourasia <vishalc@linux.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/common.bpf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 225f61f9bfca..27749c51c3ec 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -41,8 +41,8 @@ void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vt
 u32 scx_bpf_dispatch_nr_slots(void) __ksym;
 void scx_bpf_dispatch_cancel(void) __ksym;
 bool scx_bpf_consume(u64 dsq_id) __ksym;
-void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
-void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
+void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
+void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
 bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 u32 scx_bpf_reenqueue_local(void) __ksym;
@@ -71,7 +71,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
-struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling

From cc9877fb76771b7cbce6c9ec239f13a1d7759876 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Oct 2024 10:33:37 -1000
Subject: [PATCH 2/7] sched_ext: Improve error reporting during loading

When the BPF scheduler fails, ops.exit() allows rich error reporting through
scx_exit_info. Use scx.exit() path consistently for all failures which can
be caused by the BPF scheduler:

- scx_ops_error() is called after ops.init() and ops.cgroup_init() failure
  to record error information.

- ops.init_task() failure now uses scx_ops_error() instead of pr_err().

- The err_disable path updated to automatically trigger scx_ops_error() to
  cover cases that the error message hasn't already been generated and
  always return 0 indicating init success so that the error is reported
  through ops.exit().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Vernet <void@manifault.com>
Cc: Daniel Hodges <hodges.daniel.scott@gmail.com>
Cc: Changwoo Min <multics69@gmail.com>
Cc: Andrea Righi <andrea.righi@linux.dev>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
---
 kernel/sched/ext.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3cd7c50a51c5..76f04f3ace61 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -625,6 +625,10 @@ struct sched_ext_ops {
 	/**
 	 * exit - Clean up after the BPF scheduler
 	 * @info: Exit info
+	 *
+	 * ops.exit() is also called on ops.init() failure, which is a bit
+	 * unusual. This is to allow rich reporting through @info on how
+	 * ops.init() failed.
 	 */
 	void (*exit)(struct scx_exit_info *info);
 
@@ -4117,6 +4121,7 @@ static int scx_cgroup_init(void)
 				      css->cgroup, &args);
 		if (ret) {
 			css_put(css);
+			scx_ops_error("ops.cgroup_init() failed (%d)", ret);
 			return ret;
 		}
 		tg->scx_flags |= SCX_TG_INITED;
@@ -5041,6 +5046,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
 			cpus_read_unlock();
+			scx_ops_error("ops.init() failed (%d)", ret);
 			goto err_disable;
 		}
 	}
@@ -5150,8 +5156,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 			spin_lock_irq(&scx_tasks_lock);
 			scx_task_iter_exit(&sti);
 			spin_unlock_irq(&scx_tasks_lock);
-			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
-			       ret, p->comm, p->pid);
+			scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
+				      ret, p->comm, p->pid);
 			goto err_disable_unlock_all;
 		}
 
@@ -5199,14 +5205,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	scx_ops_bypass(false);
 
-	/*
-	 * Returning an error code here would lose the recorded error
-	 * information. Exit indicating success so that the error is notified
-	 * through ops.exit() with all the details.
-	 */
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
-		ret = 0;
 		goto err_disable;
 	}
 
@@ -5241,10 +5241,18 @@ err_disable_unlock_all:
 	scx_ops_bypass(false);
 err_disable:
 	mutex_unlock(&scx_ops_enable_mutex);
-	/* must be fully disabled before returning */
-	scx_ops_disable(SCX_EXIT_ERROR);
+	/*
+	 * Returning an error code here would not pass all the error information
+	 * to userspace. Record errno using scx_ops_error() for cases
+	 * scx_ops_error() wasn't already invoked and exit indicating success so
+	 * that the error is notified through ops.exit() with all the details.
+	 *
+	 * Flush scx_ops_disable_work to ensure that error is reported before
+	 * init completion.
+	 */
+	scx_ops_error("scx_ops_enable() failed (%d)", ret);
 	kthread_flush_work(&scx_ops_disable_work);
-	return ret;
+	return 0;
 }
 
 

From ec010333ce7cf3270ae7193a6724794d5a179625 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Oct 2024 10:34:38 -1000
Subject: [PATCH 3/7] sched_ext: scx_cgroup_exit() may be called without
 successful scx_cgroup_init()

568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations
and fix scx_tg_online()") assumed that scx_cgroup_exit() is only called
after scx_cgroup_init() finished successfully. This isn't true.
scx_cgroup_exit() can be called without scx_cgroup_init() being called at
all or after scx_cgroup_init() failed in the middle.

As init state is tracked per cgroup, scx_cgroup_exit() can be used safely to
clean up in all cases. Remove the incorrect WARN_ON_ONCE().

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 568894edbe48 ("sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()")
---
 kernel/sched/ext.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 76f04f3ace61..1df2082d1352 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4052,7 +4052,6 @@ static void scx_cgroup_exit(void)
 
 	percpu_rwsem_assert_held(&scx_cgroup_rwsem);
 
-	WARN_ON_ONCE(!scx_cgroup_enabled);
 	scx_cgroup_enabled = false;
 
 	/*

From b62933eee41e2909422c2c3d7fdb56217913faf9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 27 Sep 2024 13:46:11 -1000
Subject: [PATCH 4/7] sched/core: Make select_task_rq() take the pointer to
 wake_flags instead of value

This will be used to allow select_task_rq() to indicate whether
->select_task_rq() was called by modifying *wake_flags.

This makes try_to_wake_up() call all functions that take wake_flags with
WF_TTWU set. Previously, only select_task_rq() was. Using the same flags is
more consistent, and, as the flag is only tested by ->select_task_rq()
implementations, it doesn't cause any behavior differences.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Vernet <void@manifault.com>
---
 kernel/sched/core.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 43e453ab7e20..e70b57a5693e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3518,12 +3518,12 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);
 
 	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
-		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
+		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);
 
@@ -4120,6 +4120,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	guard(preempt)();
 	int cpu, success = 0;
 
+	wake_flags |= WF_TTWU;
+
 	if (p == current) {
 		/*
 		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4252,7 +4254,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		 */
 		smp_cond_load_acquire(&p->on_cpu, !VAL);
 
-		cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+		cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
 		if (task_cpu(p) != cpu) {
 			if (p->in_iowait) {
 				delayacct_blkio_end(p);
@@ -4793,6 +4795,7 @@ void wake_up_new_task(struct task_struct *p)
 {
 	struct rq_flags rf;
 	struct rq *rq;
+	int wake_flags = WF_FORK;
 
 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 	WRITE_ONCE(p->__state, TASK_RUNNING);
@@ -4807,7 +4810,7 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	p->recent_used_cpu = task_cpu(p);
 	rseq_migrate(p);
-	__set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
+	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
 #endif
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
@@ -4815,7 +4818,7 @@ void wake_up_new_task(struct task_struct *p)
 
 	activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
 	trace_sched_wakeup_new(p);
-	wakeup_preempt(rq, p, WF_FORK);
+	wakeup_preempt(rq, p, wake_flags);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*

From f207dc2dcdcf0e1e7d260b392784855ce8d84147 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 27 Sep 2024 13:46:12 -1000
Subject: [PATCH 5/7] sched/core: Add ENQUEUE_RQ_SELECTED to indicate whether
 ->select_task_rq() was called

During ttwu, ->select_task_rq() can be skipped if only one CPU is allowed or
migration is disabled. sched_ext schedulers may perform operations such as
direct dispatch from ->select_task_rq() path and it is useful for them to
know whether ->select_task_rq() was skipped in the ->enqueue_task() path.

Currently, sched_ext schedulers are using ENQUEUE_WAKEUP for this purpose
and end up assuming incorrectly that ->select_task_rq() was called for tasks
that are bound to a single CPU or migration disabled.

Make select_task_rq() indicate whether ->select_task_rq() was called by
setting WF_RQ_SELECTED in *wake_flags and make ttwu_do_activate() map that
to ENQUEUE_RQ_SELECTED for ->enqueue_task().

This will be used by sched_ext to fix ->select_task_rq() skip detection.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Vernet <void@manifault.com>
---
 kernel/sched/core.c  | 8 ++++++--
 kernel/sched/sched.h | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e70b57a5693e..aeb595514461 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3522,10 +3522,12 @@ int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
 		cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
-	else
+		*wake_flags |= WF_RQ_SELECTED;
+	} else {
 		cpu = cpumask_any(p->cpus_ptr);
+	}
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -3659,6 +3661,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		rq->nr_uninterruptible--;
 
 #ifdef CONFIG_SMP
+	if (wake_flags & WF_RQ_SELECTED)
+		en_flags |= ENQUEUE_RQ_SELECTED;
 	if (wake_flags & WF_MIGRATED)
 		en_flags |= ENQUEUE_MIGRATED;
 	else
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..6085ef50febf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2292,6 +2292,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 #define WF_SYNC			0x10 /* Waker goes to sleep after wakeup */
 #define WF_MIGRATED		0x20 /* Internal use, task got migrated */
 #define WF_CURRENT_CPU		0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_RQ_SELECTED		0x80 /* ->select_task_rq() was called */
 
 #ifdef CONFIG_SMP
 static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2334,6 +2335,7 @@ extern const u32		sched_prio_to_wmult[40];
  * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
  * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
  * ENQUEUE_MIGRATED  - the task was migrated during wakeup
+ * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
  *
  */
 
@@ -2360,6 +2362,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define ENQUEUE_INITIAL		0x80
 #define ENQUEUE_MIGRATING	0x100
 #define ENQUEUE_DELAYED		0x200
+#define ENQUEUE_RQ_SELECTED	0x400
 
 #define RETRY_TASK		((void *)-1UL)
 

From 9b671793c7d95f020791415cbbcc82b9c007d19c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 27 Sep 2024 13:46:13 -1000
Subject: [PATCH 6/7] sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED

scx_qmap and other schedulers in the SCX repo are using SCX_ENQ_WAKEUP to
tell whether ops.select_cpu() was called. This is incorrect as
ops.select_cpu() can be skipped in the wakeup path and leads to e.g.
incorrectly skipping direct dispatch for tasks that are bound to a single
CPU.

sched core has been updated to specify ENQUEUE_RQ_SELECTED if
->select_task_rq() was called. Map it to SCX_ENQ_CPU_SELECTED and update
scx_qmap to test it instead of SCX_ENQ_WAKEUP.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Cc: Daniel Hodges <hodges.daniel.scott@gmail.com>
Cc: Changwoo Min <multics69@gmail.com>
Cc: Andrea Righi <andrea.righi@linux.dev>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
---
 kernel/sched/ext.c             | 1 +
 tools/sched_ext/scx_qmap.bpf.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1df2082d1352..410a4df8a121 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -696,6 +696,7 @@ enum scx_enq_flags {
 	/* expose select ENQUEUE_* flags as enums */
 	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
 	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
+	SCX_ENQ_CPU_SELECTED	= ENQUEUE_RQ_SELECTED,
 
 	/* high 32bits are SCX specific */
 
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 67e2a7968cc9..5d1f880d1149 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -230,8 +230,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}
 
-	/* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */
-	if (!(enq_flags & SCX_ENQ_WAKEUP) &&
+	/* if select_cpu() wasn't called, try direct dispatch */
+	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
 		scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);

From e0ed52154e866a1e9e9b97ded50b164698f0a222 Mon Sep 17 00:00:00 2001
From: Devaansh-Kumar <devaanshk840@gmail.com>
Date: Tue, 8 Oct 2024 19:56:20 +0530
Subject: [PATCH 7/7] sched_ext: Documentation: Update instructions for running
 example schedulers

Since the artifact paths for tools changed, we need to update the documentation to reflect that path.

Signed-off-by: Devaansh-Kumar <devaanshk840@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 6c0d70e2e27d..7b59bbd2e564 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -66,7 +66,7 @@ BPF scheduler and reverts all tasks back to CFS.
 .. code-block:: none
 
     # make -j16 -C tools/sched_ext
-    # tools/sched_ext/scx_simple
+    # tools/sched_ext/build/bin/scx_simple
     local=0 global=3
     local=5 global=24
     local=9 global=44