diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index c9255d1499b6..2c601a7eaff5 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -185,6 +185,7 @@ auto-test-targets := \ select_cpu_vtime \ rt_stall \ test_example \ + total_bw \ testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c new file mode 100644 index 000000000000..5b0a619bab86 --- /dev/null +++ b/tools/testing/selftests/sched_ext/total_bw.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test to verify that total_bw value remains consistent across all CPUs + * in different BPF program states. + * + * Copyright (C) 2025 NVIDIA Corporation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +#define MAX_CPUS 512 +#define STRESS_DURATION_SEC 5 + +struct total_bw_ctx { + struct minimal *skel; + long baseline_bw[MAX_CPUS]; + int nr_cpus; +}; + +static void *cpu_stress_thread(void *arg) +{ + volatile int i; + time_t end_time = time(NULL) + STRESS_DURATION_SEC; + + while (time(NULL) < end_time) + for (i = 0; i < 1000000; i++) + ; + + return NULL; +} + +/* + * The first enqueue on a CPU causes the DL server to start, for that + * reason run stressor threads in the hopes it schedules on all CPUs. + */ +static int run_cpu_stress(int nr_cpus) +{ + pthread_t *threads; + int i, ret = 0; + + threads = calloc(nr_cpus, sizeof(pthread_t)); + if (!threads) + return -ENOMEM; + + /* Create threads to run on each CPU */ + for (i = 0; i < nr_cpus; i++) { + if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) { + ret = -errno; + fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret)); + break; + } + } + + /* Wait for all threads to complete */ + for (i = 0; i < nr_cpus; i++) { + if (threads[i]) + pthread_join(threads[i], NULL); + } + + free(threads); + return ret; +} + +static int read_total_bw_values(long *bw_values, int max_cpus) +{ + FILE *fp; + char line[256]; + int cpu_count = 0; + + fp = fopen("/sys/kernel/debug/sched/debug", "r"); + if (!fp) { + SCX_ERR("Failed to open debug file"); + return -1; + } + + while (fgets(line, sizeof(line), fp)) { + char *bw_str = strstr(line, "total_bw"); + + if (bw_str) { + bw_str = strchr(bw_str, ':'); + if (bw_str) { + /* Only store up to max_cpus values */ + if (cpu_count < max_cpus) + bw_values[cpu_count] = atol(bw_str + 1); + cpu_count++; + } + } + } + + fclose(fp); + return cpu_count; +} + +static bool verify_total_bw_consistency(long *bw_values, int count) +{ + int i; + long first_value; + + if (count <= 0) + return false; + + first_value = bw_values[0]; + + for (i = 1; i < count; i++) { + if (bw_values[i] != first_value) { + SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld", + first_value, i, bw_values[i]); + return false; + } + } + + return true; +} + +static int fetch_verify_total_bw(long *bw_values, int nr_cpus) +{ + int attempts = 0; + int max_attempts = 10; + int count; + + /* + * The first enqueue on a CPU causes the DL server to start, for that + * reason run stressor threads in the hopes it schedules on all CPUs. + */ + if (run_cpu_stress(nr_cpus) < 0) { + SCX_ERR("Failed to run CPU stress"); + return -1; + } + + /* Try multiple times to get stable values */ + while (attempts < max_attempts) { + count = read_total_bw_values(bw_values, nr_cpus); + fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus); + /* If system has more CPUs than we're testing, that's OK */ + if (count < nr_cpus) { + SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count); + attempts++; + sleep(1); + continue; + } + + /* Only verify the CPUs we're testing */ + if (verify_total_bw_consistency(bw_values, nr_cpus)) { + fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]); + return 0; + } + + attempts++; + sleep(1); + } + + return -1; +} + +static enum scx_test_status setup(void **ctx) +{ + struct total_bw_ctx *test_ctx; + + if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) { + fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n"); + return SCX_TEST_SKIP; + } + + test_ctx = calloc(1, sizeof(*test_ctx)); + if (!test_ctx) + return SCX_TEST_FAIL; + + test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (test_ctx->nr_cpus <= 0) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */ + if (test_ctx->nr_cpus > MAX_CPUS) + test_ctx->nr_cpus = MAX_CPUS; + + /* Test scenario 1: BPF program not loaded */ + /* Read and verify baseline total_bw before loading BPF program */ + fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable baseline values"); + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* Load the BPF skeleton */ + test_ctx->skel = minimal__open(); + if (!test_ctx->skel) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + SCX_ENUM_INIT(test_ctx->skel); + if (minimal__load(test_ctx->skel)) { + minimal__destroy(test_ctx->skel); + free(test_ctx); + return SCX_TEST_FAIL; + } + + *ctx = test_ctx; + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + struct bpf_link *link; + long loaded_bw[MAX_CPUS]; + long unloaded_bw[MAX_CPUS]; + int i; + + /* Test scenario 2: BPF program loaded */ + link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + fprintf(stderr, "BPF program loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values with BPF loaded"); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + bpf_link__destroy(link); + + /* Test scenario 3: BPF program unloaded */ + fprintf(stderr, "BPF program unloaded, reading total_bw values\n"); + if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values after BPF unload"); + return SCX_TEST_FAIL; + } + + /* Verify all three scenarios have the same total_bw values */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (test_ctx->baseline_bw[i] != loaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld", + i, test_ctx->baseline_bw[i], loaded_bw[i]); + return SCX_TEST_FAIL; + } + + if (test_ctx->baseline_bw[i] != unloaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld", + i, test_ctx->baseline_bw[i], unloaded_bw[i]); + return SCX_TEST_FAIL; + } + } + + fprintf(stderr, "All total_bw values are consistent across all scenarios\n"); + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + + if (test_ctx) { + if (test_ctx->skel) + minimal__destroy(test_ctx->skel); + free(test_ctx); + } +} + +struct scx_test total_bw = { + .name = "total_bw", + .description = "Verify total_bw consistency across BPF program states", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&total_bw)