/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2024 David Vernet <dvernet@meta.com>
 */
#ifndef __SCX_COMPAT_H
#define __SCX_COMPAT_H

#include <bpf/btf.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* Lazily loaded vmlinux BTF shared by the compat helpers below. */
struct btf *__COMPAT_vmlinux_btf __attribute__((weak));

static inline void __COMPAT_load_vmlinux_btf(void)
{
	if (!__COMPAT_vmlinux_btf) {
		__COMPAT_vmlinux_btf = btf__load_vmlinux_btf();
		SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()");
	}
}

/*
 * Look up entry @name in enum @type in the vmlinux BTF and store its value in
 * @v. Returns false if either the type or the entry can't be found.
 */
static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v)
{
	const struct btf_type *t;
	const char *n;
	s32 tid;
	int i;

	__COMPAT_load_vmlinux_btf();

	tid = btf__find_by_name(__COMPAT_vmlinux_btf, type);
	if (tid < 0)
		return false;

	t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
	SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);

	if (btf_is_enum(t)) {
		struct btf_enum *e = btf_enum(t);

		for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
			n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
			SCX_BUG_ON(!n, "btf__name_by_offset()");
			if (!strcmp(n, name)) {
				*v = e[i].val;
				return true;
			}
		}
	} else if (btf_is_enum64(t)) {
		struct btf_enum64 *e = btf_enum64(t);

		for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
			n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
			SCX_BUG_ON(!n, "btf__name_by_offset()");
			if (!strcmp(n, name)) {
				*v = btf_enum64_value(&e[i]);
				return true;
			}
		}
	}

	return false;
}

#define __COMPAT_ENUM_OR_ZERO(__type, __ent)					\
({										\
	u64 __val = 0;								\
	__COMPAT_read_enum(__type, __ent, &__val);				\
	__val;									\
})
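
/*
 * Example (a hedged sketch, not part of the original header): probing an enum
 * value directly. The enum and entry names below are taken from the flag
 * helpers further down and are only illustrative.
 *
 *	u64 val;
 *
 *	if (__COMPAT_read_enum("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL", &val))
 *		printf("SCX_OPS_SWITCH_PARTIAL = %llu\n", (unsigned long long)val);
 *	else
 *		printf("flag not defined by this kernel\n");
 */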

/* Return true iff the vmlinux BTF contains an entry named @ksym. */
static inline bool __COMPAT_has_ksym(const char *ksym)
{
	__COMPAT_load_vmlinux_btf();
	return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
}

/* Return true iff struct @type in the vmlinux BTF has a member named @field. */
static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
{
	const struct btf_type *t;
	const struct btf_member *m;
	const char *n;
	s32 tid;
	int i;

	__COMPAT_load_vmlinux_btf();
	tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT);
	if (tid < 0)
		return false;

	t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
	SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);

	m = btf_members(t);

	for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
		n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off);
		SCX_BUG_ON(!n, "btf__name_by_offset()");
		if (!strcmp(n, field))
			return true;
	}

	return false;
}
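
/*
 * Example (a hedged sketch, not part of the original header): feature-detect a
 * sched_ext_ops member before relying on it. SCX_OPS_OPEN() below performs the
 * same check to require ops.dump() support.
 *
 *	if (!__COMPAT_struct_has_field("sched_ext_ops", "dump"))
 *		fprintf(stderr, "kernel lacks sched_ext_ops.dump()\n");
 */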

#define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name)

#define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE)
#define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST)
#define SCX_OPS_ENQ_EXITING SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING)
#define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL)
#define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED)
#define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP)
#define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE)

#define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name)

#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE)
#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE)
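
/*
 * Example (a hedged sketch, not part of the original header): each of the
 * flag macros above evaluates to 0 when the running kernel's BTF doesn't
 * define the corresponding enum entry, so a loader can OR them into the ops
 * flags unconditionally. The skeleton and ops names are only illustrative.
 *
 *	skel->struct_ops.simple_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
 */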

/*
 * Read the current hotplug sequence number from
 * /sys/kernel/sched_ext/hotplug_seq. Returns -ENOENT if the file can't be
 * opened.
 */
static inline long scx_hotplug_seq(void)
{
	int fd;
	char buf[32];
	ssize_t len;
	long val;

	fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
	if (fd < 0)
		return -ENOENT;

	len = read(fd, buf, sizeof(buf) - 1);
	SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
	buf[len] = 0;
	close(fd);

	val = strtoul(buf, NULL, 10);
	SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);

	return val;
}

/*
 * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
 * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
 * and attach it, backward compatibility is automatically maintained where
 * reasonable.
 *
 * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
 * the current minimum required kernel version.
 */
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({					\
	struct __scx_name *__skel;						\
										\
	SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"),	\
		   "sched_ext_ops.dump() missing, kernel too old?");		\
										\
	__skel = __scx_name##__open();						\
	SCX_BUG_ON(!__skel, "Could not open " #__scx_name);			\
	__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq();	\
	SCX_ENUM_INIT(__skel);							\
	__skel;									\
})

#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({		\
	UEI_SET_SIZE(__skel, __ops_name, __uei_name);				\
	SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel");	\
})

/*
 * New versions of bpftool now emit additional link placeholders for BPF maps,
 * and set up the BPF skeleton so that libbpf auto-attaches those maps,
 * assuming libbpf is recent enough (v1.5+). Old libbpf will do nothing with
 * those links and won't attempt to auto-attach maps.
 *
 * To maintain compatibility with older libbpf while avoiding attaching twice,
 * disable the autoattach feature on newer libbpf.
 */
#if LIBBPF_MAJOR_VERSION > 1 ||							\
	(LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5)
#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name)			\
	bpf_map__set_autoattach((__skel)->maps.__ops_name, false)
#else
#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0)
#endif

#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({			\
	struct bpf_link *__link;						\
	__SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name);			\
	SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel");	\
	__link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name);	\
	SCX_BUG_ON(!__link, "Failed to attach struct_ops");			\
	__link;									\
})
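
/*
 * Example (a hedged sketch, not part of the original header): the typical
 * open/load/attach sequence in a userspace loader. The scheduler, ops, and
 * UEI names are only illustrative.
 *
 *	struct scx_simple *skel;
 *	struct bpf_link *link;
 *
 *	skel = SCX_OPS_OPEN(simple_ops, scx_simple);
 *	SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
 *	link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple);
 *	(run until the scheduler exits)
 *	bpf_link__destroy(link);
 *	scx_simple__destroy(skel);
 */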

#endif	/* __SCX_COMPAT_H */