mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

Specify the threshold for dumping offcpu samples with --off-cpu-thresh, the unit is milliseconds. Default value is 500ms. Example: perf record --off-cpu --off-cpu-thresh 824 The example above collects direct off-cpu samples where the off-cpu time is longer than 824ms. Committer testing: After commenting out the end off-cpu dump to have just the ones that are added right after the task is scheduled back, and using a threshould of 1000ms, we see some periods (the 5th column, just before "offcpu-time" in the 'perf script' output) that are over 1000.000.000 nanoseconds: root@number:~# perf record --off-cpu --off-cpu-thresh 10000 ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.902 MB perf.data (34335 samples) ] root@number:~# perf script <SNIP> Isolated Web Co 59932 [028] 63839.594437: 1000049427 offcpu-time: 7fe63c7976c2 __syscall_cancel_arch_end+0x0 (/usr/lib64/libc.so.6) 7fe63c78c04c __futex_abstimed_wait_common+0x7c (/usr/lib64/libc.so.6) 7fe63c78e928 pthread_cond_timedwait@@GLIBC_2.3.2+0x178 (/usr/lib64/libc.so.6) 5599974a9fe7 mozilla::detail::ConditionVariableImpl::wait_for(mozilla::detail::MutexImpl&, mozilla::BaseTimeDuration<mozilla::TimeDurationValueCalculator> const&)+0xe7 (/usr/lib64/fir> 100000000 [unknown] ([unknown]) swapper 0 [025] 63839.594459: 195724 cycles:P: ffffffffac328270 read_tsc+0x0 ([kernel.kallsyms]) Isolated Web Co 59932 [010] 63839.594466: 1000055278 offcpu-time: 7fe63c7976c2 __syscall_cancel_arch_end+0x0 (/usr/lib64/libc.so.6) 7fe63c78ba24 __syscall_cancel+0x14 (/usr/lib64/libc.so.6) 7fe63c804c4e __poll+0x1e (/usr/lib64/libc.so.6) 7fe633b0d1b8 PollWrapper(_GPollFD*, unsigned int, int) [clone .lto_priv.0]+0xf8 (/usr/lib64/firefox/libxul.so) 10000002c [unknown] ([unknown]) swapper 0 [027] 63839.594475: 134433 cycles:P: ffffffffad4c45d9 irqentry_enter+0x19 ([kernel.kallsyms]) swapper 0 [028] 63839.594499: 215838 cycles:P: ffffffffac39199a switch_mm_irqs_off+0x10a ([kernel.kallsyms]) MediaPD~oder #1 1407676 
[027] 63839.594514: 134433 cycles:P: 7f982ef5e69f dct_IV(int*, int, int*)+0x24f (/usr/lib64/libfdk-aac.so.2.0.0) swapper 0 [024] 63839.594524: 267411 cycles:P: ffffffffad4c6ee6 poll_idle+0x56 ([kernel.kallsyms]) MediaSu~sor #75 1093827 [026] 63839.594555: 332652 cycles:P: 55be753ad030 moz_xmalloc+0x200 (/usr/lib64/firefox/firefox) swapper 0 [027] 63839.594616: 160548 cycles:P: ffffffffad144840 menu_select+0x570 ([kernel.kallsyms]) Isolated Web Co 14019 [027] 63839.595120: 1000050178 offcpu-time: 7fc9537cc6c2 __syscall_cancel_arch_end+0x0 (/usr/lib64/libc.so.6) 7fc9537c104c __futex_abstimed_wait_common+0x7c (/usr/lib64/libc.so.6) 7fc9537c3928 pthread_cond_timedwait@@GLIBC_2.3.2+0x178 (/usr/lib64/libc.so.6) 7fc95372a3c8 pt_TimedWait+0xb8 (/usr/lib64/libnspr4.so) 7fc95372a8d8 PR_WaitCondVar+0x68 (/usr/lib64/libnspr4.so) 7fc94afb1f7c WatchdogMain(void*)+0xac (/usr/lib64/firefox/libxul.so) 7fc947498660 [unknown] ([unknown]) 7fc9535fce88 [unknown] ([unknown]) 7fc94b620e60 WatchdogManager::~WatchdogManager()+0x0 (/usr/lib64/firefox/libxul.so) fff8548387f8b48 [unknown] ([unknown]) swapper 0 [003] 63839.595712: 212948 cycles:P: ffffffffacd5b865 acpi_os_read_port+0x55 ([kernel.kallsyms]) <SNIP> Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Suggested-by: Ian Rogers <irogers@google.com> Suggested-by: Namhyung Kim <namhyung@kernel.org> Reviewed-by: Ian Rogers <irogers@google.com> Signed-off-by: Howard Chu <howardchu95@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Tested-by: Gautam Menghani <gautam@linux.ibm.com> Tested-by: Ian Rogers <irogers@google.com> Acked-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: James Clark <james.clark@linaro.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: 
https://lore.kernel.org/r/20241108204137.2444151-2-howardchu95@gmail.com Link: https://lore.kernel.org/r/20250501022809.449767-10-howardchu95@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
372 lines
8.4 KiB
C
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
|
// Copyright (c) 2022 Google
|
|
#include "vmlinux.h"
|
|
#include <bpf/bpf_helpers.h>
|
|
#include <bpf/bpf_tracing.h>
|
|
#include <bpf/bpf_core_read.h>
|
|
|
|
/* task->flags for off-cpu analysis */
|
|
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
|
|
|
|
/* task->state for off-cpu analysis */
|
|
#define TASK_INTERRUPTIBLE 0x0001
|
|
#define TASK_UNINTERRUPTIBLE 0x0002
|
|
|
|
/* create a new thread */
|
|
#define CLONE_THREAD 0x10000
|
|
|
|
#define MAX_STACKS 32
|
|
#define MAX_ENTRIES 102400
|
|
|
|
#define MAX_CPUS 4096
|
|
#define MAX_OFFCPU_LEN 37
|
|
|
|
// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
|
|
struct __stack {
|
|
u64 array[MAX_STACKS];
|
|
};
|
|
|
|
struct tstamp_data {
|
|
__u32 stack_id;
|
|
__u32 state;
|
|
__u64 timestamp;
|
|
struct __stack stack;
|
|
};
|
|
|
|
struct offcpu_key {
|
|
__u32 pid;
|
|
__u32 tgid;
|
|
__u32 stack_id;
|
|
__u32 state;
|
|
__u64 cgroup_id;
|
|
};
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
|
|
__uint(key_size, sizeof(__u32));
|
|
__uint(value_size, MAX_STACKS * sizeof(__u64));
|
|
__uint(max_entries, MAX_ENTRIES);
|
|
} stacks SEC(".maps");
|
|
|
|
struct offcpu_data {
|
|
u64 array[MAX_OFFCPU_LEN];
|
|
};
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
|
__uint(key_size, sizeof(int));
|
|
__uint(value_size, sizeof(int));
|
|
__uint(max_entries, MAX_CPUS);
|
|
} offcpu_output SEC(".maps");
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
|
__uint(key_size, sizeof(__u32));
|
|
__uint(value_size, sizeof(struct offcpu_data));
|
|
__uint(max_entries, 1);
|
|
} offcpu_payload SEC(".maps");
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
|
|
__uint(map_flags, BPF_F_NO_PREALLOC);
|
|
__type(key, int);
|
|
__type(value, struct tstamp_data);
|
|
} tstamp SEC(".maps");
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_HASH);
|
|
__uint(key_size, sizeof(struct offcpu_key));
|
|
__uint(value_size, sizeof(__u64));
|
|
__uint(max_entries, MAX_ENTRIES);
|
|
} off_cpu SEC(".maps");
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_HASH);
|
|
__uint(key_size, sizeof(__u32));
|
|
__uint(value_size, sizeof(__u8));
|
|
__uint(max_entries, 1);
|
|
} cpu_filter SEC(".maps");
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_HASH);
|
|
__uint(key_size, sizeof(__u32));
|
|
__uint(value_size, sizeof(__u8));
|
|
__uint(max_entries, 1);
|
|
} task_filter SEC(".maps");
|
|
|
|
struct {
|
|
__uint(type, BPF_MAP_TYPE_HASH);
|
|
__uint(key_size, sizeof(__u64));
|
|
__uint(value_size, sizeof(__u8));
|
|
__uint(max_entries, 1);
|
|
} cgroup_filter SEC(".maps");
|
|
|
|
/* new kernel task_struct definition */
|
|
struct task_struct___new {
|
|
long __state;
|
|
} __attribute__((preserve_access_index));
|
|
|
|
/* old kernel task_struct definition */
|
|
struct task_struct___old {
|
|
long state;
|
|
} __attribute__((preserve_access_index));
|
|
|
|
int enabled = 0;
|
|
|
|
const volatile int has_cpu = 0;
|
|
const volatile int has_task = 0;
|
|
const volatile int has_cgroup = 0;
|
|
const volatile int uses_tgid = 0;
|
|
|
|
const volatile bool has_prev_state = false;
|
|
const volatile bool needs_cgroup = false;
|
|
const volatile bool uses_cgroup_v1 = false;
|
|
|
|
int perf_subsys_id = -1;
|
|
|
|
__u64 offcpu_thresh_ns;
|
|
|
|
/*
|
|
* Old kernel used to call it task_struct->state and now it's '__state'.
|
|
* Use BPF CO-RE "ignored suffix rule" to deal with it like below:
|
|
*
|
|
* https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
|
|
*/
|
|
static inline int get_task_state(struct task_struct *t)
|
|
{
|
|
/* recast pointer to capture new type for compiler */
|
|
struct task_struct___new *t_new = (void *)t;
|
|
|
|
if (bpf_core_field_exists(t_new->__state)) {
|
|
return BPF_CORE_READ(t_new, __state);
|
|
} else {
|
|
/* recast pointer to capture old type for compiler */
|
|
struct task_struct___old *t_old = (void *)t;
|
|
|
|
return BPF_CORE_READ(t_old, state);
|
|
}
|
|
}
|
|
|
|
static inline __u64 get_cgroup_id(struct task_struct *t)
|
|
{
|
|
struct cgroup *cgrp;
|
|
|
|
if (!uses_cgroup_v1)
|
|
return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);
|
|
|
|
if (perf_subsys_id == -1) {
|
|
#if __has_builtin(__builtin_preserve_enum_value)
|
|
perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
|
|
perf_event_cgrp_id);
|
|
#else
|
|
perf_subsys_id = perf_event_cgrp_id;
|
|
#endif
|
|
}
|
|
|
|
cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
|
|
return BPF_CORE_READ(cgrp, kn, id);
|
|
}
|
|
|
|
static inline int can_record(struct task_struct *t, int state)
|
|
{
|
|
/* kernel threads don't have user stack */
|
|
if (t->flags & PF_KTHREAD)
|
|
return 0;
|
|
|
|
if (state != TASK_INTERRUPTIBLE &&
|
|
state != TASK_UNINTERRUPTIBLE)
|
|
return 0;
|
|
|
|
if (has_cpu) {
|
|
__u32 cpu = bpf_get_smp_processor_id();
|
|
__u8 *ok;
|
|
|
|
ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
|
|
if (!ok)
|
|
return 0;
|
|
}
|
|
|
|
if (has_task) {
|
|
__u8 *ok;
|
|
__u32 pid;
|
|
|
|
if (uses_tgid)
|
|
pid = t->tgid;
|
|
else
|
|
pid = t->pid;
|
|
|
|
ok = bpf_map_lookup_elem(&task_filter, &pid);
|
|
if (!ok)
|
|
return 0;
|
|
}
|
|
|
|
if (has_cgroup) {
|
|
__u8 *ok;
|
|
__u64 cgrp_id = get_cgroup_id(t);
|
|
|
|
ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
|
|
if (!ok)
|
|
return 0;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
|
|
{
|
|
int len = 0;
|
|
|
|
for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
|
|
to->array[n + 2 + i] = from->array[i];
|
|
|
|
return len;
|
|
}
|
|
|
|
/**
|
|
* off_cpu_dump - dump off-cpu samples to ring buffer
|
|
* @data: payload for dumping off-cpu samples
|
|
* @key: off-cpu data
|
|
* @stack: stack trace of the task before being scheduled out
|
|
*
|
|
* If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id
|
|
* information of the task, and dump it as a raw sample to perf ring buffer
|
|
*/
|
|
static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
|
|
struct __stack *stack, __u64 delta)
|
|
{
|
|
int n = 0, len = 0;
|
|
|
|
data->array[n++] = (u64)key->tgid << 32 | key->pid;
|
|
data->array[n++] = delta;
|
|
|
|
/* data->array[n] is callchain->nr (updated later) */
|
|
data->array[n + 1] = PERF_CONTEXT_USER;
|
|
data->array[n + 2] = 0;
|
|
len = copy_stack(stack, data, n);
|
|
|
|
/* update length of callchain */
|
|
data->array[n] = len + 1;
|
|
n += len + 2;
|
|
|
|
data->array[n++] = key->cgroup_id;
|
|
|
|
return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
|
|
}
|
|
|
|
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
|
|
struct task_struct *next, int state)
|
|
{
|
|
__u64 ts;
|
|
__u32 stack_id;
|
|
struct tstamp_data *pelem;
|
|
|
|
ts = bpf_ktime_get_ns();
|
|
|
|
if (!can_record(prev, state))
|
|
goto next;
|
|
|
|
stack_id = bpf_get_stackid(ctx, &stacks,
|
|
BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
|
|
|
|
pelem = bpf_task_storage_get(&tstamp, prev, NULL,
|
|
BPF_LOCAL_STORAGE_GET_F_CREATE);
|
|
if (!pelem)
|
|
goto next;
|
|
|
|
pelem->timestamp = ts;
|
|
pelem->state = state;
|
|
pelem->stack_id = stack_id;
|
|
|
|
/*
|
|
* If stacks are successfully collected by bpf_get_stackid(), collect them once more
|
|
* in task_storage for direct off-cpu sample dumping
|
|
*/
|
|
if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
|
|
/*
|
|
* This empty if block is used to avoid 'result unused warning' from bpf_get_stack().
|
|
* If the collection fails, continue with the logic for the next task.
|
|
*/
|
|
}
|
|
next:
|
|
pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
|
|
|
|
if (pelem && pelem->timestamp) {
|
|
struct offcpu_key key = {
|
|
.pid = next->pid,
|
|
.tgid = next->tgid,
|
|
.stack_id = pelem->stack_id,
|
|
.state = pelem->state,
|
|
.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
|
|
};
|
|
__u64 delta = ts - pelem->timestamp;
|
|
__u64 *total;
|
|
|
|
if (delta >= offcpu_thresh_ns) {
|
|
int zero = 0;
|
|
struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
|
|
|
|
if (data)
|
|
off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
|
|
} else {
|
|
total = bpf_map_lookup_elem(&off_cpu, &key);
|
|
if (total)
|
|
*total += delta;
|
|
else
|
|
bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
|
|
}
|
|
|
|
/* prevent to reuse the timestamp later */
|
|
pelem->timestamp = 0;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
SEC("tp_btf/task_newtask")
|
|
int on_newtask(u64 *ctx)
|
|
{
|
|
struct task_struct *task;
|
|
u64 clone_flags;
|
|
u32 pid;
|
|
u8 val = 1;
|
|
|
|
if (!uses_tgid)
|
|
return 0;
|
|
|
|
task = (struct task_struct *)bpf_get_current_task();
|
|
|
|
pid = BPF_CORE_READ(task, tgid);
|
|
if (!bpf_map_lookup_elem(&task_filter, &pid))
|
|
return 0;
|
|
|
|
task = (struct task_struct *)ctx[0];
|
|
clone_flags = ctx[1];
|
|
|
|
pid = task->tgid;
|
|
if (!(clone_flags & CLONE_THREAD))
|
|
bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
|
|
|
|
return 0;
|
|
}
|
|
|
|
SEC("tp_btf/sched_switch")
|
|
int on_switch(u64 *ctx)
|
|
{
|
|
struct task_struct *prev, *next;
|
|
int prev_state;
|
|
|
|
if (!enabled)
|
|
return 0;
|
|
|
|
prev = (struct task_struct *)ctx[1];
|
|
next = (struct task_struct *)ctx[2];
|
|
|
|
if (has_prev_state)
|
|
prev_state = (int)ctx[3];
|
|
else
|
|
prev_state = get_task_state(prev);
|
|
|
|
return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
|
|
}
|
|
|
|
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|