linux/tools/perf/util/bpf_skel/syscall_summary.bpf.c
Namhyung Kim ef60b8f572 perf trace: Support --summary-mode=cgroup
Add a new summary mode to collect stats for each cgroup.

  $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1

   Summary of events:

   cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     ppoll                 15      0   373.600     0.004    24.907   197.491     55.26%
     poll                  15      0     1.325     0.001     0.088     0.369     38.76%
     close                 66      0     0.567     0.007     0.009     0.026      3.55%
     write                150      0     0.471     0.001     0.003     0.010      3.29%
     recvmsg               94     83     0.290     0.000     0.003     0.037     16.39%
     ioctl                 26      0     0.237     0.001     0.009     0.096     50.13%
     timerfd_create        66      0     0.236     0.003     0.004     0.024      8.92%
     timerfd_settime       70      0     0.160     0.001     0.002     0.012      7.66%
     writev                10      0     0.118     0.001     0.012     0.019     18.17%
     read                   9      0     0.021     0.001     0.002     0.004     14.07%
     getpid                14      0     0.019     0.000     0.001     0.004     20.28%

   cgroup /system.slice/polkit.service, 94 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     ppoll                 22      0    19.811     0.000     0.900     9.273     63.88%
     write                 30      0     0.040     0.001     0.001     0.003     12.09%
     recvmsg               12      0     0.018     0.001     0.002     0.006     28.15%
     read                  18      0     0.013     0.000     0.001     0.003     21.99%
     poll                  12      0     0.006     0.000     0.001     0.001      4.48%

   cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events

     syscall            calls  errors  total       min       avg       max       stddev
                                       (msec)    (msec)    (msec)    (msec)        (%)
     --------------- --------  ------ -------- --------- --------- ---------     ------
     ppoll                  4      0    17.476     0.003     4.369    13.298     69.65%
     recvmsg               15     12     0.068     0.002     0.005     0.014     26.53%
     writev                 1      0     0.033     0.033     0.033     0.033      0.00%
     poll                   1      0     0.005     0.005     0.005     0.005      0.00%

   ...

It works only for --bpf-summary for now.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250501225337.928470-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2025-05-13 18:20:46 -03:00

153 lines
3.4 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Trace raw_syscalls tracepoints to collect system call statistics.
*/
#include "vmlinux.h"
#include "syscall_summary.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
/*
 * Per-thread record saved at sys-enter and consumed at sys-exit, so the exit
 * handler can compute the elapsed time (exit time - timestamp) and still
 * knows which syscall the exit belongs to.
 */
struct syscall_trace {
int nr; /* syscall number is only available at sys-enter */
int unused; /* always stored as 0 by sys_enter; sits in the 4 bytes ahead of the 8-byte timestamp */
u64 timestamp; /* bpf_ktime_get_ns() value taken at sys-enter */
};
/* max_entries used by the hash maps in this file: 128 * 1024 = 131072 slots. */
#define MAX_ENTRIES (128 * 1024)
/*
 * In-flight syscalls, one entry per thread: sys_enter stores the record under
 * the thread id, sys_exit looks it up and deletes it again.
 */
struct syscall_trace_map {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, int); /* tid */
__type(value, struct syscall_trace);
__uint(max_entries, MAX_ENTRIES);
} syscall_trace_map SEC(".maps");
/*
 * Accumulated per-syscall statistics. update_stats() builds the key from
 * (cpu_or_tid, cgroup, nr) and creates the entry on first use.
 * NOTE(review): presumably read back by the userspace side of perf trace
 * to print the summary -- that code is not visible here.
 */
struct syscall_stats_map {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct syscall_key);
__type(value, struct syscall_stats);
__uint(max_entries, MAX_ENTRIES);
} syscall_stats_map SEC(".maps");
/* Both handlers return immediately while this is 0. */
int enabled; /* controlled from userspace */
/* Chooses the stats key in sys_exit: tid, cgroup id, or (any other value) the CPU number. */
const volatile enum syscall_aggr_mode aggr_mode;
/* Non-zero: get_current_cgroup_id() uses the bpf_get_current_cgroup_id() helper directly. */
const volatile int use_cgroup_v2;
/* Index into task->cgroups->subsys[]; stays -1 until get_current_cgroup_id() resolves it on first use. */
int perf_subsys_id = -1;
static inline __u64 get_current_cgroup_id(void)
{
struct task_struct *task;
struct cgroup *cgrp;
if (use_cgroup_v2)
return bpf_get_current_cgroup_id();
task = bpf_get_current_task_btf();
if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
perf_event_cgrp_id);
#else
perf_subsys_id = perf_event_cgrp_id;
#endif
}
cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
return BPF_CORE_READ(cgrp, kn, id);
}
/*
 * Account one finished syscall in syscall_stats_map.
 *
 * @cpu_or_tid: CPU number or thread id part of the key (0 when unused)
 * @cgroup_id:  cgroup id part of the key (0 when unused)
 * @nr:         syscall number
 * @duration:   time spent in the syscall, in the unit of bpf_ktime_get_ns()
 * @ret:        syscall return value; negative values are counted as errors
 *
 * The entry is created zero-filled on first use.  If it still cannot be
 * found after the insert attempt, the sample is dropped.
 */
static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
			 long ret)
{
	struct syscall_key key = {
		.cpu_or_tid = cpu_or_tid,
		.cgroup = cgroup_id,
		.nr = nr,
	};
	struct syscall_stats *entry;

	entry = bpf_map_lookup_elem(&syscall_stats_map, &key);
	if (!entry) {
		struct syscall_stats zero = {};

		/*
		 * BPF_NOEXIST: do not overwrite an entry that was inserted
		 * concurrently; the lookup below picks up whichever exists.
		 */
		bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
		entry = bpf_map_lookup_elem(&syscall_stats_map, &key);
		if (!entry)
			return;
	}

	__sync_fetch_and_add(&entry->count, 1);
	if (ret < 0)
		__sync_fetch_and_add(&entry->error, 1);

	/* Only positive durations contribute to the timing fields. */
	if (duration <= 0)
		return;

	__sync_fetch_and_add(&entry->total_time, duration);
	__sync_fetch_and_add(&entry->squared_sum, duration * duration);

	/*
	 * NOTE(review): unlike the counters above, min/max are updated with
	 * plain loads and stores.  min_time == 0 is treated as "not set yet".
	 */
	if (entry->max_time < duration)
		entry->max_time = duration;
	if (entry->min_time == 0 || entry->min_time > duration)
		entry->min_time = duration;
}
/*
 * sys_enter handler: remember the syscall number and entry time of the
 * current thread so that sys_exit can pair them with the exit event.
 * Does nothing while 'enabled' is 0.  Always returns 0.
 */
SEC("tp_btf/sys_enter")
int sys_enter(u64 *ctx)
{
	if (enabled) {
		struct syscall_trace entry = {
			.nr = ctx[1], /* syscall number */
			.unused = 0,
			.timestamp = bpf_ktime_get_ns(),
		};
		/* assigning the u64 helper result to an int truncates it to its low 32 bits */
		int tid = bpf_get_current_pid_tgid();

		/* BPF_ANY: replace any record left over for this tid */
		bpf_map_update_elem(&syscall_trace_map, &tid, &entry, BPF_ANY);
	}

	return 0;
}
/*
 * sys_exit handler: find the record stored by sys_enter for this thread,
 * compute the time spent in the syscall and add it to the statistics under
 * the key selected by aggr_mode.  The per-thread record is removed afterwards.
 * Does nothing while 'enabled' is 0 or when no sys_enter record exists.
 * Always returns 0.
 */
SEC("tp_btf/sys_exit")
int sys_exit(u64 *ctx)
{
	struct syscall_trace *entry;
	long retval = ctx[1]; /* return value of the syscall */
	u64 cgrp_id = 0;
	int aggr_key = 0;
	s64 elapsed;
	int tid;

	if (!enabled)
		return 0;

	tid = bpf_get_current_pid_tgid();
	entry = bpf_map_lookup_elem(&syscall_trace_map, &tid);
	if (!entry)
		return 0;

	/* Exactly one of aggr_key / cgrp_id is filled in; the other stays 0. */
	switch (aggr_mode) {
	case SYSCALL_AGGR_THREAD:
		aggr_key = tid;
		break;
	case SYSCALL_AGGR_CGROUP:
		cgrp_id = get_current_cgroup_id();
		break;
	default:
		aggr_key = bpf_get_smp_processor_id();
		break;
	}

	elapsed = bpf_ktime_get_ns() - entry->timestamp;
	update_stats(aggr_key, cgrp_id, entry->nr, elapsed, retval);

	bpf_map_delete_elem(&syscall_trace_map, &tid);
	return 0;
}
/* License string of this BPF object, emitted into the "license" section. */
char _license[] SEC("license") = "GPL";