mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

Add a new summary mode to collect stats for each cgroup. $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1 Summary of events: cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ ppoll 15 0 373.600 0.004 24.907 197.491 55.26% poll 15 0 1.325 0.001 0.088 0.369 38.76% close 66 0 0.567 0.007 0.009 0.026 3.55% write 150 0 0.471 0.001 0.003 0.010 3.29% recvmsg 94 83 0.290 0.000 0.003 0.037 16.39% ioctl 26 0 0.237 0.001 0.009 0.096 50.13% timerfd_create 66 0 0.236 0.003 0.004 0.024 8.92% timerfd_settime 70 0 0.160 0.001 0.002 0.012 7.66% writev 10 0 0.118 0.001 0.012 0.019 18.17% read 9 0 0.021 0.001 0.002 0.004 14.07% getpid 14 0 0.019 0.000 0.001 0.004 20.28% cgroup /system.slice/polkit.service, 94 events syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ ppoll 22 0 19.811 0.000 0.900 9.273 63.88% write 30 0 0.040 0.001 0.001 0.003 12.09% recvmsg 12 0 0.018 0.001 0.002 0.006 28.15% read 18 0 0.013 0.000 0.001 0.003 21.99% poll 12 0 0.006 0.000 0.001 0.001 4.48% cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ ppoll 4 0 17.476 0.003 4.369 13.298 69.65% recvmsg 15 12 0.068 0.002 0.005 0.014 26.53% writev 1 0 0.033 0.033 0.033 0.033 0.00% poll 1 0 0.005 0.005 0.005 0.005 0.00% ... It works only for --bpf-summary for now. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20250501225337.928470-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
458 lines
12 KiB
C
458 lines
12 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#include <inttypes.h>
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "dwarf-regs.h" /* for EM_HOST */
|
|
#include "syscalltbl.h"
|
|
#include "util/cgroup.h"
|
|
#include "util/hashmap.h"
|
|
#include "util/trace.h"
|
|
#include "util/util.h"
|
|
#include <bpf/bpf.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/time64.h>
|
|
#include <tools/libc_compat.h> /* reallocarray */
|
|
|
|
#include "bpf_skel/syscall_summary.h"
|
|
#include "bpf_skel/syscall_summary.skel.h"
|
|
|
|
|
|
static struct syscall_summary_bpf *skel;
|
|
static struct rb_root cgroups = RB_ROOT;
|
|
|
|
int trace_prepare_bpf_summary(enum trace_summary_mode mode)
|
|
{
|
|
skel = syscall_summary_bpf__open();
|
|
if (skel == NULL) {
|
|
fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
|
|
return -1;
|
|
}
|
|
|
|
if (mode == SUMMARY__BY_THREAD)
|
|
skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
|
|
else if (mode == SUMMARY__BY_CGROUP)
|
|
skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
|
|
else
|
|
skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
|
|
|
|
if (cgroup_is_v2("perf_event") > 0)
|
|
skel->rodata->use_cgroup_v2 = 1;
|
|
|
|
if (syscall_summary_bpf__load(skel) < 0) {
|
|
fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
|
|
return -1;
|
|
}
|
|
|
|
if (syscall_summary_bpf__attach(skel) < 0) {
|
|
fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
|
|
return -1;
|
|
}
|
|
|
|
if (mode == SUMMARY__BY_CGROUP)
|
|
read_all_cgroups(&cgroups);
|
|
|
|
return 0;
|
|
}
|
|
|
|
void trace_start_bpf_summary(void)
|
|
{
|
|
skel->bss->enabled = 1;
|
|
}
|
|
|
|
void trace_end_bpf_summary(void)
|
|
{
|
|
skel->bss->enabled = 0;
|
|
}
|
|
|
|
struct syscall_node {
|
|
int syscall_nr;
|
|
struct syscall_stats stats;
|
|
};
|
|
|
|
static double rel_stddev(struct syscall_stats *stat)
|
|
{
|
|
double variance, average;
|
|
|
|
if (stat->count < 2)
|
|
return 0;
|
|
|
|
average = (double)stat->total_time / stat->count;
|
|
|
|
variance = stat->squared_sum;
|
|
variance -= (stat->total_time * stat->total_time) / stat->count;
|
|
variance /= stat->count - 1;
|
|
|
|
return 100 * sqrt(variance / stat->count) / average;
|
|
}
|
|
|
|
/*
|
|
* The syscall_data is to maintain syscall stats ordered by total time.
|
|
* It supports different summary modes like per-thread or global.
|
|
*
|
|
* For per-thread stats, it uses two-level data strurcture -
|
|
* syscall_data is keyed by TID and has an array of nodes which
|
|
* represents each syscall for the thread.
|
|
*
|
|
* For global stats, it's still two-level technically but we don't need
|
|
* per-cpu analysis so it's keyed by the syscall number to combine stats
|
|
* from different CPUs. And syscall_data always has a syscall_node so
|
|
* it can effectively work as flat hierarchy.
|
|
*
|
|
* For per-cgroup stats, it uses two-level data structure like thread
|
|
* syscall_data is keyed by CGROUP and has an array of node which
|
|
* represents each syscall for the cgroup.
|
|
*/
|
|
struct syscall_data {
|
|
u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
|
|
int nr_events;
|
|
int nr_nodes;
|
|
u64 total_time;
|
|
struct syscall_node *nodes;
|
|
};
|
|
|
|
static int datacmp(const void *a, const void *b)
|
|
{
|
|
const struct syscall_data * const *sa = a;
|
|
const struct syscall_data * const *sb = b;
|
|
|
|
return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
|
|
}
|
|
|
|
static int nodecmp(const void *a, const void *b)
|
|
{
|
|
const struct syscall_node *na = a;
|
|
const struct syscall_node *nb = b;
|
|
|
|
return na->stats.total_time > nb->stats.total_time ? -1 : 1;
|
|
}
|
|
|
|
static size_t sc_node_hash(long key, void *ctx __maybe_unused)
|
|
{
|
|
return key;
|
|
}
|
|
|
|
static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
|
|
{
|
|
return key1 == key2;
|
|
}
|
|
|
|
static int print_common_stats(struct syscall_data *data, FILE *fp)
|
|
{
|
|
int printed = 0;
|
|
|
|
for (int i = 0; i < data->nr_nodes; i++) {
|
|
struct syscall_node *node = &data->nodes[i];
|
|
struct syscall_stats *stat = &node->stats;
|
|
double total = (double)(stat->total_time) / NSEC_PER_MSEC;
|
|
double min = (double)(stat->min_time) / NSEC_PER_MSEC;
|
|
double max = (double)(stat->max_time) / NSEC_PER_MSEC;
|
|
double avg = total / stat->count;
|
|
const char *name;
|
|
|
|
/* TODO: support other ABIs */
|
|
name = syscalltbl__name(EM_HOST, node->syscall_nr);
|
|
if (name)
|
|
printed += fprintf(fp, " %-15s", name);
|
|
else
|
|
printed += fprintf(fp, " syscall:%-7d", node->syscall_nr);
|
|
|
|
printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
|
|
stat->count, stat->error, total, min, avg, max,
|
|
rel_stddev(stat));
|
|
}
|
|
return printed;
|
|
}
|
|
|
|
static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
|
|
struct syscall_stats *map_data)
|
|
{
|
|
struct syscall_data *data;
|
|
struct syscall_node *nodes;
|
|
|
|
if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
|
|
data = zalloc(sizeof(*data));
|
|
if (data == NULL)
|
|
return -ENOMEM;
|
|
|
|
data->key = map_key->cpu_or_tid;
|
|
if (hashmap__add(hash, data->key, data) < 0) {
|
|
free(data);
|
|
return -ENOMEM;
|
|
}
|
|
}
|
|
|
|
/* update thread total stats */
|
|
data->nr_events += map_data->count;
|
|
data->total_time += map_data->total_time;
|
|
|
|
nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
|
|
if (nodes == NULL)
|
|
return -ENOMEM;
|
|
|
|
data->nodes = nodes;
|
|
nodes = &data->nodes[data->nr_nodes++];
|
|
nodes->syscall_nr = map_key->nr;
|
|
|
|
/* each thread has an entry for each syscall, just use the stat */
|
|
memcpy(&nodes->stats, map_data, sizeof(*map_data));
|
|
return 0;
|
|
}
|
|
|
|
static int print_thread_stat(struct syscall_data *data, FILE *fp)
|
|
{
|
|
int printed = 0;
|
|
|
|
qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
|
|
|
|
printed += fprintf(fp, " thread (%d), ", (int)data->key);
|
|
printed += fprintf(fp, "%d events\n\n", data->nr_events);
|
|
|
|
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
|
|
printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
|
|
printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
|
|
|
|
printed += print_common_stats(data, fp);
|
|
printed += fprintf(fp, "\n\n");
|
|
|
|
return printed;
|
|
}
|
|
|
|
static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
|
|
{
|
|
int printed = 0;
|
|
|
|
for (int i = 0; i < nr_data; i++)
|
|
printed += print_thread_stat(data[i], fp);
|
|
|
|
return printed;
|
|
}
|
|
|
|
static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
|
|
struct syscall_stats *map_data)
|
|
{
|
|
struct syscall_data *data;
|
|
struct syscall_stats *stat;
|
|
|
|
if (!hashmap__find(hash, map_key->nr, &data)) {
|
|
data = zalloc(sizeof(*data));
|
|
if (data == NULL)
|
|
return -ENOMEM;
|
|
|
|
data->nodes = zalloc(sizeof(*data->nodes));
|
|
if (data->nodes == NULL) {
|
|
free(data);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
data->nr_nodes = 1;
|
|
data->key = map_key->nr;
|
|
data->nodes->syscall_nr = data->key;
|
|
|
|
if (hashmap__add(hash, data->key, data) < 0) {
|
|
free(data->nodes);
|
|
free(data);
|
|
return -ENOMEM;
|
|
}
|
|
}
|
|
|
|
/* update total stats for this syscall */
|
|
data->nr_events += map_data->count;
|
|
data->total_time += map_data->total_time;
|
|
|
|
/* This is sum of the same syscall from different CPUs */
|
|
stat = &data->nodes->stats;
|
|
|
|
stat->total_time += map_data->total_time;
|
|
stat->squared_sum += map_data->squared_sum;
|
|
stat->count += map_data->count;
|
|
stat->error += map_data->error;
|
|
|
|
if (stat->max_time < map_data->max_time)
|
|
stat->max_time = map_data->max_time;
|
|
if (stat->min_time > map_data->min_time || stat->min_time == 0)
|
|
stat->min_time = map_data->min_time;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
|
|
{
|
|
int printed = 0;
|
|
int nr_events = 0;
|
|
|
|
for (int i = 0; i < nr_data; i++)
|
|
nr_events += data[i]->nr_events;
|
|
|
|
printed += fprintf(fp, " total, %d events\n\n", nr_events);
|
|
|
|
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
|
|
printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
|
|
printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
|
|
|
|
for (int i = 0; i < nr_data; i++)
|
|
printed += print_common_stats(data[i], fp);
|
|
|
|
printed += fprintf(fp, "\n\n");
|
|
return printed;
|
|
}
|
|
|
|
static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
|
|
struct syscall_stats *map_data)
|
|
{
|
|
struct syscall_data *data;
|
|
struct syscall_node *nodes;
|
|
|
|
if (!hashmap__find(hash, map_key->cgroup, &data)) {
|
|
data = zalloc(sizeof(*data));
|
|
if (data == NULL)
|
|
return -ENOMEM;
|
|
|
|
data->key = map_key->cgroup;
|
|
if (hashmap__add(hash, data->key, data) < 0) {
|
|
free(data);
|
|
return -ENOMEM;
|
|
}
|
|
}
|
|
|
|
/* update thread total stats */
|
|
data->nr_events += map_data->count;
|
|
data->total_time += map_data->total_time;
|
|
|
|
nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
|
|
if (nodes == NULL)
|
|
return -ENOMEM;
|
|
|
|
data->nodes = nodes;
|
|
nodes = &data->nodes[data->nr_nodes++];
|
|
nodes->syscall_nr = map_key->nr;
|
|
|
|
/* each thread has an entry for each syscall, just use the stat */
|
|
memcpy(&nodes->stats, map_data, sizeof(*map_data));
|
|
return 0;
|
|
}
|
|
|
|
static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
|
|
{
|
|
int printed = 0;
|
|
struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
|
|
|
|
qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
|
|
|
|
if (cgrp)
|
|
printed += fprintf(fp, " cgroup %s,", cgrp->name);
|
|
else
|
|
printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
|
|
|
|
printed += fprintf(fp, " %d events\n\n", data->nr_events);
|
|
|
|
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
|
|
printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
|
|
printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
|
|
|
|
printed += print_common_stats(data, fp);
|
|
printed += fprintf(fp, "\n\n");
|
|
|
|
return printed;
|
|
}
|
|
|
|
static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
|
|
{
|
|
int printed = 0;
|
|
|
|
for (int i = 0; i < nr_data; i++)
|
|
printed += print_cgroup_stat(data[i], fp);
|
|
|
|
return printed;
|
|
}
|
|
|
|
int trace_print_bpf_summary(FILE *fp)
|
|
{
|
|
struct bpf_map *map = skel->maps.syscall_stats_map;
|
|
struct syscall_key *prev_key, key;
|
|
struct syscall_data **data = NULL;
|
|
struct hashmap schash;
|
|
struct hashmap_entry *entry;
|
|
int nr_data = 0;
|
|
int printed = 0;
|
|
int i;
|
|
size_t bkt;
|
|
|
|
hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
|
|
|
|
printed = fprintf(fp, "\n Summary of events:\n\n");
|
|
|
|
/* get stats from the bpf map */
|
|
prev_key = NULL;
|
|
while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
|
|
struct syscall_stats stat;
|
|
|
|
if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
|
|
switch (skel->rodata->aggr_mode) {
|
|
case SYSCALL_AGGR_THREAD:
|
|
update_thread_stats(&schash, &key, &stat);
|
|
break;
|
|
case SYSCALL_AGGR_CPU:
|
|
update_total_stats(&schash, &key, &stat);
|
|
break;
|
|
case SYSCALL_AGGR_CGROUP:
|
|
update_cgroup_stats(&schash, &key, &stat);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
prev_key = &key;
|
|
}
|
|
|
|
nr_data = hashmap__size(&schash);
|
|
data = calloc(nr_data, sizeof(*data));
|
|
if (data == NULL)
|
|
goto out;
|
|
|
|
i = 0;
|
|
hashmap__for_each_entry(&schash, entry, bkt)
|
|
data[i++] = entry->pvalue;
|
|
|
|
qsort(data, nr_data, sizeof(*data), datacmp);
|
|
|
|
switch (skel->rodata->aggr_mode) {
|
|
case SYSCALL_AGGR_THREAD:
|
|
printed += print_thread_stats(data, nr_data, fp);
|
|
break;
|
|
case SYSCALL_AGGR_CPU:
|
|
printed += print_total_stats(data, nr_data, fp);
|
|
break;
|
|
case SYSCALL_AGGR_CGROUP:
|
|
printed += print_cgroup_stats(data, nr_data, fp);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
for (i = 0; i < nr_data && data; i++) {
|
|
free(data[i]->nodes);
|
|
free(data[i]);
|
|
}
|
|
free(data);
|
|
|
|
out:
|
|
hashmap__clear(&schash);
|
|
return printed;
|
|
}
|
|
|
|
void trace_cleanup_bpf_summary(void)
|
|
{
|
|
if (!RB_EMPTY_ROOT(&cgroups)) {
|
|
struct cgroup *cgrp, *tmp;
|
|
|
|
rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
|
|
cgroup__put(cgrp);
|
|
|
|
cgroups = RB_ROOT;
|
|
}
|
|
|
|
syscall_summary_bpf__destroy(skel);
|
|
}
|