mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

Add multi-uprobe and multi-uretprobe benchmarks to the bench tool. Multi- and classic uprobes/uretprobes have different low-level triggering code paths, so it's sometimes important to be able to benchmark both flavors of uprobes/uretprobes.

Sample examples from my dev machine below. Single-threaded performance almost doesn't differ, but with more parallel CPUs triggering the same uprobe/uretprobe the difference grows. This might be due to [0], but given the code is slightly different, there could be other sources of slowdown.

Note, all these numbers will change due to ongoing work to improve uprobe/uretprobe scalability (e.g., [1]), but having a benchmark like this is useful for measurements and debugging nevertheless.

#!/bin/bash
set -eufo pipefail
for p in 1 8 16 32; do
    for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do
        summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1)
        total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-)
        percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1)
        printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu"
    done
    echo
done

uprobe-nop            ( 1 cpus): 1.020 ± 0.005M/s ( 1.020M/s/cpu)
uretprobe-nop         ( 1 cpus): 0.515 ± 0.009M/s ( 0.515M/s/cpu)
uprobe-multi-nop      ( 1 cpus): 1.036 ± 0.004M/s ( 1.036M/s/cpu)
uretprobe-multi-nop   ( 1 cpus): 0.512 ± 0.005M/s ( 0.512M/s/cpu)
uprobe-nop            ( 8 cpus): 3.481 ± 0.030M/s ( 0.435M/s/cpu)
uretprobe-nop         ( 8 cpus): 2.222 ± 0.008M/s ( 0.278M/s/cpu)
uprobe-multi-nop      ( 8 cpus): 3.769 ± 0.094M/s ( 0.471M/s/cpu)
uretprobe-multi-nop   ( 8 cpus): 2.482 ± 0.007M/s ( 0.310M/s/cpu)
uprobe-nop            (16 cpus): 2.968 ± 0.011M/s ( 0.185M/s/cpu)
uretprobe-nop         (16 cpus): 1.870 ± 0.002M/s ( 0.117M/s/cpu)
uprobe-multi-nop      (16 cpus): 3.541 ± 0.037M/s ( 0.221M/s/cpu)
uretprobe-multi-nop   (16 cpus): 2.123 ± 0.026M/s ( 0.133M/s/cpu)
uprobe-nop            (32 cpus): 2.524 ± 0.026M/s ( 0.079M/s/cpu)
uretprobe-nop         (32 cpus): 1.572 ± 0.003M/s ( 0.049M/s/cpu)
uprobe-multi-nop      (32 cpus): 2.717 ± 0.003M/s ( 0.085M/s/cpu)
uretprobe-multi-nop   (32 cpus): 1.687 ± 0.007M/s ( 0.053M/s/cpu)

[0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/
[1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240806042935.3867862-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
140 lines
2.3 KiB
C
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook

/* BPF side of the trigger benchmark: a collection of trivial programs, one
 * per attach flavor, each bumping a per-CPU hit counter that the userspace
 * benchmark driver reads to measure triggering throughput.
 */
#include <linux/bpf.h>
#include <asm/unistd.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"

char _license[] SEC("license") = "GPL";

#define CPU_MASK 255
#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */

/* matches struct counter in bench.h */
struct counter {
	long value;
} __attribute__((aligned(128))); /* padded to its own cache line(s) to avoid false sharing */

/* one counter slot per CPU; incremented by the programs below */
struct counter hits[MAX_CPUS];
|
|
|
|
static __always_inline void inc_counter(void)
|
|
{
|
|
int cpu = bpf_get_smp_processor_id();
|
|
|
|
__sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
|
|
}
|
|
|
|
/* classic (single) uprobe flavor; user space picks the attach point */
SEC("?uprobe")
int bench_trigger_uprobe(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* multi-uprobe flavor; same body as the classic one, but attached via the
 * uprobe.multi link type, which has a different low-level trigger path
 */
SEC("?uprobe.multi")
int bench_trigger_uprobe_multi(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* iterations per driver invocation; const volatile so user space can set it
 * before load and the verifier treats it as a known read-only value
 */
const volatile int batch_iters = 0;
|
|
|
|
/* Batched counting program: bumps the per-CPU counter batch_iters times per
 * invocation, amortizing the program-invocation overhead.
 */
SEC("?raw_tp")
int trigger_count(void *ctx)
{
	int left;

	for (left = batch_iters; left > 0; left--)
		inc_counter();

	return 0;
}
|
|
|
|
/* Driver program: repeatedly calls bpf_get_numa_node_id(), which serves as
 * the attach point for the kprobe/kretprobe/fentry/fexit benchmark programs
 * below; its return value is deliberately ignored.
 */
SEC("?raw_tp")
int trigger_driver(void *ctx)
{
	int n = 0;

	while (n++ < batch_iters)
		(void)bpf_get_numa_node_id(); /* attach point for benchmarking */

	return 0;
}
|
|
|
|
/* kernel test kfunc used as an attach point; declared __weak so this object
 * still loads on kernels that don't export it
 */
extern int bpf_modify_return_test_tp(int nonce) __ksym __weak;

/* Driver for the fmod_ret benchmark: calls the test kfunc batch_iters times
 * so the fmod_ret program attached to it (bench_trigger_fmodret below) fires
 * on every call.
 */
SEC("?raw_tp")
int trigger_driver_kfunc(void *ctx)
{
	int i;

	for (i = 0; i < batch_iters; i++)
		(void)bpf_modify_return_test_tp(0); /* attach point for benchmarking */

	return 0;
}
|
|
|
|
/* classic kprobe flavor, attached to the driver's helper call */
SEC("?kprobe/bpf_get_numa_node_id")
int bench_trigger_kprobe(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* classic kretprobe flavor, fires on return from the driver's helper call */
SEC("?kretprobe/bpf_get_numa_node_id")
int bench_trigger_kretprobe(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* kprobe.multi flavor; same target, different (multi-link) trigger path */
SEC("?kprobe.multi/bpf_get_numa_node_id")
int bench_trigger_kprobe_multi(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* kretprobe.multi flavor; return-probe counterpart of kprobe.multi */
SEC("?kretprobe.multi/bpf_get_numa_node_id")
int bench_trigger_kretprobe_multi(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* fentry (trampoline-based) flavor on the same driver helper */
SEC("?fentry/bpf_get_numa_node_id")
int bench_trigger_fentry(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* fexit (trampoline-based) flavor, fires on helper return */
SEC("?fexit/bpf_get_numa_node_id")
int bench_trigger_fexit(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* fmod_ret flavor attached to the test kfunc driven by trigger_driver_kfunc */
SEC("?fmod_ret/bpf_modify_return_test_tp")
int bench_trigger_fmodret(void *ctx)
{
	inc_counter();
	return -22; /* -EINVAL */
}
|
|
|
|
/* classic tracepoint flavor on the bpf_trigger_tp test tracepoint */
SEC("?tp/bpf_test_run/bpf_trigger_tp")
int bench_trigger_tp(void *ctx)
{
	inc_counter();
	return 0;
}
|
|
|
|
/* raw tracepoint flavor on the same test tracepoint */
SEC("?raw_tp/bpf_trigger_tp")
int bench_trigger_rawtp(void *ctx)
{
	inc_counter();
	return 0;
}
|