The struct zone is embedded in struct pglist_data, which can be allocated for
each NUMA node early in the boot process.  As it's neither a slab object nor a
global lock, it was not symbolized.  Since the zone->lock is often contended,
it'd be nice if we could symbolize it.

On NUMA systems, the node_data array holds pointers to each node's struct
pglist_data.  By following those pointers, the address of each zone and its
lock can be calculated using BTF.  On UMA, it can just use contig_page_data
and its zones.

The following example shows the zone lock contention at the end:

  $ sudo ./perf lock con -abl -E 5 -- ./perf bench sched messaging
  # Running 'sched/messaging' benchmark:
  # 20 sender and receiver processes per group
  # 10 groups == 400 processes run

       Total time: 0.038 [sec]
   contended   total wait     max wait     avg wait            address   symbol
        5167     18.17 ms     10.27 us      3.52 us   ffff953340052d00   &kmem_cache_node (spinlock)
          38     11.75 ms    465.49 us    309.13 us   ffff95334060c480   &sock_inode_cache (spinlock)
        3916     10.13 ms     10.43 us      2.59 us   ffff953342aecb40   &kmem_cache_node (spinlock)
        2963     10.02 ms     13.75 us      3.38 us   ffff9533d2344098   &kmalloc-rnd-08-2k (spinlock)
         216      5.05 ms     99.49 us     23.39 us   ffff9542bf7d65d0   zone_lock (spinlock)

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: bpf@vger.kernel.org
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20250401063055.7431-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
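The calculation described above is plain pointer arithmetic once the layout is
known.  Here is a minimal sketch of that address math, assuming the struct
offsets are resolved from the running kernel's BTF and the pglist_data
addresses come from the node_data array (or contig_page_data on UMA); the
helper name and parameters below are illustrative, not the actual perf code:

#include <stdint.h>

/*
 * Illustrative only -- not the perf implementation.  pgdat_addr is the
 * address of one node's struct pglist_data; node_zones_off, zone_size and
 * zone_lock_off are offsets/sizes taken from the running kernel's BTF, so
 * they match the real layout rather than any local header.
 */
static uint64_t zone_lock_addr(uint64_t pgdat_addr, uint64_t node_zones_off,
			       uint64_t zone_size, uint64_t zone_lock_off,
			       unsigned int zone_idx)
{
	/* &pgdat->node_zones[zone_idx].lock */
	return pgdat_addr + node_zones_off + zone_idx * zone_size + zone_lock_off;
}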
#ifndef __VMLINUX_H
#define __VMLINUX_H

#include <linux/stddef.h> // for define __always_inline
#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/perf_event.h>
#include <stdbool.h>

// non-UAPI kernel data structures, used in the .bpf.c BPF tool component.

// Just the fields used in these tools preserving the access index so that
// libbpf can fixup offsets with the ones used in the kernel when loading the
// BPF bytecode, if they differ from what is used here.

typedef __u8 u8;
typedef __u32 u32;
typedef __s32 s32;
typedef __u64 u64;
typedef __s64 s64;

typedef int pid_t;

typedef __s64 time64_t;

struct timespec64 {
	time64_t tv_sec;
	long int tv_nsec;
};

enum cgroup_subsys_id {
	perf_event_cgrp_id = 8,
};

enum {
	HI_SOFTIRQ = 0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,
	NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ,
	IRQ_POLL_SOFTIRQ,
	TASKLET_SOFTIRQ,
	SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ,
	RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */

	NR_SOFTIRQS
};

typedef struct {
	s64 counter;
} __attribute__((preserve_access_index)) atomic64_t;

typedef atomic64_t atomic_long_t;

struct raw_spinlock {
	int rawlock;
} __attribute__((preserve_access_index));

typedef struct raw_spinlock raw_spinlock_t;

typedef struct {
	struct raw_spinlock rlock;
} __attribute__((preserve_access_index)) spinlock_t;

struct sighand_struct {
	spinlock_t siglock;
} __attribute__((preserve_access_index));

struct rw_semaphore {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct mutex {
	atomic_long_t owner;
} __attribute__((preserve_access_index));

struct kernfs_node {
	u64 id;
} __attribute__((preserve_access_index));

struct cgroup {
	struct kernfs_node *kn;
	int level;
} __attribute__((preserve_access_index));

struct cgroup_subsys_state {
	struct cgroup *cgroup;
} __attribute__((preserve_access_index));

struct css_set {
	struct cgroup_subsys_state *subsys[13];
	struct cgroup *dfl_cgrp;
} __attribute__((preserve_access_index));

struct mm_struct {
	struct rw_semaphore mmap_lock;
} __attribute__((preserve_access_index));

struct task_struct {
	unsigned int flags;
	struct mm_struct *mm;
	pid_t pid;
	pid_t tgid;
	char comm[16];
	struct sighand_struct *sighand;
	struct css_set *cgroups;
} __attribute__((preserve_access_index));

struct trace_entry {
	short unsigned int type;
	unsigned char flags;
	unsigned char preempt_count;
	int pid;
} __attribute__((preserve_access_index));

struct trace_event_raw_irq_handler_entry {
	struct trace_entry ent;
	int irq;
	u32 __data_loc_name;
	char __data[];
} __attribute__((preserve_access_index));

struct trace_event_raw_irq_handler_exit {
	struct trace_entry ent;
	int irq;
	int ret;
	char __data[];
} __attribute__((preserve_access_index));

struct trace_event_raw_softirq {
	struct trace_entry ent;
	unsigned int vec;
	char __data[];
} __attribute__((preserve_access_index));

struct trace_event_raw_workqueue_execute_start {
	struct trace_entry ent;
	void *work;
	void *function;
	char __data[];
} __attribute__((preserve_access_index));

struct trace_event_raw_workqueue_execute_end {
	struct trace_entry ent;
	void *work;
	void *function;
	char __data[];
} __attribute__((preserve_access_index));

struct trace_event_raw_workqueue_activate_work {
	struct trace_entry ent;
	void *work;
	char __data[];
} __attribute__((preserve_access_index));

struct perf_sample_data {
	u64 addr;
	u64 period;
	union perf_sample_weight weight;
	u64 txn;
	union perf_mem_data_src data_src;
	u64 ip;
	struct {
		u32 pid;
		u32 tid;
	} tid_entry;
	u64 time;
	u64 id;
	struct {
		u32 cpu;
	} cpu_entry;
	u64 phys_addr;
	u64 cgroup;
	u64 data_page_size;
	u64 code_page_size;
} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));

struct perf_event {
	struct perf_event *parent;
	u64 id;
} __attribute__((preserve_access_index));

struct bpf_perf_event_data_kern {
	struct perf_sample_data *data;
	struct perf_event *event;
} __attribute__((preserve_access_index));

/*
 * If 'struct rq' isn't defined for lock_contention.bpf.c, for the sake of
 * rq___old and rq___new, then the type for the 'runqueue' variable ends up
 * being a forward declaration (BTF_KIND_FWD) while the kernel has it defined
 * (BTF_KIND_STRUCT). The definition appears in vmlinux.h rather than
 * lock_contention.bpf.c for consistency with a generated vmlinux.h.
 */
struct rq {};

struct kmem_cache {
	const char *name;
} __attribute__((preserve_access_index));

struct bpf_iter__kmem_cache {
	struct kmem_cache *s;
} __attribute__((preserve_access_index));

struct zone {
	spinlock_t lock;
} __attribute__((preserve_access_index));

struct pglist_data {
	struct zone node_zones[6]; /* value for all possible config */
	int nr_zones;
} __attribute__((preserve_access_index));

#endif // __VMLINUX_H
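As a usage sketch (not part of the header itself): a .bpf.c tool built against
these definitions reads the declared fields with CO-RE, and the
preserve_access_index attribute lets libbpf relocate the field offsets to
match the running kernel at load time.  The attach point and field below are
arbitrary choices for illustration only:

// Illustrative example; assumes the usual libbpf headers are available.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("tracepoint/sched/sched_switch")	/* arbitrary attach point for the example */
int show_current_tgid(void *ctx)
{
	struct task_struct *task = (void *)bpf_get_current_task();
	pid_t tgid;

	/*
	 * CO-RE-relocated read: libbpf fixes up the offset of 'tgid' at load
	 * time, so the minimal task_struct above only needs to name the
	 * field, not match the kernel's full layout.
	 */
	tgid = BPF_CORE_READ(task, tgid);
	bpf_printk("tgid=%d", tgid);
	return 0;
}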