linux/tools/perf/util/intel-tpebs.c
Ian Rogers 8882095b1d perf sample: Remove arch notion of sample parsing
By definition, arch-specific sample parsing and synthesis inhibit certain
kinds of cross-platform record-then-analysis (report, script,
etc.). Remove arch_perf_parse_sample_weight and
arch_perf_synthesize_sample_weight, replacing them with a common
implementation. Combine the perf_sample p_stage_cyc and retire_lat
fields into weight3 to capture the differing uses regardless of the
architecture compiled for.

Signed-off-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20250724163302.596743-21-irogers@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
2025-07-25 10:37:58 -07:00

// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_tpebs.c: Intel TPEBS support
 */
#include <api/fs/fs.h>
#include <sys/param.h>
#include <subcmd/run-command.h>
#include <thread.h>
#include "intel-tpebs.h"
#include <linux/list.h>
#include <linux/zalloc.h>
#include <linux/err.h>
#include "sample.h"
#include "counts.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "mutex.h"
#include "session.h"
#include "stat.h"
#include "tool.h"
#include "cpumap.h"
#include "metricgroup.h"
#include "stat.h"
#include <sys/stat.h>
#include <sys/file.h>
#include <poll.h>
#include <math.h>
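
/*
 * "-" makes the forked perf record write its data to stdout. start_command()
 * (with tpebs_cmd.out = -1) turns the child's stdout into a pipe whose read
 * end, tpebs_cmd.out, feeds the sample reader thread.
 */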
#define PERF_DATA "-"

bool tpebs_recording;
enum tpebs_mode tpebs_mode;
static LIST_HEAD(tpebs_results);
static pthread_t tpebs_reader_thread;
static struct child_process tpebs_cmd;
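/*
 * Pipe pairs for perf record's --control=fd:<ctl>,<ack> interface: commands
 * are written to control_fd[1] and acknowledgements are read from ack_fd[0].
 */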
static int control_fd[2], ack_fd[2];
static struct mutex tpebs_mtx;

struct tpebs_retire_lat {
	struct list_head nd;
	/** @evsel: The evsel that opened the retire_lat event. */
	struct evsel *evsel;
	/** @event: Event passed to perf record. */
	char *event;
	/** @stats: Recorded retirement latency stats. */
	struct stats stats;
	/** @last: Last retirement latency read. */
	uint64_t last;
	/** @started: Has the event been sent to perf record? */
	bool started;
};
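
/*
 * tpebs_mtx guards the tpebs state above; it is created on first use via
 * pthread_once so no explicit init call is needed.
 */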
static void tpebs_mtx_init(void)
{
	mutex_init(&tpebs_mtx);
}

static struct mutex *tpebs_mtx_get(void)
{
	static pthread_once_t tpebs_mtx_once = PTHREAD_ONCE_INIT;

	pthread_once(&tpebs_mtx_once, tpebs_mtx_init);
	return &tpebs_mtx;
}

static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
	EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get());
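
/*
 * Build the argv for a background "perf record -W --synth=no --control=fd:..."
 * command with one -e option per prepared retirement latency event, then fork
 * it via start_command(). Samples are written to stdout (PERF_DATA).
 */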
static int evsel__tpebs_start_perf_record(struct evsel *evsel)
{
	const char **record_argv;
	int tpebs_event_size = 0, i = 0, ret;
	char control_fd_buf[32];
	char cpumap_buf[50];
	struct tpebs_retire_lat *t;

	list_for_each_entry(t, &tpebs_results, nd)
		tpebs_event_size++;

	record_argv = malloc((10 + 2 * tpebs_event_size) * sizeof(*record_argv));
	if (!record_argv)
		return -ENOMEM;

	record_argv[i++] = "perf";
	record_argv[i++] = "record";
	record_argv[i++] = "-W";
	record_argv[i++] = "--synth=no";

	scnprintf(control_fd_buf, sizeof(control_fd_buf), "--control=fd:%d,%d",
		  control_fd[0], ack_fd[1]);
	record_argv[i++] = control_fd_buf;

	record_argv[i++] = "-o";
	record_argv[i++] = PERF_DATA;

	if (!perf_cpu_map__is_any_cpu_or_is_empty(evsel->evlist->core.user_requested_cpus)) {
		cpu_map__snprint(evsel->evlist->core.user_requested_cpus, cpumap_buf,
				 sizeof(cpumap_buf));
		record_argv[i++] = "-C";
		record_argv[i++] = cpumap_buf;
	}

	list_for_each_entry(t, &tpebs_results, nd) {
		record_argv[i++] = "-e";
		record_argv[i++] = t->event;
	}
	record_argv[i++] = NULL;
	assert(i == 10 + 2 * tpebs_event_size || i == 8 + 2 * tpebs_event_size);
	/* Note, no workload given so system wide is implied. */

	assert(tpebs_cmd.pid == 0);
	tpebs_cmd.argv = record_argv;
	tpebs_cmd.out = -1;
	ret = start_command(&tpebs_cmd);
	zfree(&tpebs_cmd.argv);
	list_for_each_entry(t, &tpebs_results, nd)
		t->started = true;
	return ret;
}
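
/*
 * Walk the PPid: entries in /proc/<pid>/status to determine whether child is
 * a descendant of parent.
 */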
static bool is_child_pid(pid_t parent, pid_t child)
{
	if (parent < 0 || child < 0)
		return false;

	while (true) {
		char path[PATH_MAX];
		char line[256];
		FILE *fp;

new_child:
		if (parent == child)
			return true;

		if (child <= 0)
			return false;

		scnprintf(path, sizeof(path), "%s/%d/status", procfs__mountpoint(), child);
		fp = fopen(path, "r");
		if (!fp) {
			/* Presumably the process went away. Assume not a child. */
			return false;
		}
		while (fgets(line, sizeof(line), fp) != NULL) {
			if (strncmp(line, "PPid:", 5) == 0) {
				fclose(fp);
				if (sscanf(line + 5, "%d", &child) != 1) {
					/* Unexpected error parsing. */
					return false;
				}
				goto new_child;
			}
		}
		/* Unexpected EOF. */
		fclose(fp);
		return false;
	}
}
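
/*
 * Filter out samples that do not come from the workload process or, when
 * inherit is set, one of its descendants.
 */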
static bool should_ignore_sample(const struct perf_sample *sample, const struct tpebs_retire_lat *t)
{
	pid_t workload_pid, sample_pid = sample->pid;

	/*
	 * During evlist__purge the evlist will be removed prior to the
	 * evsel__exit calling evsel__tpebs_close and taking the
	 * tpebs_mtx. Avoid a segfault by ignoring samples in this case.
	 */
	if (t->evsel->evlist == NULL)
		return true;

	workload_pid = t->evsel->evlist->workload.pid;
	if (workload_pid < 0 || workload_pid == sample_pid)
		return false;

	if (!t->evsel->core.attr.inherit)
		return true;

	return !is_child_pid(workload_pid, sample_pid);
}
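
/*
 * Callback for each PERF_RECORD_SAMPLE read back from the forked perf record:
 * accumulate the sample's weight3 (retirement latency) into the matching
 * tpebs_retire_lat's stats.
 */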
static int process_sample_event(const struct perf_tool *tool __maybe_unused,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample,
				struct evsel *evsel,
				struct machine *machine __maybe_unused)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	if (tpebs_cmd.pid == 0) {
		/* Record has terminated. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	t = tpebs_retire_lat__find(evsel);
	if (!t) {
		mutex_unlock(tpebs_mtx_get());
		return -EINVAL;
	}
	if (should_ignore_sample(sample, t)) {
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	/*
	 * Need to handle per-core results? We are assuming the average retire
	 * latency value will be used. Save the number of samples and the sum of
	 * retire latency values for each event.
	 */
	t->last = sample->weight3;
	update_stats(&t->stats, sample->weight3);
	mutex_unlock(tpebs_mtx_get());
	return 0;
}
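
/* Process only feature events known to this perf; silently ignore newer ones. */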
static int process_feature_event(struct perf_session *session,
				 union perf_event *event)
{
	if (event->feat.feat_id < HEADER_LAST_FEATURE)
		return perf_event__process_feature(session, event);
	return 0;
}
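
/*
 * Reader thread body: treat the pipe from the forked perf record as a
 * perf.data session and process its events until EOF.
 */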
static void *__sample_reader(void *arg __maybe_unused)
{
	struct perf_session *session;
	struct perf_data data = {
		.mode = PERF_DATA_MODE_READ,
		.path = PERF_DATA,
		.file.fd = tpebs_cmd.out,
	};
	struct perf_tool tool;

	perf_tool__init(&tool, /*ordered_events=*/false);
	tool.sample = process_sample_event;
	tool.feature = process_feature_event;
	tool.attr = perf_event__process_attr;

	session = perf_session__new(&data, &tool);
	if (IS_ERR(session))
		return NULL;
	perf_session__process_events(session);
	perf_session__delete(session);
	return NULL;
}
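
/*
 * Write a control command (enable, ping, stop, etc.) to the forked perf
 * record and, except for stop, wait for its acknowledgement. Called and
 * returns with tpebs_mtx held, but drops it while blocking.
 */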
static int tpebs_send_record_cmd(const char *msg) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	struct pollfd pollfd = { .events = POLLIN, };
	int ret, len, retries = 0;
	char ack_buf[8];

	/* Check if the command exited before the send, done with the lock held. */
	if (tpebs_cmd.pid == 0)
		return 0;

	/*
	 * Let go of the lock while sending/receiving as blocking can starve the
	 * sample reading thread.
	 */
	mutex_unlock(tpebs_mtx_get());

	/* Send the perf record command. */
	len = strlen(msg);
	ret = write(control_fd[1], msg, len);
	if (ret != len) {
		pr_err("perf record control write control message '%s' failed\n", msg);
		ret = -EPIPE;
		goto out;
	}

	if (!strcmp(msg, EVLIST_CTL_CMD_STOP_TAG)) {
		ret = 0;
		goto out;
	}

	/* Wait for an ack. */
	pollfd.fd = ack_fd[0];

	/*
	 * We need this poll to ensure the ack_fd PIPE will not hang
	 * when perf record failed for any reason. The 500ms timeout
	 * with up to 6 retries (~3000ms total) is an empirical selection.
	 */
again:
	if (!poll(&pollfd, 1, 500)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}
		if (retries++ < 6)
			goto again;
		pr_err("tpebs failed: perf record ack timeout for '%s'\n", msg);
		ret = -ETIMEDOUT;
		goto out;
	}

	if (!(pollfd.revents & POLLIN)) {
		if (check_if_command_finished(&tpebs_cmd)) {
			ret = 0;
			goto out;
		}
		pr_err("tpebs failed: did not receive an ack for '%s'\n", msg);
		ret = -EPIPE;
		goto out;
	}

	ret = read(ack_fd[0], ack_buf, sizeof(ack_buf));
	if (ret > 0)
		ret = strcmp(ack_buf, EVLIST_CTL_CMD_ACK_TAG);
	else
		pr_err("tpebs: perf record control ack failed\n");
out:
	/* Re-take lock as expected by caller. */
	mutex_lock(tpebs_mtx_get());
	return ret;
}

/*
 * tpebs_stop - stop the sample data read thread and the perf record process.
 */
static int tpebs_stop(void) EXCLUSIVE_LOCKS_REQUIRED(tpebs_mtx_get())
{
	int ret = 0;

	/* Like the start in evsel__tpebs_open, stopping should only happen once. */
	if (tpebs_cmd.pid != 0) {
		tpebs_send_record_cmd(EVLIST_CTL_CMD_STOP_TAG);
		tpebs_cmd.pid = 0;
		mutex_unlock(tpebs_mtx_get());
		pthread_join(tpebs_reader_thread, NULL);
		mutex_lock(tpebs_mtx_get());
		close(control_fd[0]);
		close(control_fd[1]);
		close(ack_fd[0]);
		close(ack_fd[1]);
		close(tpebs_cmd.out);
		ret = finish_command(&tpebs_cmd);
		tpebs_cmd.pid = 0;
		if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
			ret = 0;
	}
	return ret;
}

/**
 * evsel__tpebs_event() - Create string event encoding to pass to `perf record`.
 */
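/*
 * For example, an evsel named "event_name:R" is encoded as
 * "event_name/name=tpebs_event_<evsel pointer>/p", replacing the retirement
 * latency modifier 'R' with the precise modifier 'p' and embedding the evsel
 * pointer so samples can be matched back in tpebs_retire_lat__find().
 */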
static int evsel__tpebs_event(struct evsel *evsel, char **event)
{
	char *name, *modifier;
	int ret;

	name = strdup(evsel->name);
	if (!name)
		return -ENOMEM;

	modifier = strrchr(name, 'R');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = 'p';
	modifier = strchr(name, ':');
	if (!modifier)
		modifier = strrchr(name, '/');
	if (!modifier) {
		ret = -EINVAL;
		goto out;
	}
	*modifier = '\0';
	if (asprintf(event, "%s/name=tpebs_event_%p/%s", name, evsel, modifier + 1) > 0)
		ret = 0;
	else
		ret = -ENOMEM;
out:
	if (ret)
		pr_err("Tpebs event modifier broken '%s'\n", evsel->name);
	free(name);
	return ret;
}
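
/*
 * Allocate a tpebs_retire_lat for evsel, encoding the perf record event
 * string at the same time.
 */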
static struct tpebs_retire_lat *tpebs_retire_lat__new(struct evsel *evsel)
{
	struct tpebs_retire_lat *result = zalloc(sizeof(*result));
	int ret;

	if (!result)
		return NULL;

	ret = evsel__tpebs_event(evsel, &result->event);
	if (ret) {
		free(result);
		return NULL;
	}
	result->evsel = evsel;
	return result;
}

static void tpebs_retire_lat__delete(struct tpebs_retire_lat *r)
{
	zfree(&r->event);
	free(r);
}
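
/*
 * Look up a tpebs_retire_lat either by evsel pointer (for retire_lat evsels
 * on the caller's evlist) or by the pointer value encoded in a
 * "tpebs_event_" name (for evsels read back from perf record).
 */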
static struct tpebs_retire_lat *tpebs_retire_lat__find(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;
	unsigned long num;
	const char *evsel_name;

	/*
	 * Evsels from the evlist with the retirement latency event match by
	 * pointer. Names with the "tpebs_event_" prefix will be present on
	 * events being read from `perf record`.
	 */
	if (evsel__is_retire_lat(evsel)) {
		list_for_each_entry(t, &tpebs_results, nd) {
			if (t->evsel == evsel)
				return t;
		}
		return NULL;
	}
	evsel_name = strstr(evsel->name, "tpebs_event_");
	if (!evsel_name) {
		/* Unexpected that the perf record should have other events. */
		return NULL;
	}
	errno = 0;
	num = strtoull(evsel_name + 12, NULL, 16);
	if (errno) {
		pr_err("Bad evsel for tpebs find '%s'\n", evsel->name);
		return NULL;
	}
	list_for_each_entry(t, &tpebs_results, nd) {
		if ((unsigned long)t->evsel == num)
			return t;
	}
	return NULL;
}

/**
 * evsel__tpebs_prepare - create tpebs data structures ready for opening.
 * @evsel: retire_latency evsel, all evsels on its list will be prepared.
 */
static int evsel__tpebs_prepare(struct evsel *evsel)
{
	struct evsel *pos;
	struct tpebs_retire_lat *tpebs_event;

	mutex_lock(tpebs_mtx_get());
	tpebs_event = tpebs_retire_lat__find(evsel);
	if (tpebs_event) {
		/* The evsel, or an identically named one, was already prepared. */
		mutex_unlock(tpebs_mtx_get());
		return 0;
	}
	tpebs_event = tpebs_retire_lat__new(evsel);
	if (!tpebs_event) {
		mutex_unlock(tpebs_mtx_get());
		return -ENOMEM;
	}
	list_add_tail(&tpebs_event->nd, &tpebs_results);
	mutex_unlock(tpebs_mtx_get());

	/*
	 * Eagerly prepare all other retire_lat evsels on the list so that, by
	 * open time, they are all known.
	 */
	evlist__for_each_entry(evsel->evlist, pos) {
		int ret;

		if (pos == evsel || !pos->retire_lat)
			continue;

		ret = evsel__tpebs_prepare(pos);
		if (ret)
			return ret;
	}
	return 0;
}

/**
 * evsel__tpebs_open - starts tpebs execution.
 * @evsel: retire_latency evsel, all evsels on its list will be selected. Each
 *         evsel is sampled to get the average retire_latency value.
 */
int evsel__tpebs_open(struct evsel *evsel)
{
	int ret;
	bool tpebs_empty;

	/* Only start recording when tpebs_recording is enabled. */
	if (!tpebs_recording)
		return 0;

	/* Only start the events once. */
	if (tpebs_cmd.pid != 0) {
		struct tpebs_retire_lat *t;
		bool valid;

		mutex_lock(tpebs_mtx_get());
		t = tpebs_retire_lat__find(evsel);
		valid = t && t->started;
		mutex_unlock(tpebs_mtx_get());
		/* May fail as the event wasn't started. */
		return valid ? 0 : -EBUSY;
	}

	ret = evsel__tpebs_prepare(evsel);
	if (ret)
		return ret;

	mutex_lock(tpebs_mtx_get());
	tpebs_empty = list_empty(&tpebs_results);
	if (!tpebs_empty) {
		/* Create the control and ack fds for --control. */
		if (pipe(control_fd) < 0) {
			pr_err("tpebs: Failed to create control fifo");
			ret = -1;
			goto out;
		}
		if (pipe(ack_fd) < 0) {
			pr_err("tpebs: Failed to create ack fifo");
			ret = -1;
			goto out;
		}

		ret = evsel__tpebs_start_perf_record(evsel);
		if (ret)
			goto out;

		if (pthread_create(&tpebs_reader_thread, /*attr=*/NULL, __sample_reader,
				   /*arg=*/NULL)) {
			kill(tpebs_cmd.pid, SIGTERM);
			close(tpebs_cmd.out);
			pr_err("Could not create thread to process sample data.\n");
			ret = -1;
			goto out;
		}
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_ENABLE_TAG);
	}
out:
	if (ret) {
		struct tpebs_retire_lat *t = tpebs_retire_lat__find(evsel);

		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);
	}
	mutex_unlock(tpebs_mtx_get());
	return ret;
}
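
/*
 * Set the retire_lat evsel's count from the samples read so far, according to
 * tpebs_mode (mean/min/max/last). Falls back to the evsel's precomputed
 * retirement_latency values when no samples have arrived.
 */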
int evsel__tpebs_read(struct evsel *evsel, int cpu_map_idx, int thread)
{
	struct perf_counts_values *count, *old_count = NULL;
	struct tpebs_retire_lat *t;
	uint64_t val;
	int ret;

	/* Only set the retire_latency value on the first CPU and thread. */
	if (cpu_map_idx != 0 || thread != 0)
		return 0;

	if (evsel->prev_raw_counts)
		old_count = perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);

	count = perf_counts(evsel->counts, cpu_map_idx, thread);

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	/*
	 * If reading the first tpebs result, send a ping to the record
	 * process. Allow the sample reader a chance to read by releasing and
	 * reacquiring the lock.
	 */
	if (t && &t->nd == tpebs_results.next) {
		ret = tpebs_send_record_cmd(EVLIST_CTL_CMD_PING_TAG);
		mutex_unlock(tpebs_mtx_get());
		if (ret)
			return ret;
		mutex_lock(tpebs_mtx_get());
	}
	if (t == NULL || t->stats.n == 0) {
		/* No sample data, use default. */
		if (tpebs_recording) {
			pr_warning_once(
				"Using precomputed retirement latency data as no samples\n");
		}
		val = 0;
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = rint(evsel->retirement_latency.min);
			break;
		case TPEBS_MODE__MAX:
			val = rint(evsel->retirement_latency.max);
			break;
		default:
		case TPEBS_MODE__LAST:
		case TPEBS_MODE__MEAN:
			val = rint(evsel->retirement_latency.mean);
			break;
		}
	} else {
		switch (tpebs_mode) {
		case TPEBS_MODE__MIN:
			val = t->stats.min;
			break;
		case TPEBS_MODE__MAX:
			val = t->stats.max;
			break;
		case TPEBS_MODE__LAST:
			val = t->last;
			break;
		default:
		case TPEBS_MODE__MEAN:
			val = rint(t->stats.mean);
			break;
		}
	}
	mutex_unlock(tpebs_mtx_get());

	if (old_count) {
		count->val = old_count->val + val;
		count->run = old_count->run + 1;
		count->ena = old_count->ena + 1;
	} else {
		count->val = val;
		count->run++;
		count->ena++;
	}
	return 0;
}

/**
 * evsel__tpebs_close() - delete tpebs related data. If the last event, stop the
 * created thread and process by calling tpebs_stop().
 *
 * This function is called in evsel__close() to be symmetric with
 * evsel__tpebs_open() being called in evsel__open().
 */
void evsel__tpebs_close(struct evsel *evsel)
{
	struct tpebs_retire_lat *t;

	mutex_lock(tpebs_mtx_get());
	t = tpebs_retire_lat__find(evsel);
	if (t) {
		list_del_init(&t->nd);
		tpebs_retire_lat__delete(t);

		if (list_empty(&tpebs_results))
			tpebs_stop();
	}
	mutex_unlock(tpebs_mtx_get());
}