linux/tools/perf/tests/shell/stat+std_output.sh

#!/bin/bash
# perf stat STD output linter
# SPDX-License-Identifier: GPL-2.0
# Tests various perf stat STD output commands for
# default event and metricgroup
set -e
# shellcheck source=lib/stat_output.sh
. "$(dirname $0)"/lib/stat_output.sh
stat_output=$(mktemp /tmp/__perf_test.stat_output.std.XXXXX)
event_name=(cpu-clock task-clock context-switches cpu-migrations page-faults stalled-cycles-frontend stalled-cycles-backend cycles instructions branches branch-misses)
event_metric=("CPUs utilized" "CPUs utilized" "/sec" "/sec" "/sec" "frontend cycles idle" "backend cycles idle" "GHz" "insn per cycle" "/sec" "of all branches")
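# event_name and event_metric are parallel arrays: event_name[i] is expected
# to show up in the output next to the shorthand metric event_metric[i], e.g.
#   1,037.71 msec task-clock  #  1.000 CPUs utilized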
skip_metric=("stalled cycles per insn" "tma_" "retiring" "frontend_bound" "bad_speculation" "backend_bound")
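# Metrics that can be printed on a line of their own, with nothing to the left
# of the '#' sign: "stalled cycles per insn" (a second metric of the
# instructions event on AMD), the "tma_"-prefixed TopDown metrics on x86, and
# the TopDownL1 metrics (retiring, frontend_bound, bad_speculation,
# backend_bound) on arm64. commachecker skips these lines instead of matching
# them against event_name.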
cleanup() {
	rm -f "${stat_output}"
	trap - EXIT TERM INT
}

trap_cleanup() {
	cleanup
	exit 1
}

trap trap_cleanup EXIT TERM INT
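# commachecker lints the "perf stat" output captured in $stat_output: every
# counter line must carry a known default event name and, when a '#' comment
# is present, the metric expected for that event. $1 is the aggregation option
# under test; $prefix is the first space-separated field of the actual counter
# data, so any leading interval timestamp, task name or aggregation id gets
# stripped by cut below.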
function commachecker()
{
	local prefix=1

	case "$1"
	in "--interval")		prefix=2
	;; "--per-thread")		prefix=2
	;; "--system-wide-no-aggr")	prefix=2
	;; "--per-core")		prefix=3
	;; "--per-socket")		prefix=3
	;; "--per-node")		prefix=3
	;; "--per-die")			prefix=3
	;; "--per-cache")		prefix=3
;; "--per-cluster") prefix=3
esac
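	# Illustrative line shapes (taken from the commit logs for this test;
	# exact spacing varies):
	#   no args:       "1,037.71 msec task-clock # 1.000 CPUs utilized"  prefix=1
	#   --per-thread:  "perf-21276 # 70.2 % tma_backend_bound"           prefix=2
	#   --per-cluster: "S56-D0-CLS158 4 1,321,521,570 LLC-load"          prefix=3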
	while read line
	do
		# Ignore initial "started on" comment.
		x=${line:0:1}
		[ "$x" = "#" ] && continue
		# Ignore initial blank line.
		[ "$line" = "" ] && continue
		# Ignore "Performance counter stats"
		x=${line:0:25}
		[ "$x" = "Performance counter stats" ] && continue
		# Ignore "seconds time elapsed" and break
		[[ "$line" == *"time elapsed"* ]] && break

		main_body=$(echo $line | cut -d' ' -f$prefix-)
		x=${main_body%#*}
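		# x is everything left of the '#' sign; an empty x means a
		# metric-only continuation line (see the skip_metric note above).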
[ "$x" = "" ] && continue
# Skip metrics without event name
y=${main_body#*#}
for i in "${!skip_metric[@]}"; do
[[ "$y" == *"${skip_metric[$i]}"* ]] && break
done
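		# After the loop, $i still indexes the entry that matched if
		# "break" fired; re-testing it distinguishes a match (skip the
		# line) from an exhausted loop (fall through and check it).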
[[ "$y" == *"${skip_metric[$i]}"* ]] && continue
# Check default event
for i in "${!event_name[@]}"; do
[[ "$x" == *"${event_name[$i]}"* ]] && break
done
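		# Same index trick: if no event_name entry matched, the line
		# names an unknown event and the test fails.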
[[ ! "$x" == *"${event_name[$i]}"* ]] && {
echo "Unknown event name in $line" 1>&2
exit 1;
}
# Check event metric if it exists
[[ ! "$main_body" == *"#"* ]] && continue
[[ ! "$main_body" == *"${event_metric[$i]}"* ]] && {
echo "wrong event metric. expected ${event_metric[$i]} in $line" 1>&2
exit 1;
}
done < "${stat_output}"
return 0
}
perf_cmd="-o ${stat_output}"
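# "-o" makes "perf stat" write its output into the temp file that
# commachecker then reads.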
skip_test=$(check_for_topology)
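# Judging by its use here, check_for_topology (from lib/stat_output.sh) is
# expected to output 1 when the socket id exposed via sysfs topology is
# invalid; the per-* aggregation checks are skipped in that case.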
check_no_args "STD" "$perf_cmd"
check_system_wide "STD" "$perf_cmd"
check_interval "STD" "$perf_cmd"
check_per_thread "STD" "$perf_cmd"
check_per_node "STD" "$perf_cmd"
if [ $skip_test -ne 1 ]
then
	check_system_wide_no_aggr "STD" "$perf_cmd"
	check_per_core "STD" "$perf_cmd"
	check_per_cache_instance "STD" "$perf_cmd"
	check_per_cluster "STD" "$perf_cmd"
	check_per_die "STD" "$perf_cmd"
	check_per_socket "STD" "$perf_cmd"
else
echo "[Skip] Skipping tests for system_wide_no_aggr, per_core, per_die and per_socket since socket id exposed via topology is invalid"
fi
cleanup
exit 0