#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
"""
Tests related to standard netdevice statistics.
"""
import errno
import subprocess
import time
from lib.py import ksft_run, ksft_exit, ksft_pr
from lib.py import ksft_ge, ksft_eq, ksft_is, ksft_in, ksft_lt, ksft_true, ksft_raises
from lib.py import KsftSkipEx, KsftFailEx
from lib.py import ksft_disruptive
from lib.py import EthtoolFamily, NetdevFamily, RtnlFamily, NlError
from lib.py import NetDrvEnv
from lib.py import cmd, ip, defer

ethnl = EthtoolFamily()
netfam = NetdevFamily()
rtnl = RtnlFamily()


def check_pause(cfg) -> None:
    """
    Check that drivers which support Pause config also report standard
    pause stats.
    """
|
2024-04-04 19:45:26 -07:00
|
|
|
|
|
|
|
try:
|
|
|
|
ethnl.pause_get({"header": {"dev-index": cfg.ifindex}})
|
|
|
|
except NlError as e:
|
2024-08-01 17:03:09 -07:00
|
|
|
if e.error == errno.EOPNOTSUPP:
|
2025-06-20 09:11:09 -07:00
|
|
|
raise KsftSkipEx("pause not supported by the device") from e
|
2024-04-04 19:45:26 -07:00
|
|
|
raise
|
|
|
|
|
|
|
|
data = ethnl.pause_get({"header": {"dev-index": cfg.ifindex,
|
|
|
|
"flags": {'stats'}}})
|
|
|
|
ksft_true(data['stats'], "driver does not report stats")
|
|
|
|
|
|
|
|
|
|
|
|
def check_fec(cfg) -> None:
    """
    Check that drivers which support FEC config also report standard
    FEC stats.
    """
    try:
        ethnl.fec_get({"header": {"dev-index": cfg.ifindex}})
    except NlError as e:
        if e.error == errno.EOPNOTSUPP:
            raise KsftSkipEx("FEC not supported by the device") from e
        raise

    data = ethnl.fec_get({"header": {"dev-index": cfg.ifindex,
                                     "flags": {'stats'}}})
    ksft_true(data['stats'], "driver does not report stats")


def pkt_byte_sum(cfg) -> None:
    """
    Check that qstat and interface stats match in value.
    """

    def get_qstat(test):
        stats = netfam.qstats_get({}, dump=True)
        if stats:
            for qs in stats:
                if qs["ifindex"] == test.ifindex:
                    return qs
        return None

    qstat = get_qstat(cfg)
    if qstat is None:
        raise KsftSkipEx("qstats not supported by the device")

    for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']:
        ksft_in(key, qstat, "Drivers should always report basic keys")

    # Compare stats: rtnl stats and qstats must match, but the interface may
    # be up (and passing traffic), so do a series of dumps; each time, the
    # more "recent" stats must be higher than or equal to the older ones.
    def stat_cmp(rstat, qstat):
        for key in ['tx-packets', 'tx-bytes', 'rx-packets', 'rx-bytes']:
            if rstat[key] != qstat[key]:
                return rstat[key] - qstat[key]
        return 0
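
    # stat_cmp() is positive when the rtnl counters are ahead of the qstat
    # ones, negative when qstat is ahead, and 0 when all four keys match.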
    for _ in range(10):
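        # Use the 64-bit rtnl counters ("stats64"); the legacy 32-bit "stats"
        # struct wraps on busy interfaces and would break this comparison.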
        rtstat = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
        if stat_cmp(rtstat, qstat) < 0:
            raise KsftFailEx("RTNL stats are lower, fetched later")
        qstat = get_qstat(cfg)
        if stat_cmp(rtstat, qstat) > 0:
            raise KsftFailEx("Qstats are lower, fetched later")


def qstat_by_ifindex(cfg) -> None:
    """ Qstats Netlink API tests - querying by ifindex. """

    # Construct a map ifindex -> [dump, by-index, dump]
    ifindexes = {}
    stats = netfam.qstats_get({}, dump=True)
    for entry in stats:
        ifindexes[entry['ifindex']] = [entry, None, None]

    for ifindex in ifindexes:
        entry = netfam.qstats_get({"ifindex": ifindex}, dump=True)
        ksft_eq(len(entry), 1)
        ifindexes[entry[0]['ifindex']][1] = entry[0]

    stats = netfam.qstats_get({}, dump=True)
    for entry in stats:
        ifindexes[entry['ifindex']][2] = entry

    if len(ifindexes) == 0:
        raise KsftSkipEx("No ifindex supports qstats")

    # Now make sure the stats match/make sense
    for ifindex, triple in ifindexes.items():
        all_keys = triple[0].keys() | triple[1].keys() | triple[2].keys()

        for key in all_keys:
            ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key)
            ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key)

    # Sanity check the dumps
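    # recv_size=4096 keeps the receive buffer small, presumably so the queue
    # dump has to be split across multiple netlink messages.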
    queues = NetdevFamily(recv_size=4096).qstats_get({"scope": "queue"}, dump=True)
    # Reformat the output into {ifindex: {rx: [id, id, ...], tx: [id, id, ...]}}
    parsed = {}
    for entry in queues:
        ifindex = entry["ifindex"]
        if ifindex not in parsed:
            parsed[ifindex] = {"rx": [], "tx": []}
        parsed[ifindex][entry["queue-type"]].append(entry['queue-id'])
    # Now, validate
    for ifindex, queues in parsed.items():
        for qtype in ['rx', 'tx']:
            ksft_eq(len(queues[qtype]), len(set(queues[qtype])),
                    comment="repeated queue keys")
            ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1,
                    comment="missing queue keys")

    # Test invalid dumps
    # 0 is invalid
    with ksft_raises(NlError) as cm:
        netfam.qstats_get({"ifindex": 0}, dump=True)
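    # ifindex 0 is outside the valid range: -34 is -errno.ERANGE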
    ksft_eq(cm.exception.nl_msg.error, -34)
    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')

    # loopback has no stats
    with ksft_raises(NlError) as cm:
        netfam.qstats_get({"ifindex": 1}, dump=True)
    ksft_eq(cm.exception.nl_msg.error, -errno.EOPNOTSUPP)
    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')

    # Try to get stats for lowest unused ifindex but not 0
    devs = rtnl.getlink({}, dump=True)
    all_ifindexes = set(dev["ifi-index"] for dev in devs)
    lowest = 2
    while lowest in all_ifindexes:
        lowest += 1

    with ksft_raises(NlError) as cm:
        netfam.qstats_get({"ifindex": lowest}, dump=True)
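    # no device is registered at this ifindex: -19 is -errno.ENODEV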
    ksft_eq(cm.exception.nl_msg.error, -19)
    ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.ifindex')


@ksft_disruptive
def check_down(cfg) -> None:
    """ Test statistics (interface and qstat) are not impacted by ifdown """

    try:
        qstat = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
    except NlError as e:
        if e.error == errno.EOPNOTSUPP:
            raise KsftSkipEx("qstats not supported by the device") from e
        raise

    ip(f"link set dev {cfg.dev['ifname']} down")
    defer(ip, f"link set dev {cfg.dev['ifname']} up")
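    # the deferred "up" restores the link when the test exits, even if one of
    # the checks below fails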

    qstat2 = netfam.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
    for k in qstat:
        ksft_ge(qstat2[k], qstat[k], comment=f"{k} went backwards on device down")

    # exercise per-queue API to make sure that "device down" state
    # is handled correctly and doesn't crash
    netfam.qstats_get({"ifindex": cfg.ifindex, "scope": "queue"}, dump=True)


def __run_inf_loop(body):
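    # Run `body` in an endless shell loop in the background and return the
    # Popen handle; callers kill the loop themselves (typically via defer()).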
    body = body.strip()
    if body[-1] != ';':
        body += ';'

    return subprocess.Popen(f"while true; do {body} done", shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)


def __stats_increase_sanely(old, new) -> None:
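    # Counters must never go backwards, and the delta between two reads taken
    # a few seconds apart should stay far below 2^31; a larger jump most
    # likely indicates a 32-bit truncation / wrapping bug.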
    for k in old.keys():
        ksft_ge(new[k], old[k])
        ksft_lt(new[k] - old[k], 1 << 31, comment="likely wrapping error")


def procfs_hammer(cfg) -> None:
    """
    Reading stats via procfs only holds the RCU lock, which is not an
    exclusive lock; make sure drivers can handle parallel reads of stats.
    """
    one = __run_inf_loop("cat /proc/net/dev")
    defer(one.kill)
    two = __run_inf_loop("cat /proc/net/dev")
    defer(two.kill)

    time.sleep(1)
    # Make sure the processes are running
    ksft_is(one.poll(), None)
    ksft_is(two.poll(), None)

    rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    time.sleep(2)
    rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    __stats_increase_sanely(rtstat1, rtstat2)
    # defers will kill the loops


@ksft_disruptive
def procfs_downup_hammer(cfg) -> None:
    """
    Reading stats via procfs only holds the RCU lock, but drivers often try
    to sleep when reading the stats, or don't protect against races.
    """
    # Max out the queues, we'll flip between max and 1
    channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
    if channels['combined-count'] == 0:
        rx_type = 'rx'
    else:
        rx_type = 'combined'
    cur_queue_cnt = channels[f'{rx_type}-count']
    max_queue_cnt = channels[f'{rx_type}-max']

    cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}")
    defer(cmd, f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}")

    # Real test stats
    stats = __run_inf_loop("cat /proc/net/dev")
    defer(stats.kill)

    ipset = f"ip link set dev {cfg.ifname}"
    defer(ip, f"link set dev {cfg.ifname} up")
    # The "echo -n 1" lets us count iterations below
    updown = f"{ipset} down; sleep 0.05; {ipset} up; sleep 0.05; " + \
             f"ethtool -L {cfg.ifname} {rx_type} 1; " + \
             f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}; " + \
             "echo -n 1"
    updown = __run_inf_loop(updown)
    kill_updown = defer(updown.kill)

    time.sleep(1)
    # Make sure the processes are running
    ksft_is(stats.poll(), None)
    ksft_is(updown.poll(), None)

    rtstat1 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    # We're looking for crashes, give it extra time
    time.sleep(9)
    rtstat2 = rtnl.getlink({"ifi-index": cfg.ifindex})['stats64']
    __stats_increase_sanely(rtstat1, rtstat2)

    kill_updown.exec()
    stdout, _ = updown.communicate(timeout=5)
    ksft_pr("completed up/down cycles:", len(stdout.decode('utf-8')))


def main() -> None:
    """ Ksft boiler plate main """
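
    # queue_count=100 asks the test environment for a device with many queues
    # (it matters when the env creates the device itself, e.g. netdevsim), so
    # the queue-scoped qstats dump in qstat_by_ifindex() has plenty of entries.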
    with NetDrvEnv(__file__, queue_count=100) as cfg:
        ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex,
                  check_down, procfs_hammer, procfs_downup_hammer],
                 args=(cfg, ))
    ksft_exit()


if __name__ == "__main__":
    main()