linux/tools/testing/selftests/bpf/prog_tests/lwt_reroute.c

// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/*
 * Test suite of lwt BPF programs that reroute packets
 *   The tests check not only that these programs work as expected in the
 *   normal case, but also that they handle abnormal situations gracefully.
 *   This test suite currently only covers the lwt_xmit hook. lwt_in tests
 *   have not been implemented.
 *
 * WARNING
 * -------
 *  This test suite can crash the kernel, thus should be run in a VM.
 *
 * Setup:
 * ---------
 *  all tests are performed in a single netns. A lwt encap route is setup for
 *  each subtest:
 *
 *    ip route add 10.0.0.0/24 encap bpf xmit <obj> sec "<section_N>" dev link_err
 *
 *  Here <obj> is statically defined to test_lwt_reroute.bpf.o, and it contains
 *  a single test program entry. This program sets packet mark by last byte of
 *  the IPv4 daddr. For example, a packet going to 1.2.3.4 will receive a skb
 *  mark 4. A packet will only be marked once, and IP x.x.x.0 will be skipped
 *  to avoid route loop. We didn't use the generated BPF skeleton since
 *  attachment of lwt programs is not supported by libbpf yet.
 *
 *  The test program will bring up a tun device, and sets up the following
 *  routes:
 *
 *    ip rule add pref 100 from all fwmark <tun_index> lookup 100
 *    ip route add table 100 default dev tun0
 *
 *  For normal testing, a ping command is running in the test netns:
 *
 *    ping 10.0.0.<tun_index> -c 1 -W 1 -s 100
 *
 *  For abnormal testing, fq is used as the qdisc of the tun device. Then a UDP
 *  socket will try to overflow the fq queue and trigger qdisc drop error.
 *
 * Scenarios:
 * --------------------------------
 *  1. Reroute to a running tun device
 *  2. Reroute to a device whose qdisc drops packets
 *
 *  For case 1, ping packets should be received by the tun device.
 *
 *  For case 2, force UDP packets to overflow fq limit. As long as kernel
 *  is not crashed, it is considered successful.
 */
#define NETNS "ns_lwt_reroute"
#include "lwt_helpers.h"
#include "network_helpers.h"
#include <linux/net_tstamp.h>

#define BPF_OBJECT            "test_lwt_reroute.bpf.o"
#define LOCAL_SRC             "10.0.0.1"
#define TEST_CIDR             "10.0.0.0/24"
#define XMIT_HOOK             "xmit"
#define XMIT_SECTION          "lwt_xmit"
#define NSEC_PER_SEC          1000000000ULL

/* Fire a single ICMP echo request at @ip so that it gets rerouted to the
 * target device.
 */
static void ping_once(const char *ip)
{
	/* No reply will ever come back, so the exit status of ping is
	 * deliberately not treated as a test failure.
	 */
	SYS_NOFAIL("ping %s -c1 -W1 -s %d", ip, ICMP_PAYLOAD_SIZE);
}

/* Send snd_target UDP packets to overflow the fq queue and trigger qdisc drop
 * error. Every packet carries an SCM_TXTIME timestamp one second in the
 * future, which forces the delayed packets to be buffered.
 *
 * @snd_target: number of datagrams to attempt to send
 * @target_ip:  IPv4 destination address in dotted-quad text form
 *
 * Returns 0 once all sends have been attempted (individual sendmsg() errors
 * are expected and ignored), or a non-zero value when the socket, the
 * destination address or the clock could not be set up.
 */
static int overflow_fq(int snd_target, const char *target_ip)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(1234),
	};

	char data_buf[8]; /* only #pkts matter, so use a random small buffer */
	/* The union keeps the control buffer suitably aligned for a cmsghdr */
	union {
		char buf[CMSG_SPACE(sizeof(uint64_t))];
		struct cmsghdr align;
	} control;
	struct iovec iov = {
		.iov_base = data_buf,
		.iov_len = sizeof(data_buf),
	};
	int err = -1;
	int s = -1;
	struct sock_txtime txtime_on = {
		.clockid = CLOCK_MONOTONIC,
		.flags = 0,
	};
	struct msghdr msg = {
		.msg_name = &addr,
		.msg_namelen = sizeof(addr),
		.msg_control = control.buf,
		.msg_controllen = sizeof(control.buf),
		.msg_iovlen = 1,
		.msg_iov = &iov,
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	memset(data_buf, 0, sizeof(data_buf));

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (!ASSERT_GE(s, 0, "socket"))
		goto out;

	err = setsockopt(s, SOL_SOCKET, SO_TXTIME, &txtime_on, sizeof(txtime_on));
	if (!ASSERT_OK(err, "setsockopt(SO_TXTIME)"))
		goto out;

	err = inet_pton(AF_INET, target_ip, &addr.sin_addr);
	if (!ASSERT_EQ(err, 1, "inet_pton")) {
		/* inet_pton() returns 0 for an unparsable address; make sure
		 * that is not handed back to the caller as success.
		 */
		err = -1;
		goto out;
	}

	while (snd_target > 0) {
		struct timespec now;
		uint64_t txtime;

		memset(control.buf, 0, sizeof(control.buf));
		cmsg->cmsg_type = SCM_TXTIME;
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_len = CMSG_LEN(sizeof(uint64_t));

		err = clock_gettime(CLOCK_MONOTONIC, &now);
		if (!ASSERT_OK(err, "clock_gettime(CLOCK_MONOTONIC)")) {
			err = -1;
			goto out;
		}

		/* Requested transmit time: one second from now, in ns */
		txtime = ((uint64_t)now.tv_sec + 1) * NSEC_PER_SEC +
			 (uint64_t)now.tv_nsec;
		/* memcpy avoids a type-punned store into the char buffer */
		memcpy(CMSG_DATA(cmsg), &txtime, sizeof(txtime));

		/* we will intentionally send more than fq limit, so ignore
		 * the error here.
		 */
		sendmsg(s, &msg, MSG_NOSIGNAL);
		snd_target--;
	}

	/* no kernel crash so far is considered success */
	err = 0;

out:
	if (s >= 0)
		close(s);

	return err;
}

/* Open the tun device @tun_dev and install the links, the lwt encap route and
 * the fwmark policy route that the subtests rely on.
 *
 * Returns the open tun file descriptor on success; the caller owns it and
 * must close it. Returns -1 on failure, in which case the descriptor has
 * already been closed.
 */
static int setup(const char *tun_dev)
{
	int target_index = -1;
	int tap_fd = -1;

	tap_fd = open_tuntap(tun_dev, false);
	if (!ASSERT_GE(tap_fd, 0, "open_tun"))
		return -1;

	/* if_nametoindex() signals failure with 0, never with a negative
	 * value, so the smallest valid index is 1.
	 */
	target_index = if_nametoindex(tun_dev);
	if (!ASSERT_GE(target_index, 1, "if_nametoindex"))
		goto fail;

	SYS(fail, "ip link add link_err type dummy");
	SYS(fail, "ip link set lo up");
	SYS(fail, "ip addr add dev lo " LOCAL_SRC "/32");
	SYS(fail, "ip link set link_err up");
	SYS(fail, "ip link set %s up", tun_dev);

	SYS(fail, "ip route add %s dev link_err encap bpf xmit obj %s sec lwt_xmit",
	    TEST_CIDR, BPF_OBJECT);

	/* Packets carrying the tun ifindex as fwmark are looked up in table
	 * 100, whose only route points at the tun device.
	 */
	SYS(fail, "ip rule add pref 100 from all fwmark %d lookup 100",
	    target_index);
	SYS(fail, "ip route add t 100 default dev %s", tun_dev);

	return tap_fd;

fail:
	if (tap_fd >= 0)
		close(tap_fd);
	return -1;
}

/* Subtest: a ping towards 10.0.0.<tun ifindex> must be rerouted by the lwt
 * xmit program and show up on the tun device.
 */
static void test_lwt_reroute_normal_xmit(void)
{
	const char *tun_dev = "tun0";
	int tun_fd = -1;
	int ifindex = -1;
	char ip[256];
	struct timeval timeo = {
		.tv_sec = 0,
		.tv_usec = 250000,
	};

	tun_fd = setup(tun_dev);
	if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))
		return;

	/* if_nametoindex() returns 0 on failure, so require at least 1 */
	ifindex = if_nametoindex(tun_dev);
	if (!ASSERT_GE(ifindex, 1, "if_nametoindex"))
		goto out;

	/* The last byte of the destination address becomes the fwmark, which
	 * the policy rule installed by setup() matches against the ifindex.
	 */
	snprintf(ip, sizeof(ip), "10.0.0.%d", ifindex);

	/* ping packets should be received by the tun device */
	ping_once(ip);

	if (!ASSERT_EQ(wait_for_packet(tun_fd, __expect_icmp_ipv4, &timeo), 1,
		       "wait_for_packet"))
		log_err("%s xmit", __func__);

out:
	/* setup() hands ownership of the tun fd to the caller */
	close(tun_fd);
}

/*
 * Test the failure case when the skb is dropped at the qdisc. This is a
 * regression prevention at the xmit hook only.
 *
 * The tun device gets an fq qdisc limited to 5 packets, then 10 delayed UDP
 * packets are pushed through the reroute path towards it.
 */
static void test_lwt_reroute_qdisc_dropped(void)
{
	const char *tun_dev = "tun0";
	int tun_fd = -1;
	int ifindex = -1;
	char ip[256];

	tun_fd = setup(tun_dev);
	if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))
		goto fail;

	SYS(fail, "tc qdisc replace dev %s root fq limit 5 flow_limit 5", tun_dev);

	/* if_nametoindex() returns 0 on failure, so require at least 1; leave
	 * through the common exit so the tun fd is not leaked.
	 */
	ifindex = if_nametoindex(tun_dev);
	if (!ASSERT_GE(ifindex, 1, "if_nametoindex"))
		goto fail;

	snprintf(ip, sizeof(ip), "10.0.0.%d", ifindex);
	ASSERT_EQ(overflow_fq(10, ip), 0, "overflow_fq");

fail:
	if (tun_fd >= 0)
		close(tun_fd);
}

/* Thread entry point that runs the lwt_reroute subtests one after another.
 *
 * @arg is not used. Always returns NULL.
 */
static void *test_lwt_reroute_run(void *arg)
{
	/* NOTE(review): presumably removes a stale test namespace left behind
	 * by an earlier aborted run before the subtests start; netns_delete()
	 * and RUN_TEST() are defined outside this file - confirm there.
	 */
	netns_delete();
	RUN_TEST(lwt_reroute_normal_xmit);
	RUN_TEST(lwt_reroute_qdisc_dropped);
	return NULL;
}

/* Test entry point: runs all lwt_reroute subtests on a dedicated thread and
 * waits for that thread to finish.
 */
void test_lwt_reroute(void)
{
	pthread_t worker;
	int rc;

	/* The subtests change namespaces (open_netns() does an
	 * unshare(CLONE_NEWNS)); confining them to their own thread keeps
	 * those changes away from the environment of other tests.
	 */
	rc = pthread_create(&worker, NULL, &test_lwt_reroute_run, NULL);
	if (!ASSERT_OK(rc, "pthread_create"))
		return;

	ASSERT_OK(pthread_join(worker, NULL), "pthread_join");
}
selftests/bpf: Add lwt_xmit tests for BPF_REROUTE There is no lwt test case for BPF_REROUTE yet. Add test cases for both normal and abnormal situations. The abnormal situation is set up with an fq qdisc on the reroute target device. Without proper fixes, overflow this qdisc queue limit (to trigger a drop) would panic the kernel. Signed-off-by: Yan Zhai <yan@cloudflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/62c8ddc1e924269dcf80d2e8af1a1e632cee0b3a.1692326837.git.yan@cloudflare.com 2023-08-17 19:58:18 -07:00			`// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause`

			`/*`
			`* Test suite of lwt BPF programs that reroutes packets`
			`* The file tests focus not only if these programs work as expected normally,`
			`* but also if they can handle abnormal situations gracefully. This test`
			`* suite currently only covers lwt_xmit hook. lwt_in tests have not been`
			`* implemented.`
			`*`
			`* WARNING`
			`* -------`
			`* This test suite can crash the kernel, thus should be run in a VM.`
			`*`
			`* Setup:`
			`* ---------`
			`* all tests are performed in a single netns. A lwt encap route is setup for`
			`* each subtest:`
			`*`
			`* ip route add 10.0.0.0/24 encap bpf xmit <obj> sec "<section_N>" dev link_err`
			`*`
			`* Here <obj> is statically defined to test_lwt_reroute.bpf.o, and it contains`
			`* a single test program entry. This program sets packet mark by last byte of`
			`* the IPv4 daddr. For example, a packet going to 1.2.3.4 will receive a skb`
			`* mark 4. A packet will only be marked once, and IP x.x.x.0 will be skipped`
			`* to avoid route loop. We didn't use generated BPF skeleton since the`
			`* attachment for lwt programs are not supported by libbpf yet.`
			`*`
			`* The test program will bring up a tun device, and sets up the following`
			`* routes:`
			`*`
			`* ip rule add pref 100 from all fwmark <tun_index> lookup 100`
			`* ip route add table 100 default dev tun0`
			`*`
			`* For normal testing, a ping command is running in the test netns:`
			`*`
			`* ping 10.0.0.<tun_index> -c 1 -w 1 -s 100`
			`*`
			`* For abnormal testing, fq is used as the qdisc of the tun device. Then a UDP`
			`* socket will try to overflow the fq queue and trigger qdisc drop error.`
			`*`
			`* Scenarios:`
			`* --------------------------------`
			`* 1. Reroute to a running tun device`
			`* 2. Reroute to a device where qdisc drop`
			`*`
			`* For case 1, ping packets should be received by the tun device.`
			`*`
			`* For case 2, force UDP packets to overflow fq limit. As long as kernel`
			`* is not crashed, it is considered successful.`
			`*/`
selftests/bpf: Fix flaky selftest lwt_redirect/lwt_reroute Recently, when running './test_progs -j', I occasionally hit the following errors: test_lwt_redirect:PASS:pthread_create 0 nsec test_lwt_redirect_run:FAIL:netns_create unexpected error: 256 (errno 0) #142/2 lwt_redirect/lwt_redirect_normal_nomac:FAIL #142 lwt_redirect:FAIL test_lwt_reroute:PASS:pthread_create 0 nsec test_lwt_reroute_run:FAIL:netns_create unexpected error: 256 (errno 0) test_lwt_reroute:PASS:pthread_join 0 nsec #143/2 lwt_reroute/lwt_reroute_qdisc_dropped:FAIL #143 lwt_reroute:FAIL The netns_create() definition looks like below: #define NETNS "ns_lwt" static inline int netns_create(void) { return system("ip netns add " NETNS); } One possibility is that both lwt_redirect and lwt_reroute create netns with the same name "ns_lwt" which may cause conflict. I tried the following example: $ sudo ip netns add abc $ echo $? 0 $ sudo ip netns add abc Cannot create namespace file "/var/run/netns/abc": File exists $ echo $? 1 $ The return code for above netns_create() is 256. The internet search suggests that the return value for 'ip netns add ns_lwt' is 1, which matches the above 'sudo ip netns add abc' example. This patch tried to use different netns names for two tests to avoid 'ip netns add <name>' failure. I ran './test_progs -j' 10 times and all succeeded with lwt_redirect/lwt_reroute tests. Signed-off-by: Yonghong Song <yonghong.song@linux.dev> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Tested-by: Eduard Zingerman <eddyz87@gmail.com> Link: https://lore.kernel.org/bpf/20240205052914.1742687-1-yonghong.song@linux.dev 2024-02-04 21:29:14 -08:00			`#define NETNS "ns_lwt_reroute"`
selftests/bpf: Add lwt_xmit tests for BPF_REROUTE There is no lwt test case for BPF_REROUTE yet. Add test cases for both normal and abnormal situations. The abnormal situation is set up with an fq qdisc on the reroute target device. Without proper fixes, overflow this qdisc queue limit (to trigger a drop) would panic the kernel. Signed-off-by: Yan Zhai <yan@cloudflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/62c8ddc1e924269dcf80d2e8af1a1e632cee0b3a.1692326837.git.yan@cloudflare.com 2023-08-17 19:58:18 -07:00			`#include "lwt_helpers.h"`
			`#include "network_helpers.h"`
			`#include <linux/net_tstamp.h>`

			`#define BPF_OBJECT "test_lwt_reroute.bpf.o"`
			`#define LOCAL_SRC "10.0.0.1"`
			`#define TEST_CIDR "10.0.0.0/24"`
			`#define XMIT_HOOK "xmit"`
			`#define XMIT_SECTION "lwt_xmit"`
			`#define NSEC_PER_SEC 1000000000ULL`

			`/* send a ping to be rerouted to the target device */`
			`static void ping_once(const char *ip)`
			`{`
			`/* We won't get a reply. Don't fail here */`
selftests/bpf: Remove "&>" usage in the selftests In s390, CI reported that the sock_iter_batch selftest hits this error very often: 2024-01-26T16:56:49.3091804Z Bind /proc/self/ns/net -> /run/netns/sock_iter_batch_netns failed: No such file or directory 2024-01-26T16:56:49.3149524Z Cannot remove namespace file "/run/netns/sock_iter_batch_netns": No such file or directory 2024-01-26T16:56:49.3772213Z test_sock_iter_batch:FAIL:ip netns add sock_iter_batch_netns unexpected error: 256 (errno 0) It happens very often in s390 but Manu also noticed it happens very sparsely in other arch also. It turns out the default dash shell does not recognize "&>" as a redirection operator, so the command went to the background. In the sock_iter_batch selftest, the "ip netns delete" went into background and then race with the following "ip netns add" command. This patch replaces the "&> /dev/null" usage with ">/dev/null 2>&1" and does this redirection in the SYS_NOFAIL macro instead of doing it individually by its caller. The SYS_NOFAIL callers do not care about failure, so it is no harm to do this redirection even if some of the existing callers do not redirect to /dev/null now. It touches different test files, so I skipped the Fixes tags in this patch. Some of the changed tests do not use "&>" but they use the SYS_NOFAIL, so these tests are also changed to avoid doing its own redirection because SYS_NOFAIL does it internally now. Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> Link: https://lore.kernel.org/r/20240127025017.950825-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org> 2024-01-26 18:50:17 -08:00			`SYS_NOFAIL("ping %s -c1 -W1 -s %d",`
selftests/bpf: Add lwt_xmit tests for BPF_REROUTE There is no lwt test case for BPF_REROUTE yet. Add test cases for both normal and abnormal situations. The abnormal situation is set up with an fq qdisc on the reroute target device. Without proper fixes, overflow this qdisc queue limit (to trigger a drop) would panic the kernel. Signed-off-by: Yan Zhai <yan@cloudflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/62c8ddc1e924269dcf80d2e8af1a1e632cee0b3a.1692326837.git.yan@cloudflare.com 2023-08-17 19:58:18 -07:00			`ip, ICMP_PAYLOAD_SIZE);`
			`}`

			`/* Send snd_target UDP packets to overflow the fq queue and trigger qdisc drop`
			`* error. This is done via TX tstamp to force buffering delayed packets.`
			`*/`
			`static int overflow_fq(int snd_target, const char *target_ip)`
			`{`
			`struct sockaddr_in addr = {`
			`.sin_family = AF_INET,`
			`.sin_port = htons(1234),`
			`};`

			`char data_buf[8]; /* only #pkts matter, so use a random small buffer */`
			`char control_buf[CMSG_SPACE(sizeof(uint64_t))];`
			`struct iovec iov = {`
			`.iov_base = data_buf,`
			`.iov_len = sizeof(data_buf),`
			`};`
			`int err = -1;`
			`int s = -1;`
			`struct sock_txtime txtime_on = {`
			`.clockid = CLOCK_MONOTONIC,`
			`.flags = 0,`
			`};`
			`struct msghdr msg = {`
			`.msg_name = &addr,`
			`.msg_namelen = sizeof(addr),`
			`.msg_control = control_buf,`
			`.msg_controllen = sizeof(control_buf),`
			`.msg_iovlen = 1,`
			`.msg_iov = &iov,`
			`};`
			`struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);`

			`memset(data_buf, 0, sizeof(data_buf));`

			`s = socket(AF_INET, SOCK_DGRAM, 0);`
			`if (!ASSERT_GE(s, 0, "socket"))`
			`goto out;`

			`err = setsockopt(s, SOL_SOCKET, SO_TXTIME, &txtime_on, sizeof(txtime_on));`
			`if (!ASSERT_OK(err, "setsockopt(SO_TXTIME)"))`
			`goto out;`

			`err = inet_pton(AF_INET, target_ip, &addr.sin_addr);`
			`if (!ASSERT_EQ(err, 1, "inet_pton"))`
			`goto out;`

			`while (snd_target > 0) {`
			`struct timespec now;`

			`memset(control_buf, 0, sizeof(control_buf));`
			`cmsg->cmsg_type = SCM_TXTIME;`
			`cmsg->cmsg_level = SOL_SOCKET;`
			`cmsg->cmsg_len = CMSG_LEN(sizeof(uint64_t));`

			`err = clock_gettime(CLOCK_MONOTONIC, &now);`
			`if (!ASSERT_OK(err, "clock_gettime(CLOCK_MONOTONIC)")) {`
			`err = -1;`
			`goto out;`
			`}`

			`*(uint64_t *)CMSG_DATA(cmsg) = (now.tv_nsec + 1) * NSEC_PER_SEC +`
			`now.tv_nsec;`

			`/* we will intentionally send more than fq limit, so ignore`
			`* the error here.`
			`*/`
			`sendmsg(s, &msg, MSG_NOSIGNAL);`
			`snd_target--;`
			`}`

			`/* no kernel crash so far is considered success */`
			`err = 0;`

			`out:`
			`if (s >= 0)`
			`close(s);`

			`return err;`
			`}`

			`static int setup(const char *tun_dev)`
			`{`
			`int target_index = -1;`
			`int tap_fd = -1;`

			`tap_fd = open_tuntap(tun_dev, false);`
			`if (!ASSERT_GE(tap_fd, 0, "open_tun"))`
			`return -1;`

			`target_index = if_nametoindex(tun_dev);`
			`if (!ASSERT_GE(target_index, 0, "if_nametoindex"))`
			`return -1;`

			`SYS(fail, "ip link add link_err type dummy");`
			`SYS(fail, "ip link set lo up");`
			`SYS(fail, "ip addr add dev lo " LOCAL_SRC "/32");`
			`SYS(fail, "ip link set link_err up");`
			`SYS(fail, "ip link set %s up", tun_dev);`

			`SYS(fail, "ip route add %s dev link_err encap bpf xmit obj %s sec lwt_xmit",`
			`TEST_CIDR, BPF_OBJECT);`

			`SYS(fail, "ip rule add pref 100 from all fwmark %d lookup 100",`
			`target_index);`
			`SYS(fail, "ip route add t 100 default dev %s", tun_dev);`

			`return tap_fd;`

			`fail:`
			`if (tap_fd >= 0)`
			`close(tap_fd);`
			`return -1;`
			`}`

			`static void test_lwt_reroute_normal_xmit(void)`
			`{`
			`const char *tun_dev = "tun0";`
			`int tun_fd = -1;`
			`int ifindex = -1;`
			`char ip[256];`
			`struct timeval timeo = {`
			`.tv_sec = 0,`
			`.tv_usec = 250000,`
			`};`

			`tun_fd = setup(tun_dev);`
			`if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))`
			`return;`

			`ifindex = if_nametoindex(tun_dev);`
			`if (!ASSERT_GE(ifindex, 0, "if_nametoindex"))`
			`return;`

			`snprintf(ip, 256, "10.0.0.%d", ifindex);`

			`/* ping packets should be received by the tun device */`
			`ping_once(ip);`

			`if (!ASSERT_EQ(wait_for_packet(tun_fd, __expect_icmp_ipv4, &timeo), 1,`
			`"wait_for_packet"))`
			`log_err("%s xmit", __func__);`
			`}`

			`/*`
			`* Test the failure case when the skb is dropped at the qdisc. This is a`
			`* regression prevention at the xmit hook only.`
			`*/`
			`static void test_lwt_reroute_qdisc_dropped(void)`
			`{`
			`const char *tun_dev = "tun0";`
			`int tun_fd = -1;`
			`int ifindex = -1;`
			`char ip[256];`

			`tun_fd = setup(tun_dev);`
			`if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))`
			`goto fail;`

			`SYS(fail, "tc qdisc replace dev %s root fq limit 5 flow_limit 5", tun_dev);`

			`ifindex = if_nametoindex(tun_dev);`
			`if (!ASSERT_GE(ifindex, 0, "if_nametoindex"))`
			`return;`

			`snprintf(ip, 256, "10.0.0.%d", ifindex);`
			`ASSERT_EQ(overflow_fq(10, ip), 0, "overflow_fq");`

			`fail:`
			`if (tun_fd >= 0)`
			`close(tun_fd);`
			`}`

			`static void *test_lwt_reroute_run(void *arg)`
			`{`
			`netns_delete();`
			`RUN_TEST(lwt_reroute_normal_xmit);`
			`RUN_TEST(lwt_reroute_qdisc_dropped);`
			`return NULL;`
			`}`

			`void test_lwt_reroute(void)`
			`{`
			`pthread_t test_thread;`
			`int err;`

			`/* Run the tests in their own thread to isolate the namespace changes`
			`* so they do not affect the environment of other tests.`
			`* (specifically needed because of unshare(CLONE_NEWNS) in open_netns())`
			`*/`
			`err = pthread_create(&test_thread, NULL, &test_lwt_reroute_run, NULL);`
			`if (ASSERT_OK(err, "pthread_create"))`
			`ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");`
			`}`