linux/tools/testing/selftests/bpf/progs/test_sock_fields.c

311 lines
7.3 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/bpf.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
/* Slots of linum_map.  Each BPF program in this file reports through its
 * own slot; the last enumerator is the slot count.
 */
enum bpf_linum_array_idx {
	EGRESS_LINUM_IDX = 0,		/* egress_read_sock_fields() */
	INGRESS_LINUM_IDX,		/* ingress_read_sock_fields() */
	READ_SK_DST_PORT_LINUM_IDX,	/* read_sk_dst_port() */
	__NR_BPF_LINUM_ARRAY_IDX,	/* number of slots (linum_map max_entries) */
};
/* Array map with one __u32 value per bpf_linum_array_idx slot.
 * RET_LOG() stores the source line it was invoked from into the slot
 * selected by the calling program's linum_idx.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, __NR_BPF_LINUM_ARRAY_IDX);
__type(key, __u32);
__type(value, __u32);
} linum_map SEC(".maps");
/* Value type of the sk_pkt_out_cnt and sk_pkt_out_cnt10 maps: a counter
 * stored next to a BPF spin lock.
 */
struct bpf_spinlock_cnt {
struct bpf_spin_lock lock; /* only taken for the sk_pkt_out_cnt10 update */
__u32 cnt; /* incremented by egress_read_sock_fields() */
};
/* Socket storage map holding one struct bpf_spinlock_cnt per socket.
 * egress_read_sock_fields() adds 1 to cnt on each run that reaches the
 * counter update, deliberately without taking the lock.
 */
struct {
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct bpf_spinlock_cnt);
} sk_pkt_out_cnt SEC(".maps");
/* Socket storage map holding one struct bpf_spinlock_cnt per socket.
 * egress_read_sock_fields() adds 10 to cnt on each run that reaches the
 * counter update, while holding the value's spin lock.
 */
struct {
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct bpf_spinlock_cnt);
} sk_pkt_out_cnt10 SEC(".maps");
selftests/bpf: Remove the bpf_tcp_helpers.h usages from other non tcp-cc tests The patch removes the remaining bpf_tcp_helpers.h usages in the non tcp-cc networking tests. It either replaces it with bpf_tracing_net.h or just removes it because the test is not actually using any kernel sockets. For the latter, the missing macro (mainly SOL_TCP) is defined locally. An exception is the test_sock_fields which is testing the "struct bpf_sock" type instead of the kernel sock type. Whenever "vmlinux.h" is used instead, it hits a verifier error on doing arithmetic on the sock_common pointer: ; return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1); @ test_sock_fields.c:54 21: (61) r2 = *(u32 *)(r1 +28) ; R1_w=sock_common() R2_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 22: (56) if w2 != 0x0 goto pc-6 ; R2_w=0 23: (b7) r3 = 28 ; R3_w=28 24: (bf) r2 = r1 ; R1_w=sock_common() R2_w=sock_common() 25: (0f) r2 += r3 R2 pointer arithmetic on sock_common prohibited Hence, instead of including bpf_tracing_net.h, the test_sock_fields test defines a tcp_sock with one lsndtime field in it. Another highlight is, in sockopt_qos_to_cc.c, the tcp_cc_eq() is replaced by bpf_strncmp(). tcp_cc_eq() was a workaround in bpf_tcp_helpers.h before bpf_strncmp had been added. The SOL_IPV6 addition to bpf_tracing_net.h is needed by the test_tcpbpf_kern test. Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> Link: https://lore.kernel.org/r/20240509175026.3423614-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2024-05-09 10:50:25 -07:00
/* Local, minimal definition of tcp_sock used instead of a full kernel
 * header: only lsndtime is declared, which is the only member this file
 * reads (through the pointer returned by bpf_skc_to_tcp_sock()).
 * NOTE(review): preserve_access_index presumably makes the member access
 * relocatable against the running kernel's real layout -- confirm against
 * the clang/libbpf documentation.
 */
struct tcp_sock {
__u32 lsndtime;
} __attribute__((preserve_access_index));
/* Global results and inputs of the programs below.
 *
 * listen_sk/listen_tp are filled by ingress_read_sock_fields();
 * srv_sk/srv_tp and cli_sk/cli_tp are filled by egress_read_sock_fields().
 * srv_sa6 is only read in this file (its sin6_port selects the sockets of
 * interest); presumably it is set by the userspace side of the test before
 * traffic is generated -- TODO confirm.
 */
struct bpf_tcp_sock listen_tp = {};
struct sockaddr_in6 srv_sa6 = {};
struct bpf_tcp_sock cli_tp = {};
struct bpf_tcp_sock srv_tp = {};
struct bpf_sock listen_sk = {};
struct bpf_sock srv_sk = {};
struct bpf_sock cli_sk = {};
/* Written by egress_read_sock_fields() for the server socket only */
__u64 parent_cg_id = 0; /* result of bpf_sk_ancestor_cgroup_id(ktp, 2) */
__u64 child_cg_id = 0; /* result of bpf_sk_cgroup_id(ktp) */
__u64 lsndtime = 0; /* copy of the kernel tcp_sock's lsndtime */
/* Return true when the four 32-bit words at a6 form the IPv6 loopback
 * address: the first three words are zero and the last one equals
 * bpf_htonl(1).
 */
static bool is_loopback6(__u32 *a6)
{
	if (a6[0] || a6[1] || a6[2])
		return false;

	return a6[3] == bpf_htonl(1);
}
/* Copy the bpf_sock members listed below from src to dst.
 *
 * Every member is assigned on its own, and the IPv6 addresses are copied
 * one 32-bit word at a time, instead of copying the structure as a whole.
 */
static void skcpy(struct bpf_sock *dst,
		  const struct bpf_sock *src)
{
	/* Generic socket attributes */
	dst->bound_dev_if = src->bound_dev_if;
	dst->family = src->family;
	dst->type = src->type;
	dst->protocol = src->protocol;
	dst->mark = src->mark;
	dst->priority = src->priority;

	/* Source address and port */
	dst->src_ip4 = src->src_ip4;
	dst->src_ip6[0] = src->src_ip6[0];
	dst->src_ip6[1] = src->src_ip6[1];
	dst->src_ip6[2] = src->src_ip6[2];
	dst->src_ip6[3] = src->src_ip6[3];
	dst->src_port = src->src_port;

	/* Destination address and port */
	dst->dst_ip4 = src->dst_ip4;
	dst->dst_ip6[0] = src->dst_ip6[0];
	dst->dst_ip6[1] = src->dst_ip6[1];
	dst->dst_ip6[2] = src->dst_ip6[2];
	dst->dst_ip6[3] = src->dst_ip6[3];
	dst->dst_port = src->dst_port;

	/* Connection state */
	dst->state = src->state;
}
/* Copy the bpf_tcp_sock members listed below from src to dst, one
 * assignment per member.
 */
static void tpcpy(struct bpf_tcp_sock *dst,
		  const struct bpf_tcp_sock *src)
{
	/* Window, RTT and sequence members */
	dst->snd_cwnd = src->snd_cwnd;
	dst->srtt_us = src->srtt_us;
	dst->rtt_min = src->rtt_min;
	dst->snd_ssthresh = src->snd_ssthresh;
	dst->rcv_nxt = src->rcv_nxt;
	dst->snd_nxt = src->snd_nxt;
	dst->snd_una = src->snd_una;
	dst->mss_cache = src->mss_cache;
	dst->ecn_flags = src->ecn_flags;

	/* Rate members */
	dst->rate_delivered = src->rate_delivered;
	dst->rate_interval_us = src->rate_interval_us;

	/* Packet, segment and byte counters */
	dst->packets_out = src->packets_out;
	dst->retrans_out = src->retrans_out;
	dst->total_retrans = src->total_retrans;
	dst->segs_in = src->segs_in;
	dst->data_segs_in = src->data_segs_in;
	dst->segs_out = src->segs_out;
	dst->data_segs_out = src->data_segs_out;
	dst->lost_out = src->lost_out;
	dst->sacked_out = src->sacked_out;
	dst->bytes_received = src->bytes_received;
	dst->bytes_acked = src->bytes_acked;
}
/* Always return CG_OK so that no pkt will be filtered out */
#define CG_OK 1

/* Store the source line of the invocation in linum_map[linum_idx] and
 * return CG_OK from the calling program.
 *
 * The caller must have two __u32 locals named linum and linum_idx, and must
 * set linum_idx before using the macro.  Because the macro returns from the
 * enclosing function it can only be used as a statement; it is wrapped in
 * do { } while (0) so that "if (cond) RET_LOG();" expands safely.
 */
#define RET_LOG() do {							\
	linum = __LINE__;						\
	bpf_map_update_elem(&linum_map, &linum_idx, &linum, BPF_ANY);	\
	return CG_OK;							\
} while (0)
/* cgroup_skb/egress program.
 *
 * For a TCP socket with an IPv6 loopback source address whose source or
 * destination port matches srv_sa6.sin6_port, copy its bpf_sock and
 * bpf_tcp_sock members into srv_sk/srv_tp (server) or cli_sk/cli_tp (client)
 * and bump the socket's counters in sk_pkt_out_cnt and sk_pkt_out_cnt10.
 * For the server socket, lsndtime, child_cg_id and parent_cg_id are recorded
 * as well.
 *
 * Always returns CG_OK.  When a lookup or helper unexpectedly fails, RET_LOG()
 * stores the line number in linum_map[EGRESS_LINUM_IDX] before returning.
 */
SEC("cgroup_skb/egress")
int egress_read_sock_fields(struct __sk_buff *skb)
{
/* Initial value used when the client socket's storage is created below */
struct bpf_spinlock_cnt cli_cnt_init = { .lock = {}, .cnt = 0xeB9F };
struct bpf_spinlock_cnt *pkt_out_cnt, *pkt_out_cnt10;
struct bpf_tcp_sock *tp, *tp_ret;
struct bpf_sock *sk, *sk_ret;
__u32 linum, linum_idx;
struct tcp_sock *ktp;
/* Slot of linum_map that RET_LOG() writes to */
linum_idx = EGRESS_LINUM_IDX;
sk = skb->sk;
if (!sk)
RET_LOG();
/* Skip sockets that are not AF_INET6 with a loopback source address,
 * and the listening socket, which is covered by the
 * cgroup_skb/ingress test program.
 */
if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
sk->state == BPF_TCP_LISTEN)
return CG_OK;
/* Select the destination globals by matching the server port: src_port
 * is compared with the bpf_ntohs()-converted sin6_port, dst_port with
 * sin6_port as stored.
 */
if (sk->src_port == bpf_ntohs(srv_sa6.sin6_port)) {
/* Server socket */
sk_ret = &srv_sk;
tp_ret = &srv_tp;
} else if (sk->dst_port == srv_sa6.sin6_port) {
/* Client socket */
sk_ret = &cli_sk;
tp_ret = &cli_tp;
} else {
/* Not the testing egress traffic */
return CG_OK;
}
/* It must be a fullsock for cgroup_skb/egress prog */
sk = bpf_sk_fullsock(sk);
if (!sk)
RET_LOG();
/* Not the testing egress traffic */
if (sk->protocol != IPPROTO_TCP)
return CG_OK;
tp = bpf_tcp_sock(sk);
if (!tp)
RET_LOG();
/* Snapshot the socket and TCP members into the selected globals */
skcpy(sk_ret, sk);
tpcpy(tp_ret, tp);
if (sk_ret == &srv_sk) {
/* Server socket: the helpers below are called with the
 * struct tcp_sock pointer rather than the bpf_sock pointer.
 */
ktp = bpf_skc_to_tcp_sock(sk);
if (!ktp)
RET_LOG();
lsndtime = ktp->lsndtime;
child_cg_id = bpf_sk_cgroup_id(ktp);
if (!child_cg_id)
RET_LOG();
parent_cg_id = bpf_sk_ancestor_cgroup_id(ktp, 2);
if (!parent_cg_id)
RET_LOG();
/* The userspace has created it for srv sk */
pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, ktp, 0, 0);
pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, ktp,
0, 0);
} else {
/* Client socket: create the storage on first use, starting from
 * cli_cnt_init.
 */
pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk,
&cli_cnt_init,
BPF_SK_STORAGE_GET_F_CREATE);
pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10,
sk, &cli_cnt_init,
BPF_SK_STORAGE_GET_F_CREATE);
}
if (!pkt_out_cnt || !pkt_out_cnt10)
RET_LOG();
/* Even though both cnt and cnt10 have a lock defined in their BTF,
 * intentionally one cnt takes the lock while the other does not,
 * as a test for the spinlock support in BPF_MAP_TYPE_SK_STORAGE.
 */
pkt_out_cnt->cnt += 1;
bpf_spin_lock(&pkt_out_cnt10->lock);
pkt_out_cnt10->cnt += 10;
bpf_spin_unlock(&pkt_out_cnt10->lock);
return CG_OK;
}
/* cgroup_skb/ingress program.
 *
 * When the packet's socket is the listening socket of the test server
 * (AF_INET6, loopback source address, source port equal to the
 * bpf_ntohs()-converted srv_sa6.sin6_port, state BPF_TCP_LISTEN), copy its
 * bpf_sock and bpf_tcp_sock members into listen_sk and listen_tp.
 *
 * Always returns CG_OK.  When a lookup or helper unexpectedly fails, RET_LOG()
 * stores the line number in linum_map[INGRESS_LINUM_IDX] before returning.
 */
SEC("cgroup_skb/ingress")
int ingress_read_sock_fields(struct __sk_buff *skb)
{
struct bpf_tcp_sock *tp;
__u32 linum, linum_idx;
struct bpf_sock *sk;
/* Slot of linum_map that RET_LOG() writes to */
linum_idx = INGRESS_LINUM_IDX;
sk = skb->sk;
if (!sk)
RET_LOG();
/* Not the testing ingress traffic to the server */
if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
sk->src_port != bpf_ntohs(srv_sa6.sin6_port))
return CG_OK;
/* Only interested in the listening socket */
if (sk->state != BPF_TCP_LISTEN)
return CG_OK;
/* It must be a fullsock for cgroup_skb/ingress prog */
sk = bpf_sk_fullsock(sk);
if (!sk)
RET_LOG();
tp = bpf_tcp_sock(sk);
if (!tp)
RET_LOG();
/* Snapshot the listening socket's members into the globals */
skcpy(&listen_sk, sk);
tpcpy(&listen_tp, tp);
return CG_OK;
}
selftests/bpf: Fix test for 4-byte load from dst_port on big-endian The check for 4-byte load from dst_port offset into bpf_sock is failing on big-endian architecture - s390. The bpf access converter rewrites the 4-byte load to a 2-byte load from sock_common at skc_dport offset, as shown below. * s390 / llvm-objdump -S --no-show-raw-insn 00000000000002a0 <sk_dst_port__load_word>: 84: r1 = *(u32 *)(r1 + 48) 85: w0 = 1 86: if w1 == 51966 goto +1 <LBB5_2> 87: w0 = 0 00000000000002c0 <LBB5_2>: 88: exit * s390 / bpftool prog dump xlated _Bool sk_dst_port__load_word(struct bpf_sock * sk): 35: (69) r1 = *(u16 *)(r1 +12) 36: (bc) w1 = w1 37: (b4) w0 = 1 38: (16) if w1 == 0xcafe goto pc+1 39: (b4) w0 = 0 40: (95) exit * x86_64 / llvm-objdump -S --no-show-raw-insn 00000000000002a0 <sk_dst_port__load_word>: 84: r1 = *(u32 *)(r1 + 48) 85: w0 = 1 86: if w1 == 65226 goto +1 <LBB5_2> 87: w0 = 0 00000000000002c0 <LBB5_2>: 88: exit * x86_64 / bpftool prog dump xlated _Bool sk_dst_port__load_word(struct bpf_sock * sk): 33: (69) r1 = *(u16 *)(r1 +12) 34: (b4) w0 = 1 35: (16) if w1 == 0xfeca goto pc+1 36: (b4) w0 = 0 37: (95) exit This leads to surprises if we treat the destination register contents as a 32-bit value, ignoring the fact that in reality it contains a 16-bit value. On little-endian the register contents reflect the bpf_sock struct definition, where the lower 16-bits contain the port number: struct bpf_sock { ... __be16 dst_port; /* offset 48 */ __u16 :16; ... }; However, on big-endian the register contents suggest that the field layout of the bpf_sock struct is as follows: struct bpf_sock { ... __u16 :16; /* offset 48 */ __be16 dst_port; ... }; Account for this quirky access conversion in the test case exercising the 4-byte load by treating the result as 16-bit wide. 
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Martin KaFai Lau <kafai@fb.com> Link: https://lore.kernel.org/bpf/20220317113920.1068535-5-jakub@cloudflare.com
2022-03-17 12:39:20 +01:00
/*
 * NOTE: 4-byte load from bpf_sock at dst_port offset is quirky. It
 * gets rewritten by the access converter to a 2-byte load for
 * backward compatibility. Treating the load result as a be16 value
 * makes the code portable across little- and big-endian platforms.
 *
 * Returns true when the 4-byte load at the dst_port offset yields
 * bpf_htons(0xcafe).
 */
static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk)
{
	__u32 *word = (__u32 *)&sk->dst_port;

	return word[0] == bpf_htons(0xcafe);
}
static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk)
{
selftests/bpf: Fix s390 sock_field test failure llvm patch [1] enabled cross-function optimization for func arguments (ArgumentPromotion) at -O2 level. And this caused s390 sock_fields test failure ([2]). The failure is gone right now as patch [1] was reverted in [3]. But it is possible that patch [3] will be reverted again and then the test failure in [2] will show up again. So it is desirable to fix the failure regardless. The following is an analysis why sock_field test fails with llvm patch [1]. The main problem is in static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk) { __u32 *word = (__u32 *)&sk->dst_port; return word[0] == bpf_htons(0xcafe); } static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk) { __u16 *half = (__u16 *)&sk->dst_port; return half[0] == bpf_htons(0xcafe); } ... int read_sk_dst_port(struct __sk_buff *skb) { ... sk = skb->sk; ... if (!sk_dst_port__load_word(sk)) RET_LOG(); if (!sk_dst_port__load_half(sk)) RET_LOG(); ... } Through some cross-function optimization by ArgumentPromotion optimization, the compiler does: static __noinline bool sk_dst_port__load_word(__u32 word_val) { return word_val == bpf_htons(0xcafe); } static __noinline bool sk_dst_port__load_half(__u16 half_val) { return half_val == bpf_htons(0xcafe); } ... int read_sk_dst_port(struct __sk_buff *skb) { ... sk = skb->sk; ... __u32 *word = (__u32 *)&sk->dst_port; __u32 word_val = word[0]; ... if (!sk_dst_port__load_word(word_val)) RET_LOG(); __u16 half_val = word_val >> 16; if (!sk_dst_port__load_half(half_val)) RET_LOG(); ... } In current uapi bpf.h, we have struct bpf_sock { ... __be16 dst_port; /* network byte order */ __u16 :16; /* zero padding */ ... }; But the old kernel (e.g., 5.6) we have struct bpf_sock { ... __u32 dst_port; /* network byte order */ ... }; So for backward compatability reason, 4-byte load of dst_port is converted to 2-byte load internally. 
Specifically, 'word_val = word[0]' is replaced by 2-byte load by the verifier and this caused the trouble for later sk_dst_port__load_half() where half_val becomes 0. Typical usr program won't have such a code pattern tiggering the above bug, so let us fix the test failure with source code change. Adding an empty asm volatile statement seems enough to prevent undesired transformation. [1] https://reviews.llvm.org/D148269 [2] https://lore.kernel.org/bpf/e7f2c5e8-a50c-198d-8f95-388165f1e4fd@meta.com/ [3] https://reviews.llvm.org/rG141be5c062ecf22bd287afffd310e8ac4711444a Tested-by: Ilya Leoshkevich <iii@linux.ibm.com> Signed-off-by: Yonghong Song <yhs@fb.com> Link: https://lore.kernel.org/r/20230516214945.1013578-1-yhs@fb.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2023-05-16 14:49:45 -07:00
__u16 *half;
asm volatile ("");
half = (__u16 *)&sk->dst_port;
return half[0] == bpf_htons(0xcafe);
}
/* Returns true when the two bytes at dst_port, read one byte at a time,
 * are 0xca followed by 0xfe.
 */
static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk)
{
	__u8 *byte = (__u8 *)&sk->dst_port;

	if (byte[0] != 0xca)
		return false;

	return byte[1] == 0xfe;
}
/* cgroup_skb/egress program checking reads of bpf_sock's dst_port.
 *
 * For a socket in BPF_TCP_SYN_SENT state, dst_port is read with a 4-byte, a
 * 2-byte and two 1-byte loads, and each read is expected to match port 0xcafe
 * in network byte order.  NOTE(review): presumably the userspace side of the
 * test connects to port 0xcafe -- confirm against the test runner.
 *
 * Always returns CG_OK.  The first failing check is reported by RET_LOG(),
 * which stores its line number in linum_map[READ_SK_DST_PORT_LINUM_IDX].
 */
SEC("cgroup_skb/egress")
int read_sk_dst_port(struct __sk_buff *skb)
{
__u32 linum, linum_idx;
struct bpf_sock *sk;
/* Slot of linum_map that RET_LOG() writes to */
linum_idx = READ_SK_DST_PORT_LINUM_IDX;
sk = skb->sk;
if (!sk)
RET_LOG();
/* Ignore everything but the SYN from the client socket */
if (sk->state != BPF_TCP_SYN_SENT)
return CG_OK;
/* Each helper is __noinline, so every load width is checked in its own
 * function.
 */
if (!sk_dst_port__load_word(sk))
RET_LOG();
if (!sk_dst_port__load_half(sk))
RET_LOG();
if (!sk_dst_port__load_byte(sk))
RET_LOG();
return CG_OK;
}
/* License string of the BPF object, placed in the "license" section via SEC() */
char _license[] SEC("license") = "GPL";