linux/include/net/gro.h

605 lines
16 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_GRO_H
#define _NET_GRO_H
#include <linux/indirect_call_wrapper.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
#include <linux/skbuff.h>
#include <net/udp.h>
#include <net/hotdata.h>
net: allow small head cache usage with large MAX_SKB_FRAGS values Sabrina reported the following splat: WARNING: CPU: 0 PID: 1 at net/core/dev.c:6935 netif_napi_add_weight_locked+0x8f2/0xba0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0-rc1-net-00092-g011b03359038 #996 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:netif_napi_add_weight_locked+0x8f2/0xba0 Code: e8 c3 e6 6a fe 48 83 c4 28 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc c7 44 24 10 ff ff ff ff e9 8f fb ff ff e8 9e e6 6a fe <0f> 0b e9 d3 fe ff ff e8 92 e6 6a fe 48 8b 04 24 be ff ff ff ff 48 RSP: 0000:ffffc9000001fc60 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88806ce48128 RCX: 1ffff11001664b9e RDX: ffff888008f00040 RSI: ffffffff8317ca42 RDI: ffff88800b325cb6 RBP: ffff88800b325c40 R08: 0000000000000001 R09: ffffed100167502c R10: ffff88800b3a8163 R11: 0000000000000000 R12: ffff88800ac1c168 R13: ffff88800ac1c168 R14: ffff88800ac1c168 R15: 0000000000000007 FS: 0000000000000000(0000) GS:ffff88806ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888008201000 CR3: 0000000004c94001 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> gro_cells_init+0x1ba/0x270 xfrm_input_init+0x4b/0x2a0 xfrm_init+0x38/0x50 ip_rt_init+0x2d7/0x350 ip_init+0xf/0x20 inet_init+0x406/0x590 do_one_initcall+0x9d/0x2e0 do_initcalls+0x23b/0x280 kernel_init_freeable+0x445/0x490 kernel_init+0x20/0x1d0 ret_from_fork+0x46/0x80 ret_from_fork_asm+0x1a/0x30 </TASK> irq event stamp: 584330 hardirqs last enabled at (584338): [<ffffffff8168bf87>] __up_console_sem+0x77/0xb0 hardirqs last disabled at (584345): [<ffffffff8168bf6c>] __up_console_sem+0x5c/0xb0 softirqs last enabled at (583242): [<ffffffff833ee96d>] netlink_insert+0x14d/0x470 softirqs last disabled at (583754): [<ffffffff8317c8cd>] netif_napi_add_weight_locked+0x77d/0xba0 on kernel built with MAX_SKB_FRAGS=45, where SKB_WITH_OVERHEAD(1024) is smaller than GRO_MAX_HEAD. Such built additionally contains the revert of the single page frag cache so that napi_get_frags() ends up using the page frag allocator, triggering the splat. Note that the underlying issue is independent from the mentioned revert; address it ensuring that the small head cache will fit either TCP and GRO allocation and updating napi_alloc_skb() and __netdev_alloc_skb() to select kmalloc() usage for any allocation fitting such cache. Reported-by: Sabrina Dubroca <sd@queasysnail.net> Suggested-by: Eric Dumazet <edumazet@google.com> Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-02-18 19:29:39 +01:00
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
struct napi_gro_cb {
union {
struct {
/* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */
void *frag0;
/* Length of frag0. */
unsigned int frag0_len;
};
struct {
/* used in skb_gro_receive() slow path */
struct sk_buff *last;
/* jiffies when first packet was created/queued */
unsigned long age;
};
};
/* This indicates where we are processing relative to skb->data. */
int data_offset;
/* This is non-zero if the packet cannot be merged with the new skb. */
u16 flush;
/* Number of segments aggregated. */
u16 count;
xfrm: Support GRO for IPv4 ESP in UDP encapsulation This patch enables the GRO codepath for IPv4 ESP in UDP encapsulated packets. Decapsulation happens at L2 and saves a full round through the stack for each packet. This is also needed to support HW offload for ESP in UDP encapsulation. Enabling this would imporove performance for ESP in UDP datapath, i.e IPsec with NAT in between. By default GRP for ESP-in-UDP is disabled for UDP sockets. To enable this feature for an ESP socket, the following two options need to be set: 1. enable ESP-in-UDP: (this is already set by an IKE daemon). int type = UDP_ENCAP_ESPINUDP; setsockopt(fd, SOL_UDP, UDP_ENCAP, &type, sizeof(type)); 2. To enable GRO for ESP in UDP socket: type = true; setsockopt(fd, SOL_UDP, UDP_GRO, &type, sizeof(type)); Enabling ESP-in-UDP has the side effect of preventing the Linux stack from seeing ESP packets at the L3 (when ESP OFFLOAD is disabled), as packets are immediately decapsulated from UDP and decrypted. This change may affect nftable rules that match on ESP packets at L3. Also tcpdump won't see the ESP packet. Developers/admins are advised to review and adapt any nftable rules accordingly before enabling this feature to prevent potential rule breakage. Also tcpdump will not see from ESP packets from a ESP in UDP flow, when this is enabled. Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> Co-developed-by: Antony Antony <antony.antony@secunet.com> Signed-off-by: Antony Antony <antony.antony@secunet.com> Reviewed-by: Eyal Birger <eyal.birger@gmail.com>
2023-10-04 15:05:27 +02:00
/* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */
u16 proto;
u16 pad;
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment {inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows. These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow. This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb. This results in less parsing code for non-loop flush tests for TCP and UDP flows. To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization). perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive Signed-off-by: Richard Gobert <richardbgobert@gmail.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-09 21:08:18 +02:00
/* Used in napi_gro_cb::free */
#define NAPI_GRO_FREE 1
#define NAPI_GRO_FREE_STOLEN_HEAD 2
/* portion of the cb set to zero at every gro iteration */
struct_group(zeroed,
/* Start offset for remote checksum offload */
u16 gro_remcsum_start;
/* This is non-zero if the packet may be of the same flow. */
u8 same_flow:1;
/* Used in tunnel GRO receive */
u8 encap_mark:1;
/* GRO checksum is valid */
u8 csum_valid:1;
/* Number of checksums via CHECKSUM_UNNECESSARY */
u8 csum_cnt:3;
/* Free the skb? */
u8 free:2;
/* Used in foo-over-udp, set in udp[46]_gro_receive */
u8 is_ipv6:1;
/* Used in GRE, set in fou/gue_gro_receive */
u8 is_fou:1;
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment {inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows. These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow. This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb. This results in less parsing code for non-loop flush tests for TCP and UDP flows. To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization). perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive Signed-off-by: Richard Gobert <richardbgobert@gmail.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-09 21:08:18 +02:00
/* Used to determine if ipid_offset can be ignored */
u8 ip_fixedid:1;
/* Number of gro_receive callbacks this packet already went through */
u8 recursion_counter:4;
/* GRO is done by frag_list pointer chaining. */
u8 is_flist:1;
);
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
net: gro: fix udp bad offset in socket lookup by adding {inner_}network_offset to napi_gro_cb Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp: additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the complete phase of gro. The functions always return skb->network_header, which in the case of encapsulated packets at the gro complete phase, is always set to the innermost L3 of the packet. That means that calling {ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_ L3/L4 may return an unexpected value. This incorrect usage leads to a bug in GRO's UDP socket lookup. udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These *_hdr functions return network_header which will point to the innermost L3, resulting in the wrong offset being used in __udp{4,6}_lib_lookup with encapsulated packets. This patch adds network_offset and inner_network_offset to napi_gro_cb, and makes sure both are set correctly. To fix the issue, network_offsets union is used inside napi_gro_cb, in which both the outer and the inner network offsets are saved. Reproduction example: Endpoint configuration example (fou + local address bind) # ip fou add port 6666 ipproto 4 # ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip # ip link set tun1 up # ip a add 1.1.1.2/24 dev tun1 Netperf TCP_STREAM result on net-next before patch is applied: net-next main, GRO enabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.28 2.37 net-next main, GRO disabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.01 2745.06 patch applied, GRO enabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.01 2877.38 Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") Signed-off-by: Richard Gobert <richardbgobert@gmail.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2024-04-30 16:35:54 +02:00
/* L3 offsets */
union {
struct {
u16 network_offset;
u16 inner_network_offset;
};
u16 network_offsets[2];
};
};
#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
#define GRO_RECURSION_LIMIT 15
static inline int gro_recursion_inc_test(struct sk_buff *skb)
{
return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
}
typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
struct list_head *head,
struct sk_buff *skb)
{
if (unlikely(gro_recursion_inc_test(skb))) {
NAPI_GRO_CB(skb)->flush |= 1;
return NULL;
}
return cb(head, skb);
}
typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
struct sk_buff *);
static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
if (unlikely(gro_recursion_inc_test(skb))) {
NAPI_GRO_CB(skb)->flush |= 1;
return NULL;
}
return cb(sk, head, skb);
}
static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
{
return NAPI_GRO_CB(skb)->data_offset;
}
static inline unsigned int skb_gro_len(const struct sk_buff *skb)
{
return skb->len - NAPI_GRO_CB(skb)->data_offset;
}
static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
{
NAPI_GRO_CB(skb)->data_offset += len;
}
static inline void *skb_gro_header_fast(const struct sk_buff *skb,
unsigned int offset)
{
return NAPI_GRO_CB(skb)->frag0 + offset;
}
static inline bool skb_gro_may_pull(const struct sk_buff *skb,
unsigned int hlen)
{
return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len);
}
static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
unsigned int offset)
{
if (!pskb_may_pull(skb, hlen))
return NULL;
return skb->data + offset;
}
static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen,
unsigned int offset)
{
void *ptr;
ptr = skb_gro_header_fast(skb, offset);
if (!skb_gro_may_pull(skb, hlen))
ptr = skb_gro_header_slow(skb, hlen, offset);
return ptr;
}
static inline int skb_gro_receive_network_offset(const struct sk_buff *skb)
{
return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark];
}
static inline void *skb_gro_network_header(const struct sk_buff *skb)
{
if (skb_gro_may_pull(skb, skb_gro_offset(skb)))
return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb));
return skb->data + skb_gro_receive_network_offset(skb);
}
static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb,
int proto)
{
const struct iphdr *iph = skb_gro_network_header(skb);
return csum_tcpudp_nofold(iph->saddr, iph->daddr,
skb_gro_len(skb), proto, 0);
}
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
{
if (NAPI_GRO_CB(skb)->csum_valid)
NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len,
wsum_negate(NAPI_GRO_CB(skb)->csum)));
}
/* GRO checksum functions. These are logical equivalents of the normal
* checksum functions (in skbuff.h) except that they operate on the GRO
* offsets and fields in sk_buff.
*/
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
{
return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
}
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
bool zero_okay,
__sum16 check)
{
return ((skb->ip_summed != CHECKSUM_PARTIAL ||
skb_checksum_start_offset(skb) <
skb_gro_offset(skb)) &&
!skb_at_gro_remcsum_start(skb) &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
(!zero_okay || check));
}
static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
__wsum psum)
{
if (NAPI_GRO_CB(skb)->csum_valid &&
!csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum)))
return 0;
NAPI_GRO_CB(skb)->csum = psum;
return __skb_gro_checksum_complete(skb);
}
static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
{
if (NAPI_GRO_CB(skb)->csum_cnt > 0) {
/* Consume a checksum from CHECKSUM_UNNECESSARY */
NAPI_GRO_CB(skb)->csum_cnt--;
} else {
/* Update skb for CHECKSUM_UNNECESSARY and csum_level when we
* verified a new top level checksum or an encapsulated one
* during GRO. This saves work if we fallback to normal path.
*/
__skb_incr_checksum_unnecessary(skb);
}
}
#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \
compute_pseudo) \
({ \
__sum16 __ret = 0; \
if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \
__ret = __skb_gro_checksum_validate_complete(skb, \
compute_pseudo(skb, proto)); \
if (!__ret) \
skb_gro_incr_csum_unnecessary(skb); \
__ret; \
})
#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \
__skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)
#define skb_gro_checksum_validate_zero_check(skb, proto, check, \
compute_pseudo) \
__skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)
#define skb_gro_checksum_simple_validate(skb) \
__skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)
static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb)
{
return (NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid);
}
static inline void __skb_gro_checksum_convert(struct sk_buff *skb,
__wsum pseudo)
{
NAPI_GRO_CB(skb)->csum = ~pseudo;
NAPI_GRO_CB(skb)->csum_valid = 1;
}
#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \
do { \
if (__skb_gro_checksum_convert_check(skb)) \
__skb_gro_checksum_convert(skb, \
compute_pseudo(skb, proto)); \
} while (0)
struct gro_remcsum {
int offset;
__wsum delta;
};
static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
{
grc->offset = 0;
grc->delta = 0;
}
static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
unsigned int off, size_t hdrlen,
int start, int offset,
struct gro_remcsum *grc,
bool nopartial)
{
__wsum delta;
size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
if (!nopartial) {
NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
return ptr;
}
ptr = skb_gro_header(skb, off + plen, off);
if (!ptr)
return NULL;
delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
start, offset);
/* Adjust skb->csum since we changed the packet */
NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
grc->offset = off + hdrlen + offset;
grc->delta = delta;
return ptr;
}
static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
struct gro_remcsum *grc)
{
void *ptr;
size_t plen = grc->offset + sizeof(u16);
if (!grc->delta)
return;
ptr = skb_gro_header(skb, plen, grc->offset);
if (!ptr)
return;
remcsum_unadjust((__sum16 *)ptr, grc->delta);
}
#ifdef CONFIG_XFRM_OFFLOAD
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
if (PTR_ERR(pp) != -EINPROGRESS)
NAPI_GRO_CB(skb)->flush |= flush;
}
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
struct sk_buff *pp,
int flush,
struct gro_remcsum *grc)
{
if (PTR_ERR(pp) != -EINPROGRESS) {
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_remcsum_cleanup(skb, grc);
skb->remcsum_offload = 0;
}
}
#else
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
NAPI_GRO_CB(skb)->flush |= flush;
}
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
struct sk_buff *pp,
int flush,
struct gro_remcsum *grc)
{
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_remcsum_cleanup(skb, grc);
skb->remcsum_offload = 0;
}
#endif
INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \
({ \
unlikely(gro_recursion_inc_test(skb)) ? \
NAPI_GRO_CB(skb)->flush |= 1, NULL : \
INDIRECT_CALL_INET(cb, f2, f1, head, skb); \
})
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, struct sock *sk);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
struct udphdr *uh;
unsigned int hlen, off;
off = skb_gro_offset(skb);
hlen = off + sizeof(*uh);
uh = skb_gro_header(skb, hlen, off);
return uh;
}
static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
int proto)
{
const struct ipv6hdr *iph = skb_gro_network_header(skb);
return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
skb_gro_len(skb), proto, 0));
}
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment {inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows. These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow. This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb. This results in less parsing code for non-loop flush tests for TCP and UDP flows. To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization). perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive Signed-off-by: Richard Gobert <richardbgobert@gmail.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-05-09 21:08:18 +02:00
static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
struct sk_buff *p, bool outer)
{
const u32 id = ntohl(*(__be32 *)&iph->id);
const u32 id2 = ntohl(*(__be32 *)&iph2->id);
const u16 ipid_offset = (id >> 16) - (id2 >> 16);
const u16 count = NAPI_GRO_CB(p)->count;
const u32 df = id & IP_DF;
int flush;
/* All fields must match except length and checksum. */
flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
if (flush | (outer && df))
return flush;
/* When we receive our second frame we can make a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
if (count == 1 && df && !ipid_offset)
NAPI_GRO_CB(p)->ip_fixedid = true;
return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid);
}
static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2)
{
/* <Version:4><Traffic_Class:8><Flow_Label:20> */
__be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
/* Flush if Traffic Class fields are different. */
return !!((first_word & htonl(0x0FF00000)) |
(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
}
static inline int __gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p, const u16 diff,
bool outer)
{
const void *nh = th - diff;
const void *nh2 = th2 - diff;
if (((struct iphdr *)nh)->version == 6)
return ipv6_gro_flush(nh, nh2);
else
return inet_gro_flush(nh, nh2, p, outer);
}
static inline int gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p)
{
const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
int off = skb_transport_offset(p);
int flush;
flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark);
if (encap_mark)
flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false);
return flush;
}
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
void __gro_flush(struct gro_node *gro, bool flush_old);
static inline void gro_flush(struct gro_node *gro, bool flush_old)
{
if (!gro->bitmask)
return;
__gro_flush(gro, flush_old);
}
static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
gro_flush(&napi->gro, flush_old);
}
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static inline void gro_normal_list(struct gro_node *gro)
{
if (!gro->rx_count)
return;
netif_receive_skb_list_internal(&gro->rx_list);
INIT_LIST_HEAD(&gro->rx_list);
gro->rx_count = 0;
}
static inline void gro_flush_normal(struct gro_node *gro, bool flush_old)
{
gro_flush(gro, flush_old);
gro_normal_list(gro);
}
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb,
int segs)
{
list_add_tail(&skb->list, &gro->rx_list);
gro->rx_count += segs;
if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch))
gro_normal_list(gro);
}
void gro_init(struct gro_node *gro);
void gro_cleanup(struct gro_node *gro);
/* This function is the alternative of 'inet_iif' and 'inet_sdif'
* functions in case we can not rely on fields of IPCB.
*
* The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
* The caller must hold the RCU read lock.
*/
static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
{
*iif = inet_iif(skb) ?: skb->dev->ifindex;
*sdif = 0;
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
if (netif_is_l3_slave(skb->dev)) {
struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
*sdif = *iif;
*iif = master ? master->ifindex : 0;
}
#endif
}
/* This function is the alternative of 'inet6_iif' and 'inet6_sdif'
* functions in case we can not rely on fields of IP6CB.
*
* The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
* The caller must hold the RCU read lock.
*/
static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
{
/* using skb->dev->ifindex because skb_dst(skb) is not initialized */
*iif = skb->dev->ifindex;
*sdif = 0;
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
if (netif_is_l3_slave(skb->dev)) {
struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
*sdif = *iif;
*iif = master ? master->ifindex : 0;
}
#endif
}
struct packet_offload *gro_find_receive_by_type(__be16 type);
struct packet_offload *gro_find_complete_by_type(__be16 type);
#endif /* _NET_GRO_H */