linux/net/bridge/br_nf_core.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Handle firewalling core
 *	Linux ethernet bridge
 *
 *	Authors:
 *	Lennert Buytenhek		<buytenh@gnu.org>
 *	Bart De Schuymer		<bdschuym@pandora.be>
 *
 *	Lennert dedicates this file to Kerstin Wurdinger.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <net/route.h>

#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
			     struct sk_buff *skb, u32 mtu,
			     bool confirm_neigh)
{
}

static void fake_redirect(struct dst_entry *dst, struct sock *sk,
			  struct sk_buff *skb)
{
}

static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	return NULL;
}

static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	return NULL;
}

static unsigned int fake_mtu(const struct dst_entry *dst)
{
	return dst->dev->mtu;
}

static struct dst_ops fake_dst_ops = {
	.family		= AF_INET,
	.update_pmtu	= fake_update_pmtu,
	.redirect	= fake_redirect,
	.cow_metrics	= fake_cow_metrics,
	.neigh_lookup	= fake_neigh_lookup,
	.mtu		= fake_mtu,
};

/*
 * Initialize bogus route table used to keep netfilter happy.
 * Currently, we fill in the PMTU entry because netfilter
 * refragmentation needs it, and the rt_flags entry because
 * ipt_REJECT needs it.  Future netfilter modules might
 * require us to fill additional fields.
 */
void br_netfilter_rtable_init(struct net_bridge *br)
{
	struct rtable *rt = &br->fake_rtable;

	rcuref_init(&rt->dst.__rcuref, 1);
	rt->dst.dev = br->dev;
	dst_init_metrics(&rt->dst, br->metrics, false);
	dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);
	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
	rt->dst.ops = &fake_dst_ops;
}

int __init br_nf_core_init(void)
{
	return dst_entries_init(&fake_dst_ops);
}

void br_nf_core_fini(void)
{
	dst_entries_destroy(&fake_dst_ops);
}
treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 3029 file(s). Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Allison Randal <allison@lohutok.net> Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070032.746973796@linutronix.de Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2019-05-27 08:55:01 +02:00			`// SPDX-License-Identifier: GPL-2.0-or-later`
netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Acked-by: Florian Westphal <fw@strlen.de> 2014-09-18 11:29:03 +02:00			`/*`
			`* Handle firewalling core`
			`* Linux ethernet bridge`
			`*`
			`* Authors:`
			`* Lennert Buytenhek <buytenh@gnu.org>`
			`* Bart De Schuymer <bdschuym@pandora.be>`
			`*`
			`* Lennert dedicates this file to Kerstin Wurdinger.`
			`*/`

			`#include <linux/module.h>`
			`#include <linux/kernel.h>`
			`#include <linux/in_route.h>`
			`#include <linux/inetdevice.h>`
			`#include <net/route.h>`

			`#include "br_private.h"`
			`#ifdef CONFIG_SYSCTL`
			`#include <linux/sysctl.h>`
			`#endif`

			`static void fake_update_pmtu(struct dst_entry dst, struct sock sk,`
net: add bool confirm_neigh parameter for dst_ops.update_pmtu The MTU update code is supposed to be invoked in response to real networking events that update the PMTU. In IPv6 PMTU update function __ip6_rt_update_pmtu() we called dst_confirm_neigh() to update neighbor confirmed time. But for tunnel code, it will call pmtu before xmit, like: - tnl_update_pmtu() - skb_dst_update_pmtu() - ip6_rt_update_pmtu() - __ip6_rt_update_pmtu() - dst_confirm_neigh() If the tunnel remote dst mac address changed and we still do the neigh confirm, we will not be able to update neigh cache and ping6 remote will failed. So for this ip_tunnel_xmit() case, _EVEN_ if the MTU is changed, we should not be invoking dst_confirm_neigh() as we have no evidence of successful two-way communication at this point. On the other hand it is also important to keep the neigh reachability fresh for TCP flows, so we cannot remove this dst_confirm_neigh() call. To fix the issue, we have to add a new bool parameter for dst_ops.update_pmtu to choose whether we should do neigh update or not. I will add the parameter in this patch and set all the callers to true to comply with the previous way, and fix the tunnel code one by one on later patches. v5: No change. v4: No change. v3: Do not remove dst_confirm_neigh, but add a new bool parameter in dst_ops.update_pmtu to control whether we should do neighbor confirm. Also split the big patch to small ones for each area. v2: Remove dst_confirm_neigh in __ip6_rt_update_pmtu. Suggested-by: David Miller <davem@davemloft.net> Reviewed-by: Guillaume Nault <gnault@redhat.com> Acked-by: David Ahern <dsahern@gmail.com> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2019-12-22 10:51:09 +08:00			`struct sk_buff *skb, u32 mtu,`
			`bool confirm_neigh)`
netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Acked-by: Florian Westphal <fw@strlen.de> 2014-09-18 11:29:03 +02:00			`{`
			`}`

			`static void fake_redirect(struct dst_entry dst, struct sock sk,`
			`struct sk_buff *skb)`
			`{`
			`}`

			`static u32 fake_cow_metrics(struct dst_entry dst, unsigned long old)`
			`{`
			`return NULL;`
			`}`

			`static struct neighbour fake_neigh_lookup(const struct dst_entry dst,`
			`struct sk_buff *skb,`
			`const void *daddr)`
			`{`
			`return NULL;`
			`}`

			`static unsigned int fake_mtu(const struct dst_entry *dst)`
			`{`
			`return dst->dev->mtu;`
			`}`

			`static struct dst_ops fake_dst_ops = {`
			`.family = AF_INET,`
			`.update_pmtu = fake_update_pmtu,`
			`.redirect = fake_redirect,`
			`.cow_metrics = fake_cow_metrics,`
			`.neigh_lookup = fake_neigh_lookup,`
			`.mtu = fake_mtu,`
			`};`

			`/*`
			`* Initialize bogus route table used to keep netfilter happy.`
			`* Currently, we fill in the PMTU entry because netfilter`
			`* refragmentation needs it, and the rt_flags entry because`
			`* ipt_REJECT needs it. Future netfilter modules might`
			`* require us to fill additional fields.`
			`*/`
			`void br_netfilter_rtable_init(struct net_bridge *br)`
			`{`
			`struct rtable *rt = &br->fake_rtable;`

net: dst: Switch to rcuref_t reference counting Under high contention dst_entry::__refcnt becomes a significant bottleneck. atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into high retry rates on contention. Switch the reference count to rcuref_t which results in a significant performance gain. Rename the reference count member to __rcuref to reflect the change. The gain depends on the micro-architecture and the number of concurrent operations and has been measured in the range of +25% to +130% with a localhost memtier/memcached benchmark which amplifies the problem massively. Running the memtier/memcached benchmark over a real (1Gb) network connection the conversion on top of the false sharing fix for struct dst_entry::__refcnt results in a total gain in the 2%-5% range over the upstream baseline. Reported-by: Wangyang Guo <wangyang.guo@intel.com> Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de Signed-off-by: Jakub Kicinski <kuba@kernel.org> 2023-03-23 21:55:32 +01:00			`rcuref_init(&rt->dst.__rcuref, 1);`
netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Acked-by: Florian Westphal <fw@strlen.de> 2014-09-18 11:29:03 +02:00			`rt->dst.dev = br->dev;`
bridge: netfilter: Fix forwarding of fragmented packets When netfilter defrag hooks are loaded (due to the presence of conntrack rules, for example), fragmented packets entering the bridge will be defragged by the bridge's pre-routing hook (br_nf_pre_routing() -> ipv4_conntrack_defrag()). Later on, in the bridge's post-routing hook, the defragged packet will be fragmented again. If the size of the largest fragment is larger than what the kernel has determined as the destination MTU (using ip_skb_dst_mtu()), the defragged packet will be dropped. Before commit ac6627a28dbf ("net: ipv4: Consolidate ipv4_mtu and ip_dst_mtu_maybe_forward"), ip_skb_dst_mtu() would return dst_mtu() as the destination MTU. Assuming the dst entry attached to the packet is the bridge's fake rtable one, this would simply be the bridge's MTU (see fake_mtu()). However, after above mentioned commit, ip_skb_dst_mtu() ends up returning the route's MTU stored in the dst entry's metrics. Ideally, in case the dst entry is the bridge's fake rtable one, this should be the bridge's MTU as the bridge takes care of updating this metric when its MTU changes (see br_change_mtu()). Unfortunately, the last operation is a no-op given the metrics attached to the fake rtable entry are marked as read-only. Therefore, ip_skb_dst_mtu() ends up returning 1500 (the initial MTU value) and defragged packets are dropped during fragmentation when dealing with large fragments and high MTU (e.g., 9k). Fix by moving the fake rtable entry's metrics to be per-bridge (in a similar fashion to the fake rtable entry itself) and marking them as writable, thereby allowing MTU changes to be reflected. Fixes: 62fa8a846d7d ("net: Implement read-only protection and COW'ing of metrics.") Fixes: 33eb9873a283 ("bridge: initialize fake_rtable metrics") Reported-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> Closes: https://lore.kernel.org/netdev/PH0PR10MB4504888284FF4CBA648197D0ACB82@PH0PR10MB4504.namprd10.prod.outlook.com/ Tested-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Acked-by: Nikolay Aleksandrov <razor@blackwall.org> Link: https://patch.msgid.link/20250515084848.727706-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> 2025-05-15 11:48:48 +03:00			`dst_init_metrics(&rt->dst, br->metrics, false);`
			`dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);`
netfilter: bridge: move br_netfilter out of the core Jesper reported that br_netfilter always registers the hooks since this is part of the bridge core. This harms performance for people that don't need this. This patch modularizes br_netfilter so it can be rmmod'ed, thus, the hooks can be unregistered. I think the bridge netfilter should have been a separated module since the beginning, Patrick agreed on that. Note that this is breaking compatibility for users that expect that bridge netfilter is going to be available after explicitly 'modprobe bridge' or via automatic load through brctl. However, the damage can be easily undone by modprobing br_netfilter. The bridge core also spots a message to provide a clue to people that didn't notice that this has been deprecated. On top of that, the plan is that nftables will not rely on this software layer, but integrate the connection tracking into the bridge layer to enable stateful filtering and NAT, which is was bridge netfilter users seem to require. This patch still keeps the fake_dst_ops in the bridge core, since this is required by when the bridge port is initialized. So we can safely modprobe/rmmod br_netfilter anytime. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Acked-by: Florian Westphal <fw@strlen.de> 2014-09-18 11:29:03 +02:00			`rt->dst.flags = DST_NOXFRM \| DST_FAKE_RTABLE;`
			`rt->dst.ops = &fake_dst_ops;`
			`}`

			`int __init br_nf_core_init(void)`
			`{`
			`return dst_entries_init(&fake_dst_ops);`
			`}`

			`void br_nf_core_fini(void)`
			`{`
			`dst_entries_destroy(&fake_dst_ops);`
			`}`