mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

When netfilter defrag hooks are loaded (due to the presence of conntrack rules, for example), fragmented packets entering the bridge will be defragged by the bridge's pre-routing hook (br_nf_pre_routing() -> ipv4_conntrack_defrag()). Later on, in the bridge's post-routing hook, the defragged packet will be fragmented again. If the size of the largest fragment is larger than what the kernel has determined as the destination MTU (using ip_skb_dst_mtu()), the defragged packet will be dropped. Before commit ac6627a28d
("net: ipv4: Consolidate ipv4_mtu and ip_dst_mtu_maybe_forward"), ip_skb_dst_mtu() would return dst_mtu() as the destination MTU. Assuming the dst entry attached to the packet is the bridge's fake rtable one, this would simply be the bridge's MTU (see fake_mtu()). However, after the above-mentioned commit, ip_skb_dst_mtu() ends up returning the route's MTU stored in the dst entry's metrics. Ideally, in case the dst entry is the bridge's fake rtable one, this should be the bridge's MTU as the bridge takes care of updating this metric when its MTU changes (see br_change_mtu()). Unfortunately, the last operation is a no-op given the metrics attached to the fake rtable entry are marked as read-only. Therefore, ip_skb_dst_mtu() ends up returning 1500 (the initial MTU value) and defragged packets are dropped during fragmentation when dealing with large fragments and high MTU (e.g., 9k). Fix by moving the fake rtable entry's metrics to be per-bridge (in a similar fashion to the fake rtable entry itself) and marking them as writable, thereby allowing MTU changes to be reflected. Fixes: 62fa8a846d
("net: Implement read-only protection and COW'ing of metrics.") Fixes: 33eb9873a2
("bridge: initialize fake_rtable metrics") Reported-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> Closes: https://lore.kernel.org/netdev/PH0PR10MB4504888284FF4CBA648197D0ACB82@PH0PR10MB4504.namprd10.prod.outlook.com/ Tested-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Acked-by: Nikolay Aleksandrov <razor@blackwall.org> Link: https://patch.msgid.link/20250515084848.727706-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
88 lines
2 KiB
C
88 lines
2 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* Handle firewalling core
|
|
* Linux ethernet bridge
|
|
*
|
|
* Authors:
|
|
* Lennert Buytenhek <buytenh@gnu.org>
|
|
* Bart De Schuymer <bdschuym@pandora.be>
|
|
*
|
|
* Lennert dedicates this file to Kerstin Wurdinger.
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/in_route.h>
|
|
#include <linux/inetdevice.h>
|
|
#include <net/route.h>
|
|
|
|
#include "br_private.h"
|
|
#ifdef CONFIG_SYSCTL
|
|
#include <linux/sysctl.h>
|
|
#endif
|
|
|
|
/*
 * .update_pmtu callback of fake_dst_ops.
 *
 * Deliberately a no-op: all arguments are ignored and no state is touched.
 */
static void fake_update_pmtu(struct dst_entry *dst,
			     struct sock *sk,
			     struct sk_buff *skb,
			     u32 mtu,
			     bool confirm_neigh)
{
	/* Nothing to do. */
}
|
|
|
|
/*
 * .redirect callback of fake_dst_ops.
 *
 * Deliberately a no-op: all arguments are ignored and no state is touched.
 */
static void fake_redirect(struct dst_entry *dst,
			  struct sock *sk,
			  struct sk_buff *skb)
{
	/* Nothing to do. */
}
|
|
|
|
/*
 * .cow_metrics callback of fake_dst_ops.
 *
 * Both arguments are ignored; the function unconditionally returns NULL,
 * i.e. it never hands a metrics array back to the caller.
 */
static u32 *fake_cow_metrics(struct dst_entry *dst,
			     unsigned long old)
{
	return NULL;
}
|
|
|
|
/*
 * .neigh_lookup callback of fake_dst_ops.
 *
 * All arguments are ignored; the function unconditionally returns NULL,
 * so no neighbour entry is ever produced through this callback.
 */
static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	return NULL;
}
|
|
|
|
static unsigned int fake_mtu(const struct dst_entry *dst)
|
|
{
|
|
return dst->dev->mtu;
|
|
}
|
|
|
|
static struct dst_ops fake_dst_ops = {
|
|
.family = AF_INET,
|
|
.update_pmtu = fake_update_pmtu,
|
|
.redirect = fake_redirect,
|
|
.cow_metrics = fake_cow_metrics,
|
|
.neigh_lookup = fake_neigh_lookup,
|
|
.mtu = fake_mtu,
|
|
};
|
|
|
|
/*
|
|
* Initialize bogus route table used to keep netfilter happy.
|
|
* Currently, we fill in the PMTU entry because netfilter
|
|
* refragmentation needs it, and the rt_flags entry because
|
|
* ipt_REJECT needs it. Future netfilter modules might
|
|
* require us to fill additional fields.
|
|
*/
|
|
void br_netfilter_rtable_init(struct net_bridge *br)
|
|
{
|
|
struct rtable *rt = &br->fake_rtable;
|
|
|
|
rcuref_init(&rt->dst.__rcuref, 1);
|
|
rt->dst.dev = br->dev;
|
|
dst_init_metrics(&rt->dst, br->metrics, false);
|
|
dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);
|
|
rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
|
|
rt->dst.ops = &fake_dst_ops;
|
|
}
|
|
|
|
/*
 * Module-init time setup for the fake dst_ops.
 *
 * Returns the result of dst_entries_init() unchanged (presumably 0 on
 * success and a negative errno otherwise -- confirm against dst_ops.h).
 */
int __init br_nf_core_init(void)
{
	int err;

	err = dst_entries_init(&fake_dst_ops);

	return err;
}
|
|
|
|
void br_nf_core_fini(void)
|
|
{
|
|
dst_entries_destroy(&fake_dst_ops);
|
|
}
|