
vmxnet3 driver's XDP handling is buggy for packet sizes using ring0 (that is, packet sizes between 128 - 3k bytes).

We noticed MTU-related connectivity issues with Cilium's service load-balancing in case of vmxnet3 as NIC underneath. A simple curl to a HTTP backend service where the XDP LB was doing IPIP encap led to overly large packet sizes but only for *some* of the packets (e.g. HTTP GET request) while others (e.g. the prior TCP 3WHS) looked completely fine on the wire.

In fact, the pcap recording on the backend node actually revealed that the node with the XDP LB was leaking uninitialized kernel data onto the wire for the affected packets. For example, while the packets should have been 152 bytes, their actual size was 1482 bytes, so the remainder after 152 bytes was padded with whatever other data was in that page at the time (e.g. we saw user/payload data from prior processed packets).

We only noticed this through an MTU issue, e.g. when the XDP LB node and the backend node both had the same MTU (e.g. 1500), then the curl request got dropped on the backend node's NIC given the packet was too large, even though the IPIP-encapped packet normally would never even come close to the MTU limit. Lowering the MTU on the XDP LB (e.g. 1480) allowed the curl request to succeed (which also indicates that the kernel ignored the padding, and thus the issue wasn't very user-visible).

Commit e127ce7699 ("vmxnet3: Fix missing reserved tailroom") was too eager to also switch xdp_prepare_buff() from rcd->len to rbi->len. It really needs to stick to rcd->len, which is the actual packet length from the descriptor. The latter we also feed into vmxnet3_process_xdp_small(), by the way, and it indicates the correct length needed to initialize the xdp->{data,data_end} parts. For e127ce7699 ("vmxnet3: Fix missing reserved tailroom") the relevant part was adapting xdp_init_buff() to address the warning, given that xdp_data_hard_end() depends on xdp->frame_sz. With that fixed, traffic on the wire looks good again.

Fixes: e127ce7699 ("vmxnet3: Fix missing reserved tailroom")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Andrew Sauber <andrew.sauber@isovalent.com>
Cc: Anton Protopopov <aspsk@isovalent.com>
Cc: William Tu <witu@nvidia.com>
Cc: Martin Zaharinov <micron10@gmail.com>
Cc: Ronak Doshi <ronak.doshi@broadcom.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250423133600.176689-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux driver for VMware's vmxnet3 ethernet NIC.
 * Copyright (C) 2008-2023, VMware, Inc. All Rights Reserved.
 * Maintained by: pv-drivers@vmware.com
 *
 */

#include "vmxnet3_int.h"
#include "vmxnet3_xdp.h"

static void
vmxnet3_xdp_exchange_program(struct vmxnet3_adapter *adapter,
			     struct bpf_prog *prog)
{
	rcu_assign_pointer(adapter->xdp_bpf_prog, prog);
}

static inline struct vmxnet3_tx_queue *
vmxnet3_xdp_get_tq(struct vmxnet3_adapter *adapter)
{
	struct vmxnet3_tx_queue *tq;
	int tq_number;
	int cpu;

	tq_number = adapter->num_tx_queues;
	cpu = smp_processor_id();
	if (likely(cpu < tq_number))
		tq = &adapter->tx_queue[cpu];
	else
		tq = &adapter->tx_queue[cpu % tq_number];

	return tq;
}

static int
vmxnet3_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf,
		struct netlink_ext_ack *extack)
{
	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
	struct bpf_prog *new_bpf_prog = bpf->prog;
	struct bpf_prog *old_bpf_prog;
	bool need_update;
	bool running;
	int err;

	if (new_bpf_prog && netdev->mtu > VMXNET3_XDP_MAX_MTU) {
		NL_SET_ERR_MSG_FMT_MOD(extack, "MTU %u too large for XDP",
				       netdev->mtu);
		return -EOPNOTSUPP;
	}

	if (adapter->netdev->features & NETIF_F_LRO) {
		NL_SET_ERR_MSG_MOD(extack, "LRO is not supported with XDP");
		adapter->netdev->features &= ~NETIF_F_LRO;
	}

	old_bpf_prog = rcu_dereference(adapter->xdp_bpf_prog);
	if (!new_bpf_prog && !old_bpf_prog)
		return 0;

	running = netif_running(netdev);
	need_update = !!old_bpf_prog != !!new_bpf_prog;

	if (running && need_update)
		vmxnet3_quiesce_dev(adapter);

	vmxnet3_xdp_exchange_program(adapter, new_bpf_prog);
	if (old_bpf_prog)
		bpf_prog_put(old_bpf_prog);

	if (!running || !need_update)
		return 0;

	if (new_bpf_prog)
		xdp_features_set_redirect_target(netdev, false);
	else
		xdp_features_clear_redirect_target(netdev);

	vmxnet3_reset_dev(adapter);
	vmxnet3_rq_destroy_all(adapter);
	vmxnet3_adjust_rx_ring_size(adapter);
	err = vmxnet3_rq_create_all(adapter);
	if (err) {
		NL_SET_ERR_MSG_MOD(extack,
				   "failed to re-create rx queues for XDP.");
		return -EOPNOTSUPP;
	}
	err = vmxnet3_activate_dev(adapter);
	if (err) {
		NL_SET_ERR_MSG_MOD(extack,
				   "failed to activate device for XDP.");
		return -EOPNOTSUPP;
	}
	clear_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state);

	return 0;
}

/* This is the main xdp call used by kernel to set/unset eBPF program. */
int
vmxnet3_xdp(struct net_device *netdev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case XDP_SETUP_PROG:
		return vmxnet3_xdp_set(netdev, bpf, bpf->extack);
	default:
		return -EINVAL;
	}

	return 0;
}

static int
vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter,
		       struct xdp_frame *xdpf,
		       struct vmxnet3_tx_queue *tq, bool dma_map)
{
	struct vmxnet3_tx_buf_info *tbi = NULL;
	union Vmxnet3_GenericDesc *gdesc;
	struct vmxnet3_tx_ctx ctx;
	int tx_num_deferred;
	struct page *page;
	u32 buf_size;
	u32 dw2;

	spin_lock_irq(&tq->tx_lock);
	dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
	dw2 |= xdpf->len;
	ctx.sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill;
	gdesc = ctx.sop_txd;

	buf_size = xdpf->len;
	tbi = tq->buf_info + tq->tx_ring.next2fill;

	if (vmxnet3_cmd_ring_desc_avail(&tq->tx_ring) == 0) {
		tq->stats.tx_ring_full++;
		spin_unlock_irq(&tq->tx_lock);
		return -ENOSPC;
	}

	tbi->map_type = VMXNET3_MAP_XDP;
	if (dma_map) { /* ndo_xdp_xmit */
		tbi->dma_addr = dma_map_single(&adapter->pdev->dev,
					       xdpf->data, buf_size,
					       DMA_TO_DEVICE);
		if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr)) {
			spin_unlock_irq(&tq->tx_lock);
			return -EFAULT;
		}
		tbi->map_type |= VMXNET3_MAP_SINGLE;
	} else { /* XDP buffer from page pool */
		page = virt_to_page(xdpf->data);
		tbi->dma_addr = page_pool_get_dma_addr(page) +
				(xdpf->data - (void *)xdpf);
		dma_sync_single_for_device(&adapter->pdev->dev,
					   tbi->dma_addr, buf_size,
					   DMA_TO_DEVICE);
	}
	tbi->xdpf = xdpf;
	tbi->len = buf_size;

	gdesc = tq->tx_ring.base + tq->tx_ring.next2fill;
	WARN_ON_ONCE(gdesc->txd.gen == tq->tx_ring.gen);

	gdesc->txd.addr = cpu_to_le64(tbi->dma_addr);
	gdesc->dword[2] = cpu_to_le32(dw2);

	/* Setup the EOP desc */
	gdesc->dword[3] = cpu_to_le32(VMXNET3_TXD_CQ | VMXNET3_TXD_EOP);

	gdesc->txd.om = 0;
	gdesc->txd.msscof = 0;
	gdesc->txd.hlen = 0;
	gdesc->txd.ti = 0;

	tx_num_deferred = le32_to_cpu(tq->shared->txNumDeferred);
	le32_add_cpu(&tq->shared->txNumDeferred, 1);
	tx_num_deferred++;

	vmxnet3_cmd_ring_adv_next2fill(&tq->tx_ring);

	/* set the last buf_info for the pkt */
	tbi->sop_idx = ctx.sop_txd - tq->tx_ring.base;

	dma_wmb();
	gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^
						  VMXNET3_TXD_GEN);
	spin_unlock_irq(&tq->tx_lock);

	/* No need to handle the case when tx_num_deferred doesn't reach
	 * threshold. Backend driver at hypervisor side will poll and reset
	 * tq->shared->txNumDeferred to 0.
	 */
	if (tx_num_deferred >= le32_to_cpu(tq->shared->txThreshold)) {
		tq->shared->txNumDeferred = 0;
		VMXNET3_WRITE_BAR0_REG(adapter,
				       VMXNET3_REG_TXPROD + tq->qid * 8,
				       tq->tx_ring.next2fill);
	}

	return 0;
}

static int
vmxnet3_xdp_xmit_back(struct vmxnet3_adapter *adapter,
		      struct xdp_frame *xdpf)
{
	struct vmxnet3_tx_queue *tq;
	struct netdev_queue *nq;
	int err;

	tq = vmxnet3_xdp_get_tq(adapter);
	if (tq->stopped)
		return -ENETDOWN;

	nq = netdev_get_tx_queue(adapter->netdev, tq->qid);

	__netif_tx_lock(nq, smp_processor_id());
	err = vmxnet3_xdp_xmit_frame(adapter, xdpf, tq, false);
	__netif_tx_unlock(nq);

	return err;
}

/* ndo_xdp_xmit */
int
vmxnet3_xdp_xmit(struct net_device *dev,
		 int n, struct xdp_frame **frames, u32 flags)
{
	struct vmxnet3_adapter *adapter = netdev_priv(dev);
	struct vmxnet3_tx_queue *tq;
	struct netdev_queue *nq;
	int i;

	if (unlikely(test_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state)))
		return -ENETDOWN;
	if (unlikely(test_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state)))
		return -EINVAL;

	tq = vmxnet3_xdp_get_tq(adapter);
	if (tq->stopped)
		return -ENETDOWN;

	nq = netdev_get_tx_queue(adapter->netdev, tq->qid);

	__netif_tx_lock(nq, smp_processor_id());
	for (i = 0; i < n; i++) {
		if (vmxnet3_xdp_xmit_frame(adapter, frames[i], tq, true)) {
			tq->stats.xdp_xmit_err++;
			break;
		}
	}
	tq->stats.xdp_xmit += i;
	__netif_tx_unlock(nq);

	return i;
}

static int
vmxnet3_run_xdp(struct vmxnet3_rx_queue *rq, struct xdp_buff *xdp,
		struct bpf_prog *prog)
{
	struct xdp_frame *xdpf;
	struct page *page;
	int err;
	u32 act;

	rq->stats.xdp_packets++;
	act = bpf_prog_run_xdp(prog, xdp);
	page = virt_to_page(xdp->data_hard_start);

	switch (act) {
	case XDP_PASS:
		return act;
	case XDP_REDIRECT:
		err = xdp_do_redirect(rq->adapter->netdev, xdp, prog);
		if (!err) {
			rq->stats.xdp_redirects++;
		} else {
			rq->stats.xdp_drops++;
			page_pool_recycle_direct(rq->page_pool, page);
		}
		return act;
	case XDP_TX:
		xdpf = xdp_convert_buff_to_frame(xdp);
		if (unlikely(!xdpf ||
			     vmxnet3_xdp_xmit_back(rq->adapter, xdpf))) {
			rq->stats.xdp_drops++;
			page_pool_recycle_direct(rq->page_pool, page);
		} else {
			rq->stats.xdp_tx++;
		}
		return act;
	default:
		bpf_warn_invalid_xdp_action(rq->adapter->netdev, prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(rq->adapter->netdev, prog, act);
		rq->stats.xdp_aborted++;
		break;
	case XDP_DROP:
		rq->stats.xdp_drops++;
		break;
	}

	page_pool_recycle_direct(rq->page_pool, page);

	return act;
}

static struct sk_buff *
vmxnet3_build_skb(struct vmxnet3_rx_queue *rq, struct page *page,
		  const struct xdp_buff *xdp)
{
	struct sk_buff *skb;

	skb = build_skb(page_address(page), PAGE_SIZE);
	if (unlikely(!skb)) {
		page_pool_recycle_direct(rq->page_pool, page);
		rq->stats.rx_buf_alloc_failure++;
		return NULL;
	}

	/* bpf prog might change len and data position. */
	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	skb_put(skb, xdp->data_end - xdp->data);
	skb_mark_for_recycle(skb);

	return skb;
}

/* Handle packets from DataRing. */
int
vmxnet3_process_xdp_small(struct vmxnet3_adapter *adapter,
			  struct vmxnet3_rx_queue *rq,
			  void *data, int len,
			  struct sk_buff **skb_xdp_pass)
{
	struct bpf_prog *xdp_prog;
	struct xdp_buff xdp;
	struct page *page;
	int act;

	page = page_pool_alloc_pages(rq->page_pool, GFP_ATOMIC);
	if (unlikely(!page)) {
		rq->stats.rx_buf_alloc_failure++;
		return XDP_DROP;
	}

	xdp_init_buff(&xdp, PAGE_SIZE, &rq->xdp_rxq);
	xdp_prepare_buff(&xdp, page_address(page), rq->page_pool->p.offset,
			 len, false);
	xdp_buff_clear_frags_flag(&xdp);

	/* Must copy the data because it's at dataring. */
	memcpy(xdp.data, data, len);

	xdp_prog = rcu_dereference(rq->adapter->xdp_bpf_prog);
	if (!xdp_prog) {
		act = XDP_PASS;
		goto out_skb;
	}
	act = vmxnet3_run_xdp(rq, &xdp, xdp_prog);
	if (act != XDP_PASS)
		return act;

out_skb:
	*skb_xdp_pass = vmxnet3_build_skb(rq, page, &xdp);
	if (!*skb_xdp_pass)
		return XDP_DROP;

	/* No need to refill. */
	return likely(*skb_xdp_pass) ? act : XDP_DROP;
}

int
vmxnet3_process_xdp(struct vmxnet3_adapter *adapter,
		    struct vmxnet3_rx_queue *rq,
		    struct Vmxnet3_RxCompDesc *rcd,
		    struct vmxnet3_rx_buf_info *rbi,
		    struct Vmxnet3_RxDesc *rxd,
		    struct sk_buff **skb_xdp_pass)
{
	struct bpf_prog *xdp_prog;
	dma_addr_t new_dma_addr;
	struct xdp_buff xdp;
	struct page *page;
	void *new_data;
	int act;

	page = rbi->page;
	dma_sync_single_for_cpu(&adapter->pdev->dev,
				page_pool_get_dma_addr(page) +
				rq->page_pool->p.offset, rbi->len,
				page_pool_get_dma_dir(rq->page_pool));

	xdp_init_buff(&xdp, PAGE_SIZE, &rq->xdp_rxq);
	xdp_prepare_buff(&xdp, page_address(page), rq->page_pool->p.offset,
			 rcd->len, false);
	xdp_buff_clear_frags_flag(&xdp);

	xdp_prog = rcu_dereference(rq->adapter->xdp_bpf_prog);
	if (!xdp_prog) {
		act = XDP_PASS;
		goto out_skb;
	}
	act = vmxnet3_run_xdp(rq, &xdp, xdp_prog);

	if (act == XDP_PASS) {
out_skb:
		*skb_xdp_pass = vmxnet3_build_skb(rq, page, &xdp);
		if (!*skb_xdp_pass)
			act = XDP_DROP;
	}

	new_data = vmxnet3_pp_get_buff(rq->page_pool, &new_dma_addr,
				       GFP_ATOMIC);
	if (!new_data) {
		rq->stats.rx_buf_alloc_failure++;
		return XDP_DROP;
	}
	rbi->page = virt_to_page(new_data);
	rbi->dma_addr = new_dma_addr;
	rxd->addr = cpu_to_le64(rbi->dma_addr);
	rxd->len = rbi->len;

	return act;
}