// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018, Intel Corporation. */

/* The driver transmit and receive code */

#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/prefetch.h>
#include <linux/bpf_trace.h>
#include <net/dsfield.h>
#include <net/mpls.h>
#include <net/xdp.h>
#include "ice_txrx_lib.h"
#include "ice_lib.h"
#include "ice.h"
#include "ice_trace.h"
#include "ice_dcb_lib.h"
#include "ice_xsk.h"
#include "ice_eswitch.h"

#define ICE_RX_HDR_SIZE		256

#define FDIR_DESC_RXDID		0x40
#define ICE_FDIR_CLEAN_DELAY	10

/**
 * ice_prgm_fdir_fltr - Program a Flow Director filter
 * @vsi: VSI to send dummy packet
 * @fdir_desc: flow director descriptor
 * @raw_packet: allocated buffer for flow director
 */
int
ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
		   u8 *raw_packet)
{
	struct ice_tx_buf *tx_buf, *first;
	struct ice_fltr_desc *f_desc;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_ring *tx_ring;
	struct device *dev;
	dma_addr_t dma;
	u32 td_cmd;
	u16 i;

	/* VSI and Tx ring */
	if (!vsi)
		return -ENOENT;
	tx_ring = vsi->tx_rings[0];
	if (!tx_ring || !tx_ring->desc)
		return -ENOENT;
	dev = tx_ring->dev;

	/* we are using two descriptors to add/del a filter and we can wait */
	for (i = ICE_FDIR_CLEAN_DELAY; ICE_DESC_UNUSED(tx_ring) < 2; i--) {
		if (!i)
			return -EAGAIN;
		msleep_interruptible(1);
	}

	dma = dma_map_single(dev, raw_packet, ICE_FDIR_MAX_RAW_PKT_SIZE,
			     DMA_TO_DEVICE);

	if (dma_mapping_error(dev, dma))
		return -EINVAL;

	/* grab the next descriptor */
	i = tx_ring->next_to_use;
	first = &tx_ring->tx_buf[i];
	f_desc = ICE_TX_FDIRDESC(tx_ring, i);
	memcpy(f_desc, fdir_desc, sizeof(*f_desc));
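
	/* the second (data) descriptor points at the dummy packet that
	 * accompanies the filter programming descriptor
	 */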
	i++;
	i = (i < tx_ring->count) ? i : 0;
	tx_desc = ICE_TX_DESC(tx_ring, i);
	tx_buf = &tx_ring->tx_buf[i];

	i++;
	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

	memset(tx_buf, 0, sizeof(*tx_buf));
	dma_unmap_len_set(tx_buf, len, ICE_FDIR_MAX_RAW_PKT_SIZE);
	dma_unmap_addr_set(tx_buf, dma, dma);

	tx_desc->buf_addr = cpu_to_le64(dma);
	td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
		 ICE_TX_DESC_CMD_RE;

	tx_buf->type = ICE_TX_BUF_DUMMY;
	tx_buf->raw_buf = raw_packet;

	tx_desc->cmd_type_offset_bsz =
		ice_build_ctob(td_cmd, 0, ICE_FDIR_MAX_RAW_PKT_SIZE, 0);

	/* Force memory write to complete before letting h/w know
	 * there are new descriptors to fetch.
	 */
	wmb();

	/* mark the data descriptor to be watched */
	first->next_to_watch = tx_desc;

	writel(tx_ring->next_to_use, tx_ring->tail);

	return 0;
}

/**
 * ice_unmap_and_free_tx_buf - Release a Tx buffer
 * @ring: the ring that owns the buffer
 * @tx_buf: the buffer to free
 */
static void
ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
{
	if (dma_unmap_len(tx_buf, len))
		dma_unmap_page(ring->dev,
			       dma_unmap_addr(tx_buf, dma),
			       dma_unmap_len(tx_buf, len),
			       DMA_TO_DEVICE);
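
	/* the buffer type records how the memory was attached to the ring and
	 * therefore how it must be released
	 */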
	switch (tx_buf->type) {
	case ICE_TX_BUF_DUMMY:
		devm_kfree(ring->dev, tx_buf->raw_buf);
		break;
	case ICE_TX_BUF_SKB:
		dev_kfree_skb_any(tx_buf->skb);
		break;
	case ICE_TX_BUF_XDP_TX:
		page_frag_free(tx_buf->raw_buf);
		break;
	case ICE_TX_BUF_XDP_XMIT:
		xdp_return_frame(tx_buf->xdpf);
		break;
	}

	tx_buf->next_to_watch = NULL;
	tx_buf->type = ICE_TX_BUF_EMPTY;
	dma_unmap_len_set(tx_buf, len, 0);
	/* tx_buf must be completely set up in the transmit path */
}
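
/**
 * txring_txq - Find the netdev Tx queue backing this Tx ring
 * @ring: Tx ring to look up the queue for
 */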
static struct netdev_queue *txring_txq(const struct ice_tx_ring *ring)
{
	return netdev_get_tx_queue(ring->netdev, ring->q_index);
}

/**
 * ice_clean_tx_ring - Free any empty Tx buffers
 * @tx_ring: ring to be cleaned
 */
void ice_clean_tx_ring(struct ice_tx_ring *tx_ring)
{
	u32 size;
	u16 i;

	if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_pool) {
		ice_xsk_clean_xdp_ring(tx_ring);
		goto tx_skip_free;
	}

	/* ring already cleared, nothing to do */
	if (!tx_ring->tx_buf)
		return;

	/* Free all the Tx ring sk_buffs */
	for (i = 0; i < tx_ring->count; i++)
		ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]);

tx_skip_free:
	memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count);

	size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
		     PAGE_SIZE);
	/* Zero out the descriptor ring */
	memset(tx_ring->desc, 0, size);

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;

	if (!tx_ring->netdev)
		return;

	/* cleanup Tx queue statistics */
	netdev_tx_reset_queue(txring_txq(tx_ring));
}

/**
 * ice_free_tx_ring - Free Tx resources per queue
 * @tx_ring: Tx descriptor ring for a specific queue
 *
 * Free all transmit software resources
 */
void ice_free_tx_ring(struct ice_tx_ring *tx_ring)
{
	u32 size;

	ice_clean_tx_ring(tx_ring);
	devm_kfree(tx_ring->dev, tx_ring->tx_buf);
	tx_ring->tx_buf = NULL;

	if (tx_ring->desc) {
		size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
			     PAGE_SIZE);
		dmam_free_coherent(tx_ring->dev, size,
				   tx_ring->desc, tx_ring->dma);
		tx_ring->desc = NULL;
	}
}

/**
 * ice_clean_tx_irq - Reclaim resources after transmit completes
 * @tx_ring: Tx ring to clean
 * @napi_budget: Used to determine if we are in netpoll
 *
 * Returns true if there's any budget left (e.g. the clean is finished)
 */
static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
{
	unsigned int total_bytes = 0, total_pkts = 0;
	unsigned int budget = ICE_DFLT_IRQ_WORK;
	struct ice_vsi *vsi = tx_ring->vsi;
	s16 i = tx_ring->next_to_clean;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_buf *tx_buf;

	/* get the bql data ready */
	netdev_txq_bql_complete_prefetchw(txring_txq(tx_ring));

	tx_buf = &tx_ring->tx_buf[i];
	tx_desc = ICE_TX_DESC(tx_ring, i);
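	/* work with a ring-relative, signed index: it starts at
	 * next_to_clean - count, and hitting zero means the ring wrapped
	 */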
	i -= tx_ring->count;

	prefetch(&vsi->state);

	do {
		struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no work pending */
		if (!eop_desc)
			break;

		/* follow the guidelines of other drivers */
		prefetchw(&tx_buf->skb->users);

		smp_rmb();	/* prevent any other reads prior to eop_desc */

		ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
		/* if the descriptor isn't done, no work yet to do */
		if (!(eop_desc->cmd_type_offset_bsz &
		      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;

		/* update the statistics for this packet */
		total_bytes += tx_buf->bytecount;
		total_pkts += tx_buf->gso_segs;

		/* free the skb */
		napi_consume_skb(tx_buf->skb, napi_budget);

		/* unmap skb header data */
		dma_unmap_single(tx_ring->dev,
				 dma_unmap_addr(tx_buf, dma),
				 dma_unmap_len(tx_buf, len),
				 DMA_TO_DEVICE);

		/* clear tx_buf data */
		tx_buf->type = ICE_TX_BUF_EMPTY;
		dma_unmap_len_set(tx_buf, len, 0);

		/* unmap remaining buffers */
		while (tx_desc != eop_desc) {
			ice_trace(clean_tx_irq_unmap, tx_ring, tx_desc, tx_buf);
			tx_buf++;
			tx_desc++;
			i++;
			if (unlikely(!i)) {
				i -= tx_ring->count;
				tx_buf = tx_ring->tx_buf;
				tx_desc = ICE_TX_DESC(tx_ring, 0);
			}

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				dma_unmap_page(tx_ring->dev,
					       dma_unmap_addr(tx_buf, dma),
					       dma_unmap_len(tx_buf, len),
					       DMA_TO_DEVICE);
				dma_unmap_len_set(tx_buf, len, 0);
			}
		}
		ice_trace(clean_tx_irq_unmap_eop, tx_ring, tx_desc, tx_buf);

		/* move us one more past the eop_desc for start of next pkt */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_buf;
			tx_desc = ICE_TX_DESC(tx_ring, 0);
		}

		prefetch(tx_desc);

		/* update budget accounting */
		budget--;
	} while (likely(budget));

	i += tx_ring->count;
	tx_ring->next_to_clean = i;

	ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes);
	netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, total_bytes);
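
	/* if enough descriptors were just freed, re-wake a queue that was
	 * stopped earlier because the ring had filled up
	 */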
#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
	if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) &&
		     (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
		/* Make sure that anybody stopping the queue after this
		 * sees the new next_to_clean.
		 */
		smp_mb();
		if (netif_tx_queue_stopped(txring_txq(tx_ring)) &&
		    !test_bit(ICE_VSI_DOWN, vsi->state)) {
			netif_tx_wake_queue(txring_txq(tx_ring));
			++tx_ring->ring_stats->tx_stats.restart_q;
		}
	}

	return !!budget;
}

/**
 * ice_setup_tx_ring - Allocate the Tx descriptors
 * @tx_ring: the Tx ring to set up
 *
 * Return 0 on success, negative on error
 */
int ice_setup_tx_ring(struct ice_tx_ring *tx_ring)
{
	struct device *dev = tx_ring->dev;
	u32 size;

	if (!dev)
		return -ENOMEM;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(tx_ring->tx_buf);
	tx_ring->tx_buf =
		devm_kcalloc(dev, sizeof(*tx_ring->tx_buf), tx_ring->count,
			     GFP_KERNEL);
	if (!tx_ring->tx_buf)
		return -ENOMEM;

	/* round up to nearest page */
	size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
		     PAGE_SIZE);
	tx_ring->desc = dmam_alloc_coherent(dev, size, &tx_ring->dma,
					    GFP_KERNEL);
	if (!tx_ring->desc) {
		dev_err(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
			size);
		goto err;
	}

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;
	tx_ring->ring_stats->tx_stats.prev_pkt = -1;
	return 0;

err:
	devm_kfree(dev, tx_ring->tx_buf);
	tx_ring->tx_buf = NULL;
	return -ENOMEM;
}

/**
 * ice_clean_rx_ring - Free Rx buffers
 * @rx_ring: ring to be cleaned
 */
void ice_clean_rx_ring(struct ice_rx_ring *rx_ring)
{
	struct xdp_buff *xdp = &rx_ring->xdp;
	struct device *dev = rx_ring->dev;
	u32 size;
	u16 i;

	/* ring already cleared, nothing to do */
	if (!rx_ring->rx_buf)
		return;

	if (rx_ring->xsk_pool) {
		ice_xsk_clean_rx_ring(rx_ring);
		goto rx_skip_free;
	}

	if (xdp->data) {
		xdp_return_buff(xdp);
		xdp->data = NULL;
	}

	/* Free all the Rx ring sk_buffs */
	for (i = 0; i < rx_ring->count; i++) {
		struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];

		if (!rx_buf->page)
			continue;

		/* Invalidate cache lines that may have been written to by
		 * device so that we avoid corrupting memory.
		 */
		dma_sync_single_range_for_cpu(dev, rx_buf->dma,
					      rx_buf->page_offset,
					      rx_ring->rx_buf_len,
					      DMA_FROM_DEVICE);

		/* free resources associated with mapping */
		dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring),
				     DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
		__page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);

		rx_buf->page = NULL;
		rx_buf->page_offset = 0;
	}

rx_skip_free:
	if (rx_ring->xsk_pool)
		memset(rx_ring->xdp_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->xdp_buf)));
	else
		memset(rx_ring->rx_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->rx_buf)));

	/* Zero out the descriptor ring */
	size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
		     PAGE_SIZE);
	memset(rx_ring->desc, 0, size);

	rx_ring->next_to_alloc = 0;
	rx_ring->next_to_clean = 0;
	rx_ring->first_desc = 0;
	rx_ring->next_to_use = 0;
}

/**
 * ice_free_rx_ring - Free Rx resources
 * @rx_ring: ring to clean the resources from
 *
 * Free all receive software resources
 */
void ice_free_rx_ring(struct ice_rx_ring *rx_ring)
{
	u32 size;

	ice_clean_rx_ring(rx_ring);
	if (rx_ring->vsi->type == ICE_VSI_PF)
		if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
			xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
	rx_ring->xdp_prog = NULL;
	if (rx_ring->xsk_pool) {
		kfree(rx_ring->xdp_buf);
		rx_ring->xdp_buf = NULL;
	} else {
		kfree(rx_ring->rx_buf);
		rx_ring->rx_buf = NULL;
	}

	if (rx_ring->desc) {
		size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
			     PAGE_SIZE);
		dmam_free_coherent(rx_ring->dev, size,
				   rx_ring->desc, rx_ring->dma);
		rx_ring->desc = NULL;
	}
}

/**
 * ice_setup_rx_ring - Allocate the Rx descriptors
 * @rx_ring: the Rx ring to set up
 *
 * Return 0 on success, negative on error
 */
int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
{
	struct device *dev = rx_ring->dev;
	u32 size;

	if (!dev)
		return -ENOMEM;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(rx_ring->rx_buf);
	rx_ring->rx_buf =
		kcalloc(rx_ring->count, sizeof(*rx_ring->rx_buf), GFP_KERNEL);
	if (!rx_ring->rx_buf)
		return -ENOMEM;

	/* round up to nearest page */
	size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
		     PAGE_SIZE);
	rx_ring->desc = dmam_alloc_coherent(dev, size, &rx_ring->dma,
					    GFP_KERNEL);
	if (!rx_ring->desc) {
		dev_err(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
			size);
		goto err;
	}

	rx_ring->next_to_use = 0;
	rx_ring->next_to_clean = 0;
	rx_ring->first_desc = 0;

	if (ice_is_xdp_ena_vsi(rx_ring->vsi))
		WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);

	return 0;

err:
	kfree(rx_ring->rx_buf);
	rx_ring->rx_buf = NULL;
	return -ENOMEM;
}

/**
 * ice_rx_frame_truesize - Calculate the truesize of an Rx frame
 * @rx_ring: ptr to Rx ring
 * @size: size of the Rx data
 *
 * calculate the truesize while taking into account the PAGE_SIZE of the
 * underlying arch
 */
static unsigned int
ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, const unsigned int size)
{
	unsigned int truesize;

#if (PAGE_SIZE < 8192)
	truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
#else
	truesize = rx_ring->rx_offset ?
		SKB_DATA_ALIGN(rx_ring->rx_offset + size) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
		SKB_DATA_ALIGN(size);
#endif
	return truesize;
}

/**
 * ice_run_xdp - Executes an XDP program on initialized xdp_buff
 * @rx_ring: Rx ring
 * @xdp: xdp_buff used as input to the XDP program
 * @xdp_prog: XDP program to run
 * @xdp_ring: ring to be used for XDP_TX action
 * @rx_buf: Rx buffer to store the XDP action
 * @eop_desc: Last descriptor in packet to read metadata from
 *
 * The resulting verdict (one of ICE_XDP_{PASS, CONSUMED, TX, REDIR}) is not
 * returned but stored in the Rx buffers via ice_set_rx_bufs_act().
 */
static void
ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
	    struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
	    struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc)
{
	unsigned int ret = ICE_XDP_PASS;
	u32 act;

	if (!xdp_prog)
		goto exit;

	ice_xdp_meta_set_desc(xdp, eop_desc);

	act = bpf_prog_run_xdp(xdp_prog, xdp);
	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		if (static_branch_unlikely(&ice_xdp_locking_key))
			spin_lock(&xdp_ring->tx_lock);
		ret = __ice_xmit_xdp_ring(xdp, xdp_ring, false);
		if (static_branch_unlikely(&ice_xdp_locking_key))
			spin_unlock(&xdp_ring->tx_lock);
		if (ret == ICE_XDP_CONSUMED)
			goto out_failure;
		break;
	case XDP_REDIRECT:
		if (xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))
			goto out_failure;
		ret = ICE_XDP_REDIR;
		break;
	default:
		bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
out_failure:
		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		ret = ICE_XDP_CONSUMED;
	}
exit:
	ice_set_rx_bufs_act(xdp, rx_ring, ret);
}
|
|
|
|
|
2023-02-10 18:06:18 +01:00
|
|
|
/**
|
|
|
|
* ice_xmit_xdp_ring - submit frame to XDP ring for transmission
|
|
|
|
* @xdpf: XDP frame that will be converted to XDP buff
|
|
|
|
* @xdp_ring: XDP ring for transmission
|
|
|
|
*/
|
|
|
|
static int ice_xmit_xdp_ring(const struct xdp_frame *xdpf,
|
|
|
|
struct ice_tx_ring *xdp_ring)
|
|
|
|
{
|
|
|
|
struct xdp_buff xdp;
|
|
|
|
|
|
|
|
xdp.data_hard_start = (void *)xdpf;
|
|
|
|
xdp.data = xdpf->data;
|
|
|
|
xdp.data_end = xdp.data + xdpf->len;
|
|
|
|
xdp.frame_sz = xdpf->frame_sz;
|
|
|
|
xdp.flags = xdpf->flags;
|
|
|
|
|
|
|
|
return __ice_xmit_xdp_ring(&xdp, xdp_ring, true);
|
|
|
|
}
|
|
|
|
|
2019-11-04 09:38:56 -08:00
|
|
|
/**
|
|
|
|
* ice_xdp_xmit - submit packets to XDP ring for transmission
|
|
|
|
* @dev: netdev
|
|
|
|
* @n: number of XDP frames to be transmitted
|
|
|
|
* @frames: XDP frames to be transmitted
|
|
|
|
* @flags: transmit flags
|
|
|
|
*
|
2021-03-08 12:06:58 +01:00
|
|
|
* Returns number of frames successfully sent. Failed frames
|
|
|
|
* will be freed by the XDP core.
|
2019-11-04 09:38:56 -08:00
|
|
|
* For error cases, a negative errno code is returned and no frames
|
|
|
|
* are transmitted (caller must handle freeing frames).
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
|
|
|
|
u32 flags)
|
|
|
|
{
|
|
|
|
struct ice_netdev_priv *np = netdev_priv(dev);
|
|
|
|
unsigned int queue_index = smp_processor_id();
|
|
|
|
struct ice_vsi *vsi = np->vsi;
|
2021-08-19 13:59:58 +02:00
|
|
|
struct ice_tx_ring *xdp_ring;
|
2023-01-31 21:45:04 +01:00
|
|
|
struct ice_tx_buf *tx_buf;
|
2021-03-08 12:06:58 +01:00
|
|
|
int nxmit = 0, i;
|
2019-11-04 09:38:56 -08:00
|
|
|
|
2021-03-02 10:15:37 -08:00
|
|
|
if (test_bit(ICE_VSI_DOWN, vsi->state))
|
2019-11-04 09:38:56 -08:00
|
|
|
return -ENETDOWN;
|
|
|
|
|
2022-09-19 15:43:46 +02:00
|
|
|
if (!ice_is_xdp_ena_vsi(vsi))
|
2019-11-04 09:38:56 -08:00
|
|
|
return -ENXIO;
|
|
|
|
|
|
|
|
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2021-08-19 14:00:03 +02:00
|
|
|
if (static_branch_unlikely(&ice_xdp_locking_key)) {
|
|
|
|
queue_index %= vsi->num_xdp_txq;
|
|
|
|
xdp_ring = vsi->xdp_rings[queue_index];
|
|
|
|
spin_lock(&xdp_ring->tx_lock);
|
|
|
|
} else {
|
2022-09-19 15:43:46 +02:00
|
|
|
/* Generally, should not happen */
|
|
|
|
if (unlikely(queue_index >= vsi->num_xdp_txq))
|
|
|
|
return -ENXIO;
|
2021-08-19 14:00:03 +02:00
|
|
|
xdp_ring = vsi->xdp_rings[queue_index];
|
|
|
|
}
|
|
|
|
|
2023-01-31 21:45:04 +01:00
|
|
|
tx_buf = &xdp_ring->tx_buf[xdp_ring->next_to_use];
|
2019-11-04 09:38:56 -08:00
|
|
|
for (i = 0; i < n; i++) {
|
2023-02-10 18:06:18 +01:00
|
|
|
const struct xdp_frame *xdpf = frames[i];
|
2019-11-04 09:38:56 -08:00
|
|
|
int err;
|
|
|
|
|
2023-01-31 21:45:04 +01:00
|
|
|
err = ice_xmit_xdp_ring(xdpf, xdp_ring);
|
2021-03-08 12:06:58 +01:00
|
|
|
if (err != ICE_XDP_TX)
|
|
|
|
break;
|
|
|
|
nxmit++;
|
2019-11-04 09:38:56 -08:00
|
|
|
}
|
|
|
|
|
2023-01-31 21:45:04 +01:00
|
|
|
tx_buf->rs_idx = ice_set_rs_bit(xdp_ring);
|
2019-11-04 09:38:56 -08:00
|
|
|
if (unlikely(flags & XDP_XMIT_FLUSH))
|
|
|
|
ice_xdp_ring_update_tail(xdp_ring);
|
|
|
|
|
2021-08-19 14:00:03 +02:00
|
|
|
if (static_branch_unlikely(&ice_xdp_locking_key))
|
|
|
|
spin_unlock(&xdp_ring->tx_lock);
|
|
|
|
|
2021-03-08 12:06:58 +01:00
|
|
|
return nxmit;
|
2019-11-04 09:38:56 -08:00
|
|
|
}
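For illustration, a hedged sketch of how a caller could honor the return
convention documented above (this is not the actual XDP core code; the
helper name is made up):

/* Hypothetical example: send a batch and return any unsent frames.
 * A negative return means nothing was transmitted, so every frame
 * still belongs to the caller; otherwise only frames[nxmit..n-1]
 * need to be handed back to the XDP memory model.
 */
static void example_xmit_and_reclaim(struct net_device *dev,
				     struct xdp_frame **frames, int n)
{
	int sent = ice_xdp_xmit(dev, n, frames, XDP_XMIT_FLUSH);
	int i;

	if (sent < 0)
		sent = 0;
	for (i = sent; i < n; i++)
		xdp_return_frame(frames[i]);
}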
|
|
|
|
|
2018-03-20 07:58:13 -07:00
|
|
|
/**
|
|
|
|
* ice_alloc_mapped_page - recycle or make a new page
|
|
|
|
* @rx_ring: ring to use
|
|
|
|
* @bi: rx_buf struct to modify
|
|
|
|
*
|
|
|
|
* Returns true if the page was successfully allocated or
|
|
|
|
* reused.
|
|
|
|
*/
|
2019-02-26 16:35:11 -08:00
|
|
|
static bool
|
2021-08-19 13:59:58 +02:00
|
|
|
ice_alloc_mapped_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *bi)
|
2018-03-20 07:58:13 -07:00
|
|
|
{
|
|
|
|
struct page *page = bi->page;
|
|
|
|
dma_addr_t dma;
|
|
|
|
|
|
|
|
/* since we are recycling buffers we should seldom need to alloc */
|
2020-07-29 17:19:22 -07:00
|
|
|
if (likely(page))
|
2018-03-20 07:58:13 -07:00
|
|
|
return true;
|
|
|
|
|
|
|
|
/* alloc new page for storage */
|
2019-10-24 01:11:22 -07:00
|
|
|
page = dev_alloc_pages(ice_rx_pg_order(rx_ring));
|
2018-03-20 07:58:14 -07:00
|
|
|
if (unlikely(!page)) {
|
ice: Accumulate ring statistics over reset
Resets may occur with or without user interaction. For example, a TX hang
or reconfiguration of parameters will result in a reset. During reset, the
VSI is freed, freeing any statistics structures inside as well. This would
create an issue for the user where a reset happens in the background, the
statistics are set to zero, and the user then checks ring statistics
expecting them to be populated.
To ensure this doesn't happen, accumulate ring statistics over reset.
Define a new ring statistics structure, ice_ring_stats. The new structure
lives in the VSI's parent, preserving ring statistics when VSI is freed.
1. Define a new structure vsi_ring_stats in the PF scope
2. Allocate/free stats only during probe, unload, or change in ring size
3. Replace previous ring statistics functionality with new structure
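A rough sketch of the idea (field names here are illustrative; the real
layout lives in the driver headers):

struct ice_ring_stats {
	struct rcu_head rcu;	/* freed deferred, so a ring in teardown
				 * can still safely reference it
				 */
	struct ice_q_stats stats;
	union {
		struct ice_txq_stats tx_stats;
		struct ice_rxq_stats rx_stats;
	};
};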
Signed-off-by: Benjamin Mikailenko <benjamin.mikailenko@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-11-18 16:20:02 -05:00
|
|
|
rx_ring->ring_stats->rx_stats.alloc_page_failed++;
|
2018-03-20 07:58:13 -07:00
|
|
|
return false;
|
2018-03-20 07:58:14 -07:00
|
|
|
}
|
2018-03-20 07:58:13 -07:00
|
|
|
|
|
|
|
/* map page for use */
|
2019-10-24 01:11:22 -07:00
|
|
|
dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring),
|
2019-02-13 10:51:07 -08:00
|
|
|
DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
|
2018-03-20 07:58:13 -07:00
|
|
|
|
|
|
|
/* if mapping failed free memory back to system since
|
|
|
|
* there isn't much point in holding memory we can't use
|
|
|
|
*/
|
|
|
|
if (dma_mapping_error(rx_ring->dev, dma)) {
|
2019-10-24 01:11:22 -07:00
|
|
|
__free_pages(page, ice_rx_pg_order(rx_ring));
|
ice: Accumulate ring statistics over reset
2022-11-18 16:20:02 -05:00
|
|
|
rx_ring->ring_stats->rx_stats.alloc_page_failed++;
|
2018-03-20 07:58:13 -07:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bi->dma = dma;
|
|
|
|
bi->page = page;
|
2021-01-18 16:13:17 +01:00
|
|
|
bi->page_offset = rx_ring->rx_offset;
|
2019-02-13 10:51:04 -08:00
|
|
|
page_ref_add(page, USHRT_MAX - 1);
|
|
|
|
bi->pagecnt_bias = USHRT_MAX;
|
2018-03-20 07:58:13 -07:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ice_alloc_rx_bufs - Replace used receive buffers
|
|
|
|
* @rx_ring: ring to place buffers on
|
|
|
|
* @cleaned_count: number of buffers to replace
|
|
|
|
*
|
2019-06-26 02:20:19 -07:00
|
|
|
* Returns false if all allocations were successful, true if any fail. Returning
|
|
|
|
* true signals to the caller that we didn't replace cleaned_count buffers and
|
|
|
|
* there is more work to do.
|
|
|
|
*
|
|
|
|
* First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx
|
|
|
|
* buffers. Then bump tail at most one time. Grouping like this lets us avoid
|
|
|
|
* multiple tail writes per call.
|
2018-03-20 07:58:13 -07:00
|
|
|
*/
|
ice: Add support for XDP multi-buffer on Rx side
Ice driver needs to be a bit reworked on Rx data path in order to
support multi-buffer XDP. For skb path, it currently works in a way that
Rx ring carries pointer to skb so if driver didn't manage to combine
fragmented frame at current NAPI instance, it can restore the state on
next instance and keep looking for last fragment (so descriptor with EOP
bit set). What needs to be achieved is that xdp_buff needs to be
combined in such way (linear + frags part) in the first place. Then skb
will be ready to go in case of XDP_PASS or BPF program being not present
on interface. If BPF program is there, it would work on multi-buffer
XDP. At this point xdp_buff resides directly on Rx ring, so given the
fact that skb will be built straight from xdp_buff, there will be no
further need to carry skb on Rx ring.
Besides removing skb pointer from Rx ring, lots of members have been
moved around within ice_rx_ring. First and foremost reason was to place
rx_buf with xdp_buff on the same cacheline. This means that once we
touch rx_buf (which is a preceding step before touching xdp_buff),
xdp_buff will already be hot in cache. Second thing was that xdp_rxq is
used rather rarely and it occupies a separate cacheline, so maybe it is
better to have it at the end of ice_rx_ring.
Other change that affects ice_rx_ring is the introduction of
ice_rx_ring::first_desc. Its purpose is twofold - first is to propagate
rx_buf->act to all the parts of current xdp_buff after running XDP
program, so that ice_put_rx_buf() that got moved out of the main Rx
processing loop will be able to take an appropriate action on each
buffer. Second is for ice_construct_skb().
ice_construct_skb() has a copybreak mechanism which has an explicit
impact on the xdp_buff->skb conversion in the new approach when the
legacy Rx flag is toggled. It works in such a way that the linear part
is 256 bytes long; if the frame is bigger than that, the remaining bytes
go as a frag to skb_shared_info.
This means while memcpying frags from xdp_buff to newly allocated skb,
care needs to be taken when picking the destination frag array entry.
By the time ice_construct_skb() is called for a fragmented frame, the
current rx_buf points to the *last* fragment, but copybreak needs to be
done against the first one. That's where
ice_rx_ring::first_desc helps.
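Roughly, the copybreak path could look like this (an illustrative
sketch, not the exact driver code, assuming skb, size, rx_buf and xdp
come from the surrounding Rx path and ICE_RX_HDR_SIZE is the 256-byte
copybreak limit):

	headlen = size;
	if (headlen > ICE_RX_HDR_SIZE)
		headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);
	/* copy the (at most 256 byte) linear part into the new skb */
	memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, sizeof(long)));
	/* attach the remainder as a page frag, taken from the first
	 * buffer of the frame rather than the last one
	 */
	if (size > headlen)
		skb_add_rx_frag(skb, 0, rx_buf->page,
				rx_buf->page_offset + headlen, size - headlen,
				xdp->frame_sz);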
When frame building spans across NAPI polls (the DD bit is not set on
the current descriptor and xdp->data is not NULL), the current Rx buffer
handling state might run into problems.
Since calls to ice_put_rx_buf() were pulled out of the main Rx
processing loop and were scoped from cached_ntc to the current ntc,
remember that the mentioned function relies on rx_buf->act, which is set
within ice_run_xdp(). ice_run_xdp() is called when the EOP bit is found,
so currently we could put an Rx buffer with rx_buf->act being
*uninitialized*. To address this, change the scoping to rely on
first_desc on both boundaries instead.
This also implies that cleaned_count, which is used as an input to
ice_alloc_rx_buffers() and tells how many new buffers should be
refilled, has to be adjusted. If it stayed as is, what could happen is a
case where ntc would go over ntu.
Therefore, remove cleaned_count altogether and, in the allocation
routine, use the newly introduced ICE_RX_DESC_UNUSED() macro, which is
the Rx-side equivalent of ICE_DESC_UNUSED() and is based on
struct ice_rx_ring::first_desc instead of next_to_clean.
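Such a macro could be expected to look along these lines (a sketch based
on the description above, not necessarily the exact definition in the
driver headers):

#define ICE_RX_DESC_UNUSED(R)	\
	((((R)->first_desc > (R)->next_to_use) ? 0 : (R)->count) + \
	 (R)->first_desc - (R)->next_to_use - 1)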
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Link: https://lore.kernel.org/bpf/20230131204506.219292-11-maciej.fijalkowski@intel.com
2023-01-31 21:45:03 +01:00
|
|
|
bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count)
|
2018-03-20 07:58:13 -07:00
|
|
|
{
|
|
|
|
union ice_32b_rx_flex_desc *rx_desc;
|
|
|
|
u16 ntu = rx_ring->next_to_use;
|
|
|
|
struct ice_rx_buf *bi;
|
|
|
|
|
|
|
|
/* do nothing if no valid netdev defined */
|
2020-05-11 18:01:40 -07:00
|
|
|
if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) ||
|
|
|
|
!cleaned_count)
|
2018-03-20 07:58:13 -07:00
|
|
|
return false;
|
|
|
|
|
2019-02-19 15:04:13 -08:00
|
|
|
/* get the Rx descriptor and buffer based on next_to_use */
|
2018-03-20 07:58:13 -07:00
|
|
|
rx_desc = ICE_RX_DESC(rx_ring, ntu);
|
|
|
|
bi = &rx_ring->rx_buf[ntu];
|
|
|
|
|
|
|
|
do {
|
2019-06-26 02:20:23 -07:00
|
|
|
/* if we fail here, we have work remaining */
|
2018-03-20 07:58:13 -07:00
|
|
|
if (!ice_alloc_mapped_page(rx_ring, bi))
|
2019-06-26 02:20:23 -07:00
|
|
|
break;
|
2018-03-20 07:58:13 -07:00
|
|
|
|
2019-02-13 10:51:07 -08:00
|
|
|
/* sync the buffer for use by the device */
|
|
|
|
dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
|
|
|
|
bi->page_offset,
|
2019-10-24 01:11:22 -07:00
|
|
|
rx_ring->rx_buf_len,
|
2019-02-13 10:51:07 -08:00
|
|
|
DMA_FROM_DEVICE);
|
|
|
|
|
2018-03-20 07:58:13 -07:00
|
|
|
/* Refresh the desc even if buffer_addrs didn't change
|
|
|
|
* because each write-back erases this info.
|
|
|
|
*/
|
|
|
|
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
|
|
|
|
|
|
|
|
rx_desc++;
|
|
|
|
bi++;
|
|
|
|
ntu++;
|
|
|
|
if (unlikely(ntu == rx_ring->count)) {
|
|
|
|
rx_desc = ICE_RX_DESC(rx_ring, 0);
|
|
|
|
bi = rx_ring->rx_buf;
|
|
|
|
ntu = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* clear the status bits for the next_to_use descriptor */
|
|
|
|
rx_desc->wb.status_error0 = 0;
|
|
|
|
|
|
|
|
cleaned_count--;
|
|
|
|
} while (cleaned_count);
|
|
|
|
|
|
|
|
if (rx_ring->next_to_use != ntu)
|
|
|
|
ice_release_rx_desc(rx_ring, ntu);
|
|
|
|
|
2019-06-26 02:20:23 -07:00
|
|
|
return !!cleaned_count;
|
2018-03-20 07:58:13 -07:00
|
|
|
}
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
/**
|
2019-02-13 10:51:05 -08:00
|
|
|
* ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse
|
|
|
|
* @rx_buf: Rx buffer to adjust
|
|
|
|
* @size: Size of adjustment
|
2018-03-20 07:58:14 -07:00
|
|
|
*
|
2019-02-13 10:51:05 -08:00
|
|
|
* Update the offset within page so that Rx buf will be ready to be reused.
|
|
|
|
* For systems with PAGE_SIZE < 8192 this function will flip the page offset
|
|
|
|
* so the second half of page assigned to Rx buffer will be used, otherwise
|
2020-02-06 01:20:13 -08:00
|
|
|
* the offset is moved by "size" bytes
|
2018-03-20 07:58:14 -07:00
|
|
|
*/
|
2019-02-13 10:51:05 -08:00
|
|
|
static void
|
|
|
|
ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
|
|
|
#if (PAGE_SIZE < 8192)
|
2019-02-13 10:51:05 -08:00
|
|
|
/* flip page offset to other buffer */
|
|
|
|
rx_buf->page_offset ^= size;
|
2018-03-20 07:58:14 -07:00
|
|
|
#else
|
2019-02-13 10:51:05 -08:00
|
|
|
/* move offset up to the next cache line */
|
|
|
|
rx_buf->page_offset += size;
|
|
|
|
#endif
|
|
|
|
}
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-13 10:51:02 -08:00
|
|
|
/**
|
|
|
|
* ice_can_reuse_rx_page - Determine if page can be reused for another Rx
|
|
|
|
* @rx_buf: buffer containing the page
|
|
|
|
*
|
|
|
|
* If page is reusable, we have a green light for calling ice_reuse_rx_page,
|
|
|
|
* which will assign the current buffer to the buffer that next_to_alloc is
|
|
|
|
* pointing to; otherwise, the DMA mapping needs to be destroyed and
|
|
|
|
* page freed
|
|
|
|
*/
|
2020-08-25 19:27:36 +02:00
|
|
|
static bool
|
2023-01-31 21:44:56 +01:00
|
|
|
ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf)
|
2019-02-13 10:51:02 -08:00
|
|
|
{
|
2019-02-13 10:51:04 -08:00
|
|
|
unsigned int pagecnt_bias = rx_buf->pagecnt_bias;
|
2019-02-13 10:51:02 -08:00
|
|
|
struct page *page = rx_buf->page;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2021-02-02 13:31:35 +00:00
|
|
|
/* avoid re-using remote and pfmemalloc pages */
|
|
|
|
if (!dev_page_is_reusable(page))
|
2018-03-20 07:58:14 -07:00
|
|
|
return false;
|
|
|
|
|
|
|
|
#if (PAGE_SIZE < 8192)
|
|
|
|
/* if we are only owner of page we can reuse it */
|
2023-01-31 21:44:56 +01:00
|
|
|
if (unlikely(rx_buf->pgcnt - pagecnt_bias > 1))
|
2018-03-20 07:58:14 -07:00
|
|
|
return false;
|
|
|
|
#else
|
2019-10-24 01:11:22 -07:00
|
|
|
#define ICE_LAST_OFFSET \
|
|
|
|
(SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048)
|
|
|
|
if (rx_buf->page_offset > ICE_LAST_OFFSET)
|
2018-03-20 07:58:14 -07:00
|
|
|
return false;
|
|
|
|
#endif /* PAGE_SIZE < 8192) */
|
|
|
|
|
2019-02-13 10:51:04 -08:00
|
|
|
/* If we have drained the page fragment pool we need to update
|
|
|
|
* the pagecnt_bias and page count so that we fully restock the
|
|
|
|
* number of references the driver holds.
|
2018-03-20 07:58:14 -07:00
|
|
|
*/
|
2019-02-13 10:51:04 -08:00
|
|
|
if (unlikely(pagecnt_bias == 1)) {
|
|
|
|
page_ref_add(page, USHRT_MAX - 1);
|
|
|
|
rx_buf->pagecnt_bias = USHRT_MAX;
|
|
|
|
}
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
* ice_add_xdp_frag - Add contents of Rx buffer to xdp buf as a frag
|
2019-10-24 01:11:22 -07:00
|
|
|
* @rx_ring: Rx descriptor ring to transact packets on
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
* @xdp: xdp buff to place the data into
|
2018-03-20 07:58:14 -07:00
|
|
|
* @rx_buf: buffer containing page to add
|
2019-02-13 10:51:06 -08:00
|
|
|
* @size: packet length from rx_desc
|
2018-03-20 07:58:14 -07:00
|
|
|
*
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
* This function will add the data contained in rx_buf->page to the xdp buf.
|
|
|
|
* It will just attach the page as a frag.
|
2018-03-20 07:58:14 -07:00
|
|
|
*/
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
static int
|
|
|
|
ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
|
|
|
|
struct ice_rx_buf *rx_buf, const unsigned int size)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
|
|
|
|
|
2019-07-25 01:55:34 -07:00
|
|
|
if (!size)
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!xdp_buff_has_frags(xdp)) {
|
|
|
|
sinfo->nr_frags = 0;
|
|
|
|
sinfo->xdp_frags_size = 0;
|
|
|
|
xdp_buff_set_frags_flag(xdp);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
|
2024-01-24 20:15:55 +01:00
|
|
|
ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
|
|
|
|
rx_buf->page_offset, size);
|
|
|
|
sinfo->xdp_frags_size += size;
|
2024-01-24 20:15:55 +01:00
|
|
|
/* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
|
|
|
|
* can pop off frags but driver has to handle it on its own
|
|
|
|
*/
|
|
|
|
rx_ring->nr_frags = sinfo->nr_frags;
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
|
|
|
|
if (page_is_pfmemalloc(rx_buf->page))
|
|
|
|
xdp_buff_set_frag_pfmemalloc(xdp);
|
|
|
|
|
|
|
|
return 0;
|
2018-03-20 07:58:14 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ice_reuse_rx_page - page flip buffer and store it back on the ring
|
2018-10-26 11:44:47 -07:00
|
|
|
* @rx_ring: Rx descriptor ring to store buffers on
|
2018-03-20 07:58:14 -07:00
|
|
|
* @old_buf: donor buffer to have page reused
|
|
|
|
*
|
|
|
|
* Synchronizes page for reuse by the adapter
|
|
|
|
*/
|
2019-02-26 16:35:11 -08:00
|
|
|
static void
|
2021-08-19 13:59:58 +02:00
|
|
|
ice_reuse_rx_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *old_buf)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
|
|
|
u16 nta = rx_ring->next_to_alloc;
|
|
|
|
struct ice_rx_buf *new_buf;
|
|
|
|
|
|
|
|
new_buf = &rx_ring->rx_buf[nta];
|
|
|
|
|
|
|
|
/* update, and store next to alloc */
|
|
|
|
nta++;
|
|
|
|
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
|
|
|
|
|
2019-02-13 10:51:06 -08:00
|
|
|
/* Transfer page from old buffer to new buffer.
|
|
|
|
* Move each member individually to avoid possible store
|
|
|
|
* forwarding stalls and unnecessary copy of skb.
|
|
|
|
*/
|
|
|
|
new_buf->dma = old_buf->dma;
|
|
|
|
new_buf->page = old_buf->page;
|
|
|
|
new_buf->page_offset = old_buf->page_offset;
|
|
|
|
new_buf->pagecnt_bias = old_buf->pagecnt_bias;
|
2018-03-20 07:58:14 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2019-02-13 10:51:01 -08:00
|
|
|
* ice_get_rx_buf - Fetch Rx buffer and synchronize data for use
|
2018-10-26 11:44:47 -07:00
|
|
|
* @rx_ring: Rx descriptor ring to transact packets on
|
2019-02-13 10:51:01 -08:00
|
|
|
* @size: size of buffer to add to skb
|
2023-03-13 13:36:07 -07:00
|
|
|
* @ntc: index of next to clean element
|
2018-03-20 07:58:14 -07:00
|
|
|
*
|
2019-02-13 10:51:01 -08:00
|
|
|
* This function will pull an Rx buffer from the ring and synchronize it
|
|
|
|
* for use by the CPU.
|
2018-03-20 07:58:14 -07:00
|
|
|
*/
|
2019-02-13 10:51:01 -08:00
|
|
|
static struct ice_rx_buf *
|
2023-01-31 21:44:57 +01:00
|
|
|
ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
|
|
|
|
const unsigned int ntc)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
|
|
|
struct ice_rx_buf *rx_buf;
|
|
|
|
|
2023-01-31 21:44:57 +01:00
|
|
|
rx_buf = &rx_ring->rx_buf[ntc];
|
2023-01-31 21:44:56 +01:00
|
|
|
rx_buf->pgcnt =
|
2020-08-25 19:27:36 +02:00
|
|
|
#if (PAGE_SIZE < 8192)
|
|
|
|
page_count(rx_buf->page);
|
|
|
|
#else
|
|
|
|
0;
|
|
|
|
#endif
|
2019-02-13 10:51:01 -08:00
|
|
|
prefetchw(rx_buf->page);
|
|
|
|
|
2019-07-25 01:55:34 -07:00
|
|
|
if (!size)
|
|
|
|
return rx_buf;
|
2019-02-13 10:51:01 -08:00
|
|
|
/* we are reusing so sync this buffer for CPU use */
|
|
|
|
dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
|
|
|
|
rx_buf->page_offset, size,
|
|
|
|
DMA_FROM_DEVICE);
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-13 10:51:04 -08:00
|
|
|
/* We have pulled a buffer for use, so decrement pagecnt_bias */
|
|
|
|
rx_buf->pagecnt_bias--;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-13 10:51:01 -08:00
|
|
|
return rx_buf;
|
|
|
|
}
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-10-24 01:11:24 -07:00
|
|
|
/**
|
|
|
|
* ice_build_skb - Build skb around an existing buffer
|
|
|
|
* @rx_ring: Rx descriptor ring to transact packets on
|
|
|
|
* @xdp: xdp_buff pointing to the data
|
|
|
|
*
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
* This function builds an skb around an existing XDP buffer, taking care
|
|
|
|
* to set up the skb correctly and avoid any memcpy overhead. Driver has
|
|
|
|
* already combined frags (if any) to skb_shared_info.
|
2019-10-24 01:11:24 -07:00
|
|
|
*/
|
|
|
|
static struct sk_buff *
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
ice_build_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
|
2019-10-24 01:11:24 -07:00
|
|
|
{
|
2020-05-07 17:41:05 -07:00
|
|
|
u8 metasize = xdp->data - xdp->data_meta;
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
struct skb_shared_info *sinfo = NULL;
|
|
|
|
unsigned int nr_frags;
|
2019-10-24 01:11:24 -07:00
|
|
|
struct sk_buff *skb;
|
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
if (unlikely(xdp_buff_has_frags(xdp))) {
|
|
|
|
sinfo = xdp_get_shared_info_from_buff(xdp);
|
|
|
|
nr_frags = sinfo->nr_frags;
|
|
|
|
}
|
|
|
|
|
2019-10-24 01:11:24 -07:00
|
|
|
/* Prefetch first cache line of first page. If xdp->data_meta
|
|
|
|
* is unused, this points exactly to xdp->data, otherwise we
|
|
|
|
* likely have a consumer accessing first few bytes of meta
|
|
|
|
* data, and then actual data.
|
|
|
|
*/
|
2020-08-26 15:54:16 +03:00
|
|
|
net_prefetch(xdp->data_meta);
|
2019-10-24 01:11:24 -07:00
|
|
|
/* build an skb around the page buffer */
|
2023-01-31 21:45:02 +01:00
|
|
|
skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
|
2019-10-24 01:11:24 -07:00
|
|
|
if (unlikely(!skb))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* must record the Rx queue, otherwise OS features such as
|
|
|
|
* symmetric queue won't work
|
|
|
|
*/
|
|
|
|
skb_record_rx_queue(skb, rx_ring->q_index);
|
|
|
|
|
|
|
|
/* update pointers within the skb to store the data */
|
|
|
|
skb_reserve(skb, xdp->data - xdp->data_hard_start);
|
|
|
|
__skb_put(skb, xdp->data_end - xdp->data);
|
|
|
|
if (metasize)
|
|
|
|
skb_metadata_set(skb, metasize);
|
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
if (unlikely(xdp_buff_has_frags(xdp)))
|
|
|
|
xdp_update_skb_shared_info(skb, nr_frags,
|
|
|
|
sinfo->xdp_frags_size,
|
|
|
|
nr_frags * xdp->frame_sz,
|
|
|
|
xdp_buff_is_frag_pfmemalloc(xdp));
|
|
|
|
|
2019-10-24 01:11:24 -07:00
|
|
|
return skb;
|
|
|
|
}
|
|
|
|
|
2018-03-20 07:58:14 -07:00
|
|
|
/**
|
2019-02-13 10:51:06 -08:00
|
|
|
* ice_construct_skb - Allocate skb and populate it
|
2018-10-26 11:44:47 -07:00
|
|
|
* @rx_ring: Rx descriptor ring to transact packets on
|
2019-11-04 09:38:56 -08:00
|
|
|
* @xdp: xdp_buff pointing to the data
|
2018-03-20 07:58:14 -07:00
|
|
|
*
|
2019-02-13 10:51:06 -08:00
|
|
|
* This function allocates an skb. It then populates it with the page
|
|
|
|
* data from the current receive descriptor, taking care to set up the
|
|
|
|
* skb correctly.
|
2018-03-20 07:58:14 -07:00
|
|
|
*/
|
2019-02-26 16:35:11 -08:00
|
|
|
static struct sk_buff *
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
2019-11-04 09:38:56 -08:00
|
|
|
unsigned int size = xdp->data_end - xdp->data;
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
struct skb_shared_info *sinfo = NULL;
|
|
|
|
struct ice_rx_buf *rx_buf;
|
|
|
|
unsigned int nr_frags = 0;
|
2019-02-13 10:51:06 -08:00
|
|
|
unsigned int headlen;
|
|
|
|
struct sk_buff *skb;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-13 10:51:06 -08:00
|
|
|
/* prefetch first cache line of first page */
|
2023-01-31 21:44:54 +01:00
|
|
|
net_prefetch(xdp->data);
|
2018-03-20 07:58:14 -07:00
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
if (unlikely(xdp_buff_has_frags(xdp))) {
|
|
|
|
sinfo = xdp_get_shared_info_from_buff(xdp);
|
|
|
|
nr_frags = sinfo->nr_frags;
|
|
|
|
}
|
|
|
|
|
2019-02-13 10:51:06 -08:00
|
|
|
/* allocate a skb to store the frags */
|
2024-03-26 21:02:12 -07:00
|
|
|
skb = napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE);
|
2019-02-13 10:51:06 -08:00
|
|
|
if (unlikely(!skb))
|
|
|
|
return NULL;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
rx_buf = &rx_ring->rx_buf[rx_ring->first_desc];
|
2019-02-13 10:51:06 -08:00
|
|
|
skb_record_rx_queue(skb, rx_ring->q_index);
|
|
|
|
/* Determine available headroom for copy */
|
|
|
|
headlen = size;
|
|
|
|
if (headlen > ICE_RX_HDR_SIZE)
|
2019-11-04 09:38:56 -08:00
|
|
|
headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-13 10:51:06 -08:00
|
|
|
/* align pull length to size of long to optimize memcpy performance */
|
2023-01-31 21:44:54 +01:00
|
|
|
memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen,
|
|
|
|
sizeof(long)));
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-13 10:51:06 -08:00
|
|
|
/* if we exhaust the linear part then add what is left as a frag */
|
|
|
|
size -= headlen;
|
|
|
|
if (size) {
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
/* besides adding a partial frag here, we are going to add
|
|
|
|
* frags from xdp_buff; make sure there is enough space for
|
|
|
|
* them
|
|
|
|
*/
|
|
|
|
if (unlikely(nr_frags >= MAX_SKB_FRAGS - 1)) {
|
|
|
|
dev_kfree_skb(skb);
|
|
|
|
return NULL;
|
|
|
|
}
|
2019-02-13 10:51:06 -08:00
|
|
|
skb_add_rx_frag(skb, 0, rx_buf->page,
|
2023-01-31 21:45:02 +01:00
|
|
|
rx_buf->page_offset + headlen, size,
|
|
|
|
xdp->frame_sz);
|
2018-03-20 07:58:14 -07:00
|
|
|
} else {
|
2023-01-31 21:44:59 +01:00
|
|
|
/* buffer is unused, change the act that should be taken later
|
|
|
|
* on; data was copied onto skb's linear part so there's no
|
|
|
|
* need for adjusting page offset and we can reuse this buffer
|
|
|
|
* as-is
|
2019-02-13 10:51:06 -08:00
|
|
|
*/
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
rx_buf->act = ICE_SKB_CONSUMED;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(xdp_buff_has_frags(xdp))) {
|
|
|
|
struct skb_shared_info *skinfo = skb_shinfo(skb);
|
|
|
|
|
|
|
|
memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0],
|
|
|
|
sizeof(skb_frag_t) * nr_frags);
|
|
|
|
|
|
|
|
xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags,
|
|
|
|
sinfo->xdp_frags_size,
|
|
|
|
nr_frags * xdp->frame_sz,
|
|
|
|
xdp_buff_is_frag_pfmemalloc(xdp));
|
2018-03-20 07:58:14 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return skb;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2019-02-13 10:51:05 -08:00
|
|
|
* ice_put_rx_buf - Clean up used buffer and either recycle or free
|
|
|
|
* @rx_ring: Rx descriptor ring to transact packets on
|
|
|
|
* @rx_buf: Rx buffer to pull data from
|
2018-03-20 07:58:14 -07:00
|
|
|
*
|
2023-01-31 21:44:57 +01:00
|
|
|
* This function will clean up the contents of the rx_buf. It will either
|
|
|
|
* recycle the buffer or unmap it and free the associated resources.
|
2018-03-20 07:58:14 -07:00
|
|
|
*/
|
2020-08-25 19:27:36 +02:00
|
|
|
static void
|
2023-01-31 21:44:56 +01:00
|
|
|
ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
2019-07-25 01:55:34 -07:00
|
|
|
if (!rx_buf)
|
|
|
|
return;
|
|
|
|
|
2023-01-31 21:44:56 +01:00
|
|
|
if (ice_can_reuse_rx_page(rx_buf)) {
|
2019-07-25 01:55:34 -07:00
|
|
|
/* hand second half of page back to the ring */
|
2018-03-20 07:58:14 -07:00
|
|
|
ice_reuse_rx_page(rx_ring, rx_buf);
|
|
|
|
} else {
|
|
|
|
/* we are not reusing the buffer so unmap it */
|
2019-10-24 01:11:22 -07:00
|
|
|
dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma,
|
|
|
|
ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE,
|
|
|
|
ICE_RX_DMA_ATTR);
|
2019-02-13 10:51:04 -08:00
|
|
|
__page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
|
2018-03-20 07:58:14 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* clear contents of buffer_info */
|
|
|
|
rx_buf->page = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
|
2018-10-26 11:44:47 -07:00
|
|
|
* @rx_ring: Rx descriptor ring to transact packets on
|
2018-03-20 07:58:14 -07:00
|
|
|
* @budget: Total limit on number of packets to process
|
|
|
|
*
|
|
|
|
* This function provides a "bounce buffer" approach to Rx interrupt
|
2018-10-26 11:44:46 -07:00
|
|
|
* processing. The advantage to this is that on systems that have
|
2018-03-20 07:58:14 -07:00
|
|
|
* expensive overhead for IOMMU access this provides a means of avoiding
|
|
|
|
* it by maintaining the mapping of the page to the system.
|
|
|
|
*
|
|
|
|
* Returns amount of work completed
|
|
|
|
*/
|
2021-08-19 13:59:58 +02:00
|
|
|
int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
|
2018-03-20 07:58:14 -07:00
|
|
|
{
|
2023-01-31 21:44:55 +01:00
|
|
|
unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
|
2021-01-18 16:13:17 +01:00
|
|
|
unsigned int offset = rx_ring->rx_offset;
|
2023-01-31 21:44:55 +01:00
|
|
|
struct xdp_buff *xdp = &rx_ring->xdp;
|
2023-05-31 08:44:57 -07:00
|
|
|
u32 cached_ntc = rx_ring->first_desc;
|
2021-08-19 14:00:01 +02:00
|
|
|
struct ice_tx_ring *xdp_ring = NULL;
|
2019-11-04 09:38:56 -08:00
|
|
|
struct bpf_prog *xdp_prog = NULL;
|
2023-01-31 21:44:57 +01:00
|
|
|
u32 ntc = rx_ring->next_to_clean;
|
|
|
|
u32 cnt = rx_ring->count;
|
2023-01-31 21:44:59 +01:00
|
|
|
u32 xdp_xmit = 0;
|
2023-01-31 21:45:04 +01:00
|
|
|
u32 cached_ntu;
|
2019-06-26 02:20:19 -07:00
|
|
|
bool failure;
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
u32 first;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
ice: Add XDP frame size to driver
This driver uses different memory models depending on PAGE_SIZE at
compile time. For PAGE_SIZE 4K it uses page splitting, meaning that for
a normal MTU the frame size is 2048 bytes (with 192 bytes of headroom).
For larger MTUs the driver still uses page splitting, by allocating
order-1 pages (8192 bytes) for Rx frames. For PAGE_SIZE larger than 4K,
the driver instead advances its rx_buffer->page_offset by the frame
size "truesize".
For XDP frame size calculations, this means that in the PAGE_SIZE
larger than 4K mode the frame_sz changes on a per-packet basis. For the
page-split 4K PAGE_SIZE mode, xdp.frame_sz is constant and can be set
once outside the main NAPI loop.
The default setting in the driver uses build_skb(), which provides the
necessary headroom and tailroom for XDP-redirect in the Rx frame (in
both modes).
There is one complication, which is legacy-rx mode (configurable via
ethtool priv-flags). There is zero headroom in this mode, while headroom
is a requirement for XDP-redirect to work. The conversion to xdp_frame
(convert_to_xdp_frame) will detect this insufficient space, and the
xdp_do_redirect() call will fail. This is deemed acceptable, as it
allows other XDP actions to still work in legacy mode. In legacy mode
with a larger PAGE_SIZE, due to the lacking tailroom, we also accept
that xdp_adjust_tail shrink doesn't work. (A sketch of the truesize
computation described here is included right after this annotation.)
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: intel-wired-lan@lists.osuosl.org
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Link: https://lore.kernel.org/bpf/158945347002.97035.328088795813704587.stgit@firesoul
2020-05-14 12:51:10 +02:00
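A hedged sketch of the truesize computation the annotation above describes,
assuming the usual page-split layout; the actual ice_rx_frame_truesize()
helper in the driver may differ in detail:

/* Sketch only: approximate per-ring/per-packet truesize computation */
static unsigned int
ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, const unsigned int size)
{
#if (PAGE_SIZE < 8192)
	/* page-split mode: each buffer owns half of a (possibly order-1)
	 * page, so truesize is constant for a given ring setup
	 */
	return ice_rx_pg_size(rx_ring) / 2;
#else
	/* large-page mode: truesize follows the per-packet frame size, plus
	 * headroom and room for skb_shared_info when the default
	 * (non-legacy) layout provides them
	 */
	return rx_ring->rx_offset ?
	       SKB_DATA_ALIGN(rx_ring->rx_offset + size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
	       SKB_DATA_ALIGN(size);
#endif
}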
|
|
|
/* Frame size depends on rx_ring setup when PAGE_SIZE=4K */
|
|
|
|
#if (PAGE_SIZE < 8192)
|
2023-01-31 21:44:55 +01:00
|
|
|
xdp->frame_sz = ice_rx_frame_truesize(rx_ring, 0);
|
ice: Add XDP frame size to driver
2020-05-14 12:51:10 +02:00
|
|
|
#endif
|
2019-11-04 09:38:56 -08:00
|
|
|
|
2021-08-19 14:00:01 +02:00
|
|
|
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
|
2023-01-31 21:45:04 +01:00
|
|
|
if (xdp_prog) {
|
2021-08-19 14:00:01 +02:00
|
|
|
xdp_ring = rx_ring->xdp_ring;
|
2023-01-31 21:45:04 +01:00
|
|
|
cached_ntu = xdp_ring->next_to_use;
|
|
|
|
}
|
2021-08-19 14:00:01 +02:00
|
|
|
|
2019-02-19 15:04:13 -08:00
|
|
|
/* start the loop to process Rx packets bounded by 'budget' */
|
2018-03-20 07:58:14 -07:00
|
|
|
while (likely(total_rx_pkts < (unsigned int)budget)) {
|
|
|
|
union ice_32b_rx_flex_desc *rx_desc;
|
2019-02-13 10:51:01 -08:00
|
|
|
struct ice_rx_buf *rx_buf;
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
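As a side note, below is a minimal sketch of how the Rx-side unused-descriptor
macro described above could be expressed, mirroring the wrap-around arithmetic
of ICE_DESC_UNUSED() but anchored on first_desc. The exact struct ice_rx_ring
field names used here (count, first_desc, next_to_use) are assumptions for
illustration, not taken from the driver headers.

/* Sketch only: count the descriptors the refill routine may safely reuse,
 * i.e. everything between next_to_use and first_desc (exclusive), so a
 * partially built multi-fragment xdp_buff is never overwritten.
 * Field names are assumed.
 */
#define ICE_RX_DESC_UNUSED(R)	\
	((((R)->first_desc > (R)->next_to_use) ? 0 : (R)->count) + \
	 (R)->first_desc - (R)->next_to_use - 1)

With a bound like this, the refill path cannot advance ntu past buffers still
referenced by the in-progress frame, which is exactly the ntc-over-ntu hazard
the paragraph above describes.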
|
|
|
struct sk_buff *skb;
|
2019-02-13 10:51:01 -08:00
|
|
|
unsigned int size;
|
2018-03-20 07:58:14 -07:00
|
|
|
u16 stat_err_bits;
|
2023-12-05 22:08:39 +01:00
|
|
|
u16 vlan_tci;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-02-19 15:04:13 -08:00
|
|
|
/* get the Rx desc from Rx ring based on 'next_to_clean' */
|
2023-01-31 21:44:57 +01:00
|
|
|
rx_desc = ICE_RX_DESC(rx_ring, ntc);
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
/* status_error_len will always be zero for unused descriptors
|
|
|
|
* because it's cleared in cleanup, and overlaps with hdr_addr
|
|
|
|
* which is always zero because packet split isn't used, if the
|
|
|
|
* hardware wrote DD then it will be non-zero
|
|
|
|
*/
|
|
|
|
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
|
ice: Add hot path support for 802.1Q and 802.1ad VLAN offloads
Currently the driver only supports 802.1Q VLAN insertion and stripping.
However, once Double VLAN Mode (DVM) is fully supported, then both 802.1Q
and 802.1ad VLAN insertion and stripping will be supported. Unfortunately
the VSI context parameters only allow for one VLAN ethertype at a time
for VLAN offloads, so only one or the other VLAN ethertype offload can be
supported at once.
To support this, multiple changes are needed.
Rx path changes:
[1] In DVM, the Rx queue context l2tagsel field needs to be cleared so
the outermost tag shows up in the l2tag2_2nd field of the Rx flex
descriptor. In Single VLAN Mode (SVM), the l2tagsel field should remain
1 to support SVM configurations.
[2] Modify the ice_test_staterr() function to take a __le16 instead of
the ice_32b_rx_flex_desc union pointer so this function can be used for
both rx_desc->wb.status_error0 and rx_desc->wb.status_error1.
[3] Add the new inline function ice_get_vlan_tag_from_rx_desc() that
checks if there is a VLAN tag in l2tag1 or l2tag2_2nd.
[4] In ice_receive_skb(), add a check to see if NETIF_F_HW_VLAN_STAG_RX
is enabled in netdev->features. If it is, then this is the VLAN
ethertype that needs to be used for the stripped VLAN tag. Since
ice_fix_features() prevents CTAG_RX and STAG_RX from being enabled
simultaneously, the VLAN ethertype will only ever be 802.1Q or 802.1ad.
Tx path changes:
[1] In DVM, the VLAN tag needs to be placed in the l2tag2 field of the Tx
context descriptor. The new define ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN was
added to the list of tx_flags to handle this case.
[2] When the stack requests the VLAN tag to be offloaded on Tx, the
driver needs to set either ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN or
ICE_TX_FLAGS_HW_VLAN, so the tag is inserted in l2tag2 or l2tag1
respectively. To determine which location to use, set a bit in the Tx
ring flags field during ring allocation that can be used to determine
which field to use in the Tx descriptor. In DVM, always use l2tag2,
and in SVM, always use l2tag1.
Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-12-02 08:38:47 -08:00
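To make Rx points [2] and [3] above concrete, here is a hedged sketch of a
__le16-based status test and an l2tag1/l2tag2_2nd lookup. ice_test_staterr()
and ice_get_vlan_tag_from_rx_desc() are named in the description above, but
the status-bit macros used here (ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S,
ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S) and the exact writeback field layout are
assumptions about the flex descriptor, not authoritative definitions.

/* Sketch: works on either status_error0 or status_error1 of the flex desc */
static inline bool
ice_test_staterr(__le16 status_err_n, const u16 stat_err_bits)
{
	return !!(status_err_n & cpu_to_le16(stat_err_bits));
}

/* Sketch: return the stripped VLAN tag from l2tag1 (SVM) or l2tag2_2nd (DVM),
 * or 0 when no tag was stripped. Bit and field names are assumptions.
 */
static inline u16
ice_get_vlan_tag_from_rx_desc(union ice_32b_rx_flex_desc *rx_desc)
{
	u16 stat_err_bits;

	stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
	if (ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
		return le16_to_cpu(rx_desc->wb.l2tag1);

	stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS1_L2TAG2P_S);
	if (ice_test_staterr(rx_desc->wb.status_error1, stat_err_bits))
		return le16_to_cpu(rx_desc->wb.l2tag2_2nd);

	return 0;
}

Taking a plain __le16 is why the DD and RXE checks in the hot path below pass
rx_desc->wb.status_error0 directly instead of the whole descriptor.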
|
|
|
if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
|
2018-03-20 07:58:14 -07:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* This memory barrier is needed to keep us from reading
|
|
|
|
* any other fields out of the rx_desc until we know the
|
|
|
|
* DD bit is set.
|
|
|
|
*/
|
|
|
|
dma_rmb();
|
|
|
|
|
2021-06-08 16:35:17 -07:00
|
|
|
ice_trace(clean_rx_irq, rx_ring, rx_desc);
|
2020-05-11 18:01:40 -07:00
|
|
|
if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) {
|
2021-03-09 11:08:10 +08:00
|
|
|
struct ice_vsi *ctrl_vsi = rx_ring->vsi;
|
|
|
|
|
|
|
|
if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
|
2022-02-16 13:37:29 -08:00
|
|
|
ctrl_vsi->vf)
|
2021-03-09 11:08:10 +08:00
|
|
|
ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
|
2023-01-31 21:44:57 +01:00
|
|
|
if (++ntc == cnt)
|
|
|
|
ntc = 0;
|
2023-03-09 13:38:56 -08:00
|
|
|
rx_ring->first_desc = ntc;
|
2020-05-11 18:01:40 -07:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-02-13 10:51:01 -08:00
|
|
|
size = le16_to_cpu(rx_desc->wb.pkt_len) &
|
|
|
|
ICE_RX_FLX_DESC_PKT_LEN_M;
|
|
|
|
|
2019-07-25 01:55:34 -07:00
|
|
|
/* retrieve a buffer from the ring */
|
2023-01-31 21:44:57 +01:00
|
|
|
rx_buf = ice_get_rx_buf(rx_ring, size, ntc);
|
2019-07-25 01:55:34 -07:00
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
if (!xdp->data) {
|
|
|
|
void *hard_start;
|
2019-11-04 09:38:56 -08:00
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
|
|
|
|
offset;
|
|
|
|
xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
|
ice: Add XDP frame size to driver
2020-05-14 12:51:10 +02:00
|
|
|
#if (PAGE_SIZE > 4096)
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
/* At larger PAGE_SIZE, frame_sz depends on the frame length */
|
|
|
|
xdp->frame_sz = ice_rx_frame_truesize(rx_ring, size);
|
ice: Add XDP frame size to driver
2020-05-14 12:51:10 +02:00
|
|
|
#endif
|
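For context on the per-packet frame_sz update guarded by the PAGE_SIZE check
above, here is a rough sketch of what a truesize helper along the lines of the
ice_rx_frame_truesize() call could compute, following the memory model in the
"ice: Add XDP frame size to driver" commit message. The half-page split, the
ice_rx_pg_size() helper, and the rx_offset field are assumptions for
illustration.

/* Sketch only: per-buffer truesize used as xdp_buff::frame_sz.
 * With 4K pages the page is split in half, so truesize is constant and can
 * be set once outside the loop; with larger pages it tracks the received
 * length plus head/tailroom. Helper and field names are assumed.
 */
static unsigned int
ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, const unsigned int size)
{
#if (PAGE_SIZE > 4096)
	/* larger pages: truesize follows the received length */
	return rx_ring->rx_offset ?
	       SKB_DATA_ALIGN(rx_ring->rx_offset + size) +
	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
	       SKB_DATA_ALIGN(size);
#else
	/* 4K pages: fixed half-page split, constant truesize */
	return ice_rx_pg_size(rx_ring) / 2;
#endif
}

This is why the loop only touches xdp->frame_sz under the PAGE_SIZE > 4096
guard; for 4K pages it stays constant across the whole NAPI poll.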
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
xdp_buff_clear_frags_flag(xdp);
|
|
|
|
} else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (++ntc == cnt)
|
|
|
|
ntc = 0;
|
|
|
|
|
|
|
|
/* skip if it is NOP desc */
|
|
|
|
if (ice_is_non_eop(rx_ring, rx_desc))
|
|
|
|
continue;
|
2019-11-04 09:38:56 -08:00
|
|
|
|
2023-12-05 22:08:33 +01:00
|
|
|
ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc);
|
2023-01-31 21:44:59 +01:00
|
|
|
if (rx_buf->act == ICE_XDP_PASS)
|
2019-10-24 01:11:23 -07:00
|
|
|
goto construct_skb;
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
total_rx_bytes += xdp_get_buff_len(xdp);
|
2019-10-24 01:11:23 -07:00
|
|
|
total_rx_pkts++;
|
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
xdp->data = NULL;
|
|
|
|
rx_ring->first_desc = ntc;
|
2024-01-24 20:15:55 +01:00
|
|
|
rx_ring->nr_frags = 0;
|
2019-10-24 01:11:23 -07:00
|
|
|
continue;
|
2019-11-04 09:38:56 -08:00
|
|
|
construct_skb:
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
if (likely(ice_ring_uses_build_skb(rx_ring)))
|
|
|
|
skb = ice_build_skb(rx_ring, xdp);
|
|
|
|
else
|
|
|
|
skb = ice_construct_skb(rx_ring, xdp);
|
2019-02-13 10:51:06 -08:00
|
|
|
/* exit if we failed to retrieve a buffer */
|
|
|
|
if (!skb) {
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
rx_ring->ring_stats->rx_stats.alloc_page_failed++;
|
|
|
|
rx_buf->act = ICE_XDP_CONSUMED;
|
|
|
|
if (unlikely(xdp_buff_has_frags(xdp)))
|
|
|
|
ice_set_rx_bufs_act(xdp, rx_ring,
|
|
|
|
ICE_XDP_CONSUMED);
|
|
|
|
xdp->data = NULL;
|
|
|
|
rx_ring->first_desc = ntc;
|
2024-01-24 20:15:55 +01:00
|
|
|
rx_ring->nr_frags = 0;
|
2018-03-20 07:58:14 -07:00
|
|
|
break;
|
2019-02-13 10:51:06 -08:00
|
|
|
}
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
xdp->data = NULL;
|
|
|
|
rx_ring->first_desc = ntc;
|
2024-01-24 20:15:55 +01:00
|
|
|
rx_ring->nr_frags = 0;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
|
ice: Add hot path support for 802.1Q and 802.1ad VLAN offloads
2021-12-02 08:38:47 -08:00
|
|
|
if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
|
|
|
|
stat_err_bits))) {
|
2018-03-20 07:58:14 -07:00
|
|
|
dev_kfree_skb_any(skb);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-12-05 22:08:39 +01:00
|
|
|
vlan_tci = ice_get_vlan_tci(rx_desc);
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2019-10-09 07:09:51 -07:00
|
|
|
/* pad the skb if needed, to make a valid ethernet frame */
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
if (eth_skb_pad(skb))
|
2018-03-20 07:58:14 -07:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/* probably a little skewed due to removing CRC */
|
|
|
|
total_rx_bytes += skb->len;
|
|
|
|
|
2018-03-20 07:58:15 -07:00
|
|
|
/* populate checksum, VLAN, and protocol */
|
2023-12-05 22:08:32 +01:00
|
|
|
ice_process_skb_fields(rx_ring, rx_desc, skb);
|
2018-03-20 07:58:15 -07:00
|
|
|
|
2021-06-08 16:35:17 -07:00
|
|
|
ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb);
|
2018-03-20 07:58:14 -07:00
|
|
|
/* send completed skb up the stack */
|
2023-12-05 22:08:39 +01:00
|
|
|
ice_receive_skb(rx_ring, skb, vlan_tci);
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
/* update budget accounting */
|
|
|
|
total_rx_pkts++;
|
|
|
|
}
|
|
|
|
|
ice: Add support for XDP multi-buffer on Rx side
2023-01-31 21:45:03 +01:00
|
|
|
first = rx_ring->first_desc;
|
|
|
|
while (cached_ntc != first) {
|
2023-01-31 21:44:59 +01:00
|
|
|
struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc];
|
|
|
|
|
|
|
|
if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) {
|
|
|
|
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
|
|
|
|
xdp_xmit |= buf->act;
|
|
|
|
} else if (buf->act & ICE_XDP_CONSUMED) {
|
|
|
|
buf->pagecnt_bias++;
|
|
|
|
} else if (buf->act == ICE_XDP_PASS) {
|
|
|
|
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
|
|
|
|
}
|
|
|
|
|
|
|
|
ice_put_rx_buf(rx_ring, buf);
|
|
|
|
if (++cached_ntc >= cnt)
|
|
|
|
cached_ntc = 0;
|
|
|
|
}
|
2023-01-31 21:44:57 +01:00
|
|
|
rx_ring->next_to_clean = ntc;
|
2019-06-26 02:20:19 -07:00
|
|
|
/* return up to ICE_RX_DESC_UNUSED() buffers to hardware */
|
ice: Add support for XDP multi-buffer on Rx side
The ice driver needs to be reworked a bit on the Rx data path in order to
support multi-buffer XDP. For the skb path, it currently works in a way
that the Rx ring carries a pointer to the skb, so if the driver didn't
manage to combine a fragmented frame at the current NAPI instance, it can
restore the state on the next instance and keep looking for the last
fragment (so the descriptor with the EOP bit set). What needs to be
achieved is that the xdp_buff gets combined in such a way (linear + frags
part) in the first place. Then the skb will be ready to go in case of
XDP_PASS or no BPF program being present on the interface. If a BPF
program is there, it would work on multi-buffer XDP. At this point the
xdp_buff resides directly on the Rx ring, so given the fact that the skb
will be built straight from the xdp_buff, there will be no further need to
carry the skb on the Rx ring.
Besides removing the skb pointer from the Rx ring, lots of members have
been moved around within ice_rx_ring. The first and foremost reason was to
place rx_buf with xdp_buff on the same cacheline. This means that once we
touch rx_buf (which is a preceding step before touching xdp_buff),
xdp_buff will already be hot in cache. The second thing was that xdp_rxq
is used rather rarely and occupies a separate cacheline, so maybe it is
better to have it at the end of ice_rx_ring.
Another change that affects ice_rx_ring is the introduction of
ice_rx_ring::first_desc. Its purpose is twofold: the first is to propagate
rx_buf->act to all parts of the current xdp_buff after running the XDP
program, so that ice_put_rx_buf(), which got moved out of the main Rx
processing loop, will be able to take an appropriate action on each
buffer. The second is for ice_construct_skb().
ice_construct_skb() has a copybreak mechanism which had an explicit
impact on the xdp_buff->skb conversion in the new approach when the legacy
Rx flag is toggled. It works in a way that the linear part is 256 bytes
long; if the frame is bigger than that, the remaining bytes go as a frag to
skb_shared_info.
This means that while memcpying frags from the xdp_buff to the newly
allocated skb, care needs to be taken when picking the destination frag
array entry.
By the time ice_construct_skb() is called, when dealing with a
fragmented frame, the current rx_buf points to the *last* fragment, but
copybreak needs to be done against the first one. That's where
ice_rx_ring::first_desc helps.
When frame building spans across NAPI polls (the DD bit is not set on
the current descriptor and xdp->data is not NULL), the current Rx buffer
handling state could run into problems.
Since calls to ice_put_rx_buf() were pulled out of the main Rx
processing loop and were scoped from cached_ntc to the current ntc,
remember that the mentioned function now relies on rx_buf->act, which is
set within ice_run_xdp(). ice_run_xdp() is only called once the EOP bit is
found, so currently we could put an Rx buffer with rx_buf->act being
*uninitialized*.
To address this, change the scoping to rely on first_desc on both
boundaries instead.
This also implies that cleaned_count, which is used as an input to
ice_alloc_rx_bufs() and tells how many new buffers should be refilled,
has to be adjusted. If it stayed as is, ntc could end up going past ntu.
Therefore, remove cleaned_count altogether and have the allocation routine
use the newly introduced ICE_RX_DESC_UNUSED() macro, which is the Rx-side
equivalent of ICE_DESC_UNUSED() based on
struct ice_rx_ring::first_desc instead of next_to_clean.
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Link: https://lore.kernel.org/bpf/20230131204506.219292-11-maciej.fijalkowski@intel.com
2023-01-31 21:45:03 +01:00
|
|
|
failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring));
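For reference, a minimal, self-contained sketch of what an ICE_RX_DESC_UNUSED()-style helper could look like, assuming it mirrors the classic ICE_DESC_UNUSED() ring arithmetic but is anchored on first_desc; the struct and names below are illustrative stand-ins rather than the driver's definitions. The idea is that descriptors between first_desc and next_to_use belong to a frame still being assembled and must not be handed back for refill.

/* Illustrative stand-in, not the driver's macro: count how many Rx
 * descriptors can be refilled without touching the frame that starts
 * at first_desc.
 */
#include <stdio.h>

struct demo_rx_ring {
	unsigned short count;		/* total descriptors in the ring */
	unsigned short next_to_use;	/* next slot handed to HW */
	unsigned short first_desc;	/* first descriptor of the frame in progress */
};

#define DEMO_RX_DESC_UNUSED(R)						\
	((((R)->first_desc > (R)->next_to_use) ? 0 : (R)->count) +	\
	 (R)->first_desc - (R)->next_to_use - 1)

int main(void)
{
	/* frame in progress starts at slot 10, HW was given up to slot 500 */
	struct demo_rx_ring ring = {
		.count = 512, .next_to_use = 500, .first_desc = 10,
	};

	/* 512 + 10 - 500 - 1 = 21 descriptors can safely be refilled */
	printf("unused = %d\n", DEMO_RX_DESC_UNUSED(&ring));
	return 0;
}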
|
2019-06-26 02:20:19 -07:00
|
|
|
|
2023-01-31 21:45:01 +01:00
|
|
|
if (xdp_xmit)
|
2023-01-31 21:45:04 +01:00
|
|
|
ice_finalize_xdp_rx(xdp_ring, xdp_xmit, cached_ntu);
|
2019-11-04 09:38:56 -08:00
|
|
|
|
ice: Accumulate ring statistics over reset
Resets may occur with or without user interaction. For example, a TX hang
or reconfiguration of parameters will result in a reset. During reset, the
VSI is freed, freeing any statistics structures inside as well. This would
create an issue for the user where a reset happens in the background,
statistics set to zero, and the user checks ring statistics expecting them
to be populated.
To ensure this doesn't happen, accumulate ring statistics over reset.
Define a new ring statistics structure, ice_ring_stats. The new structure
lives in the VSI's parent, preserving ring statistics when VSI is freed.
1. Define a new structure vsi_ring_stats in the PF scope
2. Allocate/free stats only during probe, unload, or change in ring size
3. Replace previous ring statistics functionality with new structure
Signed-off-by: Benjamin Mikailenko <benjamin.mikailenko@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-11-18 16:20:02 -05:00
|
|
|
if (rx_ring->ring_stats)
|
|
|
|
ice_update_rx_ring_stats(rx_ring, total_rx_pkts,
|
|
|
|
total_rx_bytes);
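As a rough illustration of the "stats live in the VSI's parent" idea from the commit message above, here is a toy userspace model; every name in it is hypothetical and only the ownership relationship matters: the ring borrows a pointer to a stats block that outlives it, so a reset that frees and rebuilds the ring does not zero what the user sees.

/* Toy model, illustration only: the parent owns the counters, the ring
 * only points at them, so ring teardown/rebuild keeps them intact.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_ring_stats { unsigned long long pkts, bytes; };

struct demo_parent { struct demo_ring_stats *rxq_stats; /* allocated at probe */ };
struct demo_ring   { struct demo_ring_stats *stats;     /* borrowed from parent */ };

static struct demo_ring *ring_create(struct demo_parent *pf, int q)
{
	struct demo_ring *r = calloc(1, sizeof(*r));

	r->stats = &pf->rxq_stats[q];	/* reuse the persistent block */
	return r;
}

int main(void)
{
	struct demo_parent pf = {
		.rxq_stats = calloc(1, sizeof(struct demo_ring_stats)),
	};
	struct demo_ring *rx = ring_create(&pf, 0);

	rx->stats->pkts += 1000;	/* traffic before the reset */
	free(rx);			/* "reset": the ring goes away, the stats do not */

	rx = ring_create(&pf, 0);
	rx->stats->pkts += 500;		/* traffic after the reset */
	printf("pkts seen by user: %llu\n", rx->stats->pkts);	/* 1500 */

	free(rx);
	free(pf.rxq_stats);
	return 0;
}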
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
/* guarantee a trip back through this routine if there was a failure */
|
|
|
|
return failure ? budget : (int)total_rx_pkts;
|
|
|
|
}
|
|
|
|
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
static void __ice_update_sample(struct ice_q_vector *q_vector,
|
|
|
|
struct ice_ring_container *rc,
|
|
|
|
struct dim_sample *sample,
|
|
|
|
bool is_tx)
|
|
|
|
{
|
|
|
|
u64 packets = 0, bytes = 0;
|
|
|
|
|
|
|
|
if (is_tx) {
|
|
|
|
struct ice_tx_ring *tx_ring;
|
|
|
|
|
|
|
|
ice_for_each_tx_ring(tx_ring, *rc) {
|
ice: Accumulate ring statistics over reset
Resets may occur with or without user interaction. For example, a TX hang
or reconfiguration of parameters will result in a reset. During reset, the
VSI is freed, freeing any statistics structures inside as well. This would
create an issue for the user where a reset happens in the background,
statistics set to zero, and the user checks ring statistics expecting them
to be populated.
To ensure this doesn't happen, accumulate ring statistics over reset.
Define a new ring statistics structure, ice_ring_stats. The new structure
lives in the VSI's parent, preserving ring statistics when VSI is freed.
1. Define a new structure vsi_ring_stats in the PF scope
2. Allocate/free stats only during probe, unload, or change in ring size
3. Replace previous ring statistics functionality with new structure
Signed-off-by: Benjamin Mikailenko <benjamin.mikailenko@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-11-18 16:20:02 -05:00
|
|
|
struct ice_ring_stats *ring_stats;
|
|
|
|
|
|
|
|
ring_stats = tx_ring->ring_stats;
|
|
|
|
if (!ring_stats)
|
|
|
|
continue;
|
|
|
|
packets += ring_stats->stats.pkts;
|
|
|
|
bytes += ring_stats->stats.bytes;
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
struct ice_rx_ring *rx_ring;
|
|
|
|
|
|
|
|
ice_for_each_rx_ring(rx_ring, *rc) {
|
ice: Accumulate ring statistics over reset
Resets may occur with or without user interaction. For example, a TX hang
or reconfiguration of parameters will result in a reset. During reset, the
VSI is freed, freeing any statistics structures inside as well. This would
create an issue for the user where a reset happens in the background,
statistics set to zero, and the user checks ring statistics expecting them
to be populated.
To ensure this doesn't happen, accumulate ring statistics over reset.
Define a new ring statistics structure, ice_ring_stats. The new structure
lives in the VSI's parent, preserving ring statistics when VSI is freed.
1. Define a new structure vsi_ring_stats in the PF scope
2. Allocate/free stats only during probe, unload, or change in ring size
3. Replace previous ring statistics functionality with new structure
Signed-off-by: Benjamin Mikailenko <benjamin.mikailenko@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-11-18 16:20:02 -05:00
|
|
|
struct ice_ring_stats *ring_stats;
|
|
|
|
|
|
|
|
ring_stats = rx_ring->ring_stats;
|
|
|
|
if (!ring_stats)
|
|
|
|
continue;
|
|
|
|
packets += ring_stats->stats.pkts;
|
|
|
|
bytes += ring_stats->stats.bytes;
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
dim_update_sample(q_vector->total_events, packets, bytes, sample);
|
|
|
|
sample->comp_ctr = 0;
|
|
|
|
|
|
|
|
/* if dim settings get stale, like when not updated for 1
|
|
|
|
* second or longer, force it to start again. This addresses the
|
|
|
|
* frequent case of an idle queue being switched to by the
|
|
|
|
* scheduler. The 1,000 here means 1,000 milliseconds.
|
|
|
|
*/
|
|
|
|
if (ktime_ms_delta(sample->time, rc->dim.start_sample.time) >= 1000)
|
|
|
|
rc->dim.state = DIM_START_MEASURE;
|
|
|
|
}
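A quick standalone check of the rate-limit arithmetic from the commit message above: a rate limit expressed as a minimum gap between interrupts caps the interrupt rate at one second divided by that gap, so a 4 us limit allows at most 250,000 interrupts per second, and the earlier 100,000 cap corresponds to a 10 us gap (the 10 us figure is inferred from the numbers quoted in the message).

/* Standalone arithmetic check, illustration only. */
#include <stdio.h>

int main(void)
{
	const unsigned int usecs_per_sec = 1000 * 1000;
	const unsigned int new_gap_us = 4, old_gap_us = 10;

	printf("new max irq/s: %u\n", usecs_per_sec / new_gap_us);	/* 250000 */
	printf("old max irq/s: %u\n", usecs_per_sec / old_gap_us);	/* 100000 */
	return 0;
}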
|
|
|
|
|
2019-02-28 15:25:47 -08:00
|
|
|
/**
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
* ice_net_dim - Update net DIM algorithm
|
|
|
|
* @q_vector: the vector associated with the interrupt
|
2019-02-28 15:25:47 -08:00
|
|
|
*
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
* Create a DIM sample and notify net_dim() so that it can possibly decide
|
|
|
|
* a new ITR value based on incoming packets, bytes, and interrupts.
|
2019-02-28 15:25:47 -08:00
|
|
|
*
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
* This function is a no-op if the ring is not configured to dynamic ITR.
|
2019-02-28 15:25:47 -08:00
|
|
|
*/
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
static void ice_net_dim(struct ice_q_vector *q_vector)
|
2019-02-19 15:04:01 -08:00
|
|
|
{
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
struct ice_ring_container *tx = &q_vector->tx;
|
|
|
|
struct ice_ring_container *rx = &q_vector->rx;
|
2019-02-28 15:25:47 -08:00
|
|
|
|
2021-03-31 14:16:59 -07:00
|
|
|
if (ITR_IS_DYNAMIC(tx)) {
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
struct dim_sample dim_sample;
|
2019-02-19 15:04:01 -08:00
|
|
|
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
__ice_update_sample(q_vector, tx, &dim_sample, true);
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
net_dim(&tx->dim, dim_sample);
|
2019-02-19 15:04:01 -08:00
|
|
|
}
|
|
|
|
|
2021-03-31 14:16:59 -07:00
|
|
|
if (ITR_IS_DYNAMIC(rx)) {
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
struct dim_sample dim_sample;
|
2019-02-19 15:04:01 -08:00
|
|
|
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
__ice_update_sample(q_vector, rx, &dim_sample, false);
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
net_dim(&rx->dim, dim_sample);
|
2019-02-19 15:04:01 -08:00
|
|
|
}
|
|
|
|
}
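A hedged sketch of the "pair of values per DIM slot" idea described in the commit message; the table contents, slot count and helper names below are invented for illustration and do not reflect the driver's actual profiles. The DIM library only recommends a slot index; the driver then programs both an ITR (maximum delay of a single interrupt) and an interrupt rate limit from the chosen pair.

/* Illustrative only: invented per-slot pairs, not the driver's tables. */
#include <stdio.h>

struct demo_dim_profile {
	unsigned short itr_us;		/* max delay of a single interrupt */
	unsigned short intrl_us;	/* min gap enforced between interrupts */
};

static const struct demo_dim_profile demo_rx_profile[] = {
	{ 2, 0 }, { 8, 0 }, { 16, 2 }, { 62, 4 }, { 126, 4 },
};

int main(void)
{
	int slot = 3;	/* pretend the DIM library recommended slot 3 */

	/* the driver would program both registers from the chosen pair */
	printf("ITR=%u us, INTRL=%u us\n",
	       demo_rx_profile[slot].itr_us, demo_rx_profile[slot].intrl_us);
	return 0;
}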
|
|
|
|
|
2018-12-19 10:03:29 -08:00
|
|
|
/**
|
|
|
|
* ice_buildreg_itr - build value for writing to the GLINT_DYN_CTL register
|
|
|
|
* @itr_idx: interrupt throttling index
|
2019-02-19 15:04:01 -08:00
|
|
|
* @itr: interrupt throttling value in usecs
|
2018-12-19 10:03:29 -08:00
|
|
|
*/
|
2019-02-19 15:04:05 -08:00
|
|
|
static u32 ice_buildreg_itr(u16 itr_idx, u16 itr)
|
2018-12-19 10:03:29 -08:00
|
|
|
{
|
2019-04-16 10:35:03 -07:00
|
|
|
/* The ITR value is reported in microseconds, and the register value is
|
2019-02-19 15:04:01 -08:00
|
|
|
* recorded in 2 microsecond units. For this reason we only need to
|
|
|
|
* shift by the GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S to apply this
|
|
|
|
* granularity as a shift instead of division. The mask makes sure the
|
|
|
|
* ITR value is never odd so we don't accidentally write into the field
|
|
|
|
* prior to the ITR field.
|
|
|
|
*/
|
|
|
|
itr &= ICE_ITR_MASK;
|
|
|
|
|
2018-12-19 10:03:29 -08:00
|
|
|
return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
|
|
|
|
(itr_idx << GLINT_DYN_CTL_ITR_INDX_S) |
|
2019-02-19 15:04:01 -08:00
|
|
|
(itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S));
|
2018-12-19 10:03:29 -08:00
|
|
|
}
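To make the shift trick above concrete, here is a standalone walk-through; the field offset and mask values are assumptions for illustration (an interval field starting at bit 5 and the 2 us register granularity mentioned in the comment), and only the mask-then-shift mechanics mirror the function.

/* Assumed offsets, illustration only: masking the low bit and shifting by
 * (interval offset - granularity shift) places "microseconds / 2" into
 * the interval field without a division.
 */
#include <stdio.h>

#define DEMO_INTERVAL_S	5	/* assumed bit offset of the interval field */
#define DEMO_ITR_GRAN_S	1	/* 2 us register granularity => shift by 1 */
#define DEMO_ITR_MASK	0x1ffe	/* keeps the value even, drops stray bits */

int main(void)
{
	unsigned int itr_us = 50;	/* desired throttle in microseconds */
	unsigned int reg;

	itr_us &= DEMO_ITR_MASK;
	reg = itr_us << (DEMO_INTERVAL_S - DEMO_ITR_GRAN_S);

	/* 50 us / 2 us-per-unit = 25 units, placed at bit 5: 25 << 5 = 800 */
	printf("interval bits = 0x%x (%u units)\n", reg, reg >> DEMO_INTERVAL_S);
	return 0;
}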
|
|
|
|
|
|
|
|
/**
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
* ice_enable_interrupt - re-enable MSI-X interrupt
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
* @q_vector: the vector associated with the interrupt to enable
|
|
|
|
*
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
* If the VSI is down, the interrupt will not be re-enabled. Also,
|
|
|
|
* when enabling the interrupt always reset the wb_on_itr to false
|
|
|
|
* and trigger a software interrupt to clean out internal state.
|
2018-12-19 10:03:29 -08:00
|
|
|
*/
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
static void ice_enable_interrupt(struct ice_q_vector *q_vector)
|
2018-12-19 10:03:29 -08:00
|
|
|
{
|
2019-08-08 07:39:35 -07:00
|
|
|
struct ice_vsi *vsi = q_vector->vsi;
|
2021-03-31 14:16:58 -07:00
|
|
|
bool wb_en = q_vector->wb_on_itr;
|
2018-12-19 10:03:29 -08:00
|
|
|
u32 itr_val;
|
|
|
|
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
if (test_bit(ICE_DOWN, vsi->state))
|
|
|
|
return;
|
|
|
|
|
2021-09-20 12:30:14 -07:00
|
|
|
/* trigger an ITR delayed software interrupt when exiting busy poll, to
|
|
|
|
* make sure to catch any pending cleanups that might have been missed
|
|
|
|
* due to interrupt state transition. If busy poll or poll isn't
|
|
|
|
* enabled, then don't update ITR, and just enable the interrupt.
|
2018-12-19 10:03:29 -08:00
|
|
|
*/
|
2021-09-20 12:30:14 -07:00
|
|
|
if (!wb_en) {
|
|
|
|
itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0);
|
|
|
|
} else {
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
q_vector->wb_on_itr = false;
|
|
|
|
|
2021-09-20 12:30:14 -07:00
|
|
|
/* do two things here with a single write. Set up the third ITR
|
|
|
|
* index to be used for software interrupt moderation, and then
|
|
|
|
* trigger a software interrupt with a rate limit of 20K on
|
|
|
|
* software interrupts, this will help avoid high interrupt
|
|
|
|
* loads due to frequently polling and exiting polling.
|
|
|
|
*/
|
|
|
|
itr_val = ice_buildreg_itr(ICE_IDX_ITR2, ICE_ITR_20K);
|
2021-03-31 14:16:58 -07:00
|
|
|
itr_val |= GLINT_DYN_CTL_SWINT_TRIG_M |
|
2021-09-20 12:30:14 -07:00
|
|
|
ICE_IDX_ITR2 << GLINT_DYN_CTL_SW_ITR_INDX_S |
|
2021-03-31 14:16:58 -07:00
|
|
|
GLINT_DYN_CTL_SW_ITR_INDX_ENA_M;
|
|
|
|
}
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val);
|
2018-12-19 10:03:29 -08:00
|
|
|
}
|
|
|
|
|
2019-07-25 01:55:32 -07:00
|
|
|
/**
|
|
|
|
* ice_set_wb_on_itr - set WB_ON_ITR for this q_vector
|
|
|
|
* @q_vector: q_vector to set WB_ON_ITR on
|
|
|
|
*
|
|
|
|
* We need to tell hardware to write-back completed descriptors even when
|
|
|
|
* interrupts are disabled. Descriptors will be written back on cache line
|
|
|
|
* boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR
|
2020-11-20 16:39:34 -08:00
|
|
|
* descriptors may not be written back if they don't fill a cache line until
|
|
|
|
* the next interrupt.
|
2019-07-25 01:55:32 -07:00
|
|
|
*
|
2020-11-20 16:39:34 -08:00
|
|
|
* This sets the write-back frequency to whatever was set previously for the
|
|
|
|
* ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we
|
|
|
|
* aren't meddling with the INTENA_M bit.
|
2019-07-25 01:55:32 -07:00
|
|
|
*/
|
2019-08-08 07:39:35 -07:00
|
|
|
static void ice_set_wb_on_itr(struct ice_q_vector *q_vector)
|
2019-07-25 01:55:32 -07:00
|
|
|
{
|
2019-08-08 07:39:35 -07:00
|
|
|
struct ice_vsi *vsi = q_vector->vsi;
|
|
|
|
|
2020-11-20 16:39:34 -08:00
|
|
|
/* already in wb_on_itr mode no need to change it */
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
if (q_vector->wb_on_itr)
|
2019-07-25 01:55:32 -07:00
|
|
|
return;
|
|
|
|
|
2020-11-20 16:39:34 -08:00
|
|
|
/* use previously set ITR values for all of the ITR indices by
|
|
|
|
* specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and
|
|
|
|
* be static in non-adaptive mode (user configured)
|
|
|
|
*/
|
|
|
|
wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
|
2023-12-05 17:01:05 -08:00
|
|
|
FIELD_PREP(GLINT_DYN_CTL_ITR_INDX_M, ICE_ITR_NONE) |
|
|
|
|
FIELD_PREP(GLINT_DYN_CTL_INTENA_MSK_M, 1) |
|
|
|
|
FIELD_PREP(GLINT_DYN_CTL_WB_ON_ITR_M, 1));
|
2019-07-25 01:55:32 -07:00
|
|
|
|
ice: replace custom AIM algorithm with kernel's DIM library
The ice driver has support for adaptive interrupt moderation, an
algorithm for tuning the interrupt rate dynamically. This algorithm
is based on various assumptions about ring size, socket buffer size,
link speed, SKB overhead, ethernet frame overhead and more.
The Linux kernel has support for a dynamic interrupt moderation
algorithm known as "dimlib". Replace the custom driver-specific
implementation of dynamic interrupt moderation with the kernel's
algorithm.
The Intel hardware has a different hardware implementation than the
originators of the dimlib code had to work with, which requires the
driver to use a slightly different set of inputs for the actual
moderation values, while getting all the advice from dimlib of
better/worse, shift left or right.
The change made for this implementation is to use a pair of values
for each of the 5 "slots" that the dimlib moderation expects, and
the driver will program those pairs when dimlib recommends a slot to
use. The current implementation uses two tables, one for receive
and one for transmit, and the pairs of values in each slot set the
maximum delay of an interrupt and a maximum number of interrupts per
second (both expressed in microseconds).
There are two separate kinds of bugs fixed by using DIMLIB, one is
UDP single stream send was too slow, and the other is that 8K
ping-pong was going to the most aggressive moderation and has much
too high latency.
The overall result of using DIMLIB is that we meet or exceed our
performance expectations set based on the old algorithm.
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-03-31 14:16:57 -07:00
|
|
|
q_vector->wb_on_itr = true;
|
2019-07-25 01:55:32 -07:00
|
|
|
}
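For readers unfamiliar with FIELD_PREP(), a simplified userspace stand-in for the pattern used in the register write above; the helper macro and the mask values are illustrative assumptions (a GCC/Clang builtin is used for brevity), and the point is only that each value is shifted into place by its mask so the three fields can be OR-ed into a single write.

/* Simplified stand-in, illustration only: shift a value up to the lowest
 * set bit of its mask, then combine the fields in one register value.
 */
#include <stdio.h>

#define demo_field_prep(mask, val) \
	(((unsigned int)(val) << __builtin_ctz(mask)) & (mask))

#define DEMO_ITR_INDX_M		0x18		/* assumed: ITR index field, bits 4:3 */
#define DEMO_INTENA_MSK_M	0x80000000u	/* assumed: "don't touch INTENA" bit */
#define DEMO_WB_ON_ITR_M	0x40000000u	/* assumed: write back on ITR expiry */
#define DEMO_ITR_NONE		3

int main(void)
{
	unsigned int val = demo_field_prep(DEMO_ITR_INDX_M, DEMO_ITR_NONE) |
			   demo_field_prep(DEMO_INTENA_MSK_M, 1) |
			   demo_field_prep(DEMO_WB_ON_ITR_M, 1);

	printf("register value: 0x%08x\n", val);	/* 0xc0000018 */
	return 0;
}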
|
|
|
|
|
2018-03-20 07:58:14 -07:00
|
|
|
/**
|
|
|
|
* ice_napi_poll - NAPI polling Rx/Tx cleanup routine
|
|
|
|
* @napi: napi struct with our device's info in it
|
|
|
|
* @budget: amount of work driver is allowed to do this pass, in packets
|
|
|
|
*
|
|
|
|
* This function will clean all queues associated with a q_vector.
|
|
|
|
*
|
|
|
|
* Returns the amount of work done
|
|
|
|
*/
|
|
|
|
int ice_napi_poll(struct napi_struct *napi, int budget)
|
|
|
|
{
|
|
|
|
struct ice_q_vector *q_vector =
|
|
|
|
container_of(napi, struct ice_q_vector, napi);
|
2021-08-19 13:59:58 +02:00
|
|
|
struct ice_tx_ring *tx_ring;
|
|
|
|
struct ice_rx_ring *rx_ring;
|
2018-03-20 07:58:14 -07:00
|
|
|
bool clean_complete = true;
|
2019-07-25 01:55:29 -07:00
|
|
|
int budget_per_ring;
|
2018-03-20 07:58:14 -07:00
|
|
|
int work_done = 0;
|
|
|
|
|
|
|
|
/* Since the actual Tx work is minimal, we can give the Tx a larger
|
|
|
|
* budget and be more aggressive about cleaning up the Tx descriptors.
|
|
|
|
*/
|
2021-08-19 13:59:58 +02:00
|
|
|
ice_for_each_tx_ring(tx_ring, q_vector->tx) {
|
ice: optimize XDP_TX workloads
Optimize Tx descriptor cleaning for XDP. Current approach doesn't
really scale and chokes when multiple flows are handled.
Introduce two ring fields, @next_dd and @next_rs that will keep track of
descriptor that should be looked at when the need for cleaning arise and
the descriptor that should have the RS bit set, respectively.
Note that at this point the threshold is a constant (32), but it is
something that we could make configurable.
First thing is to get away from setting RS bit on each descriptor. Let's
do this only once NTU is higher than the currently @next_rs value. In
such case, grab the tx_desc[next_rs], set the RS bit in descriptor and
advance the @next_rs by a 32.
Second thing is to clean the Tx ring only when there are less than 32
free entries. For that case, look up the tx_desc[next_dd] for a DD bit.
This bit is written back by HW to let the driver know that xmit was
successful. It will happen only for those descriptors that had RS bit
set. Clean only 32 descriptors and advance the DD bit.
Actual cleaning routine is moved from ice_napi_poll() down to the
ice_xmit_xdp_ring(). It is safe to do so as XDP ring will not get any
SKBs in there that would rely on interrupts for the cleaning. Nice side
effect is that for rare case of Tx fallback path (that next patch is
going to introduce) we don't have to trigger the SW irq to clean the
ring.
With those two concepts, ring is kept at being almost full, but it is
guaranteed that driver will be able to produce Tx descriptors.
This approach seems to work out well even though the Tx descriptors are
produced in one-by-one manner. Test was conducted with the ice HW
bombarded with packets from HW generator, configured to generate 30
flows.
Xdp2 sample yields the following results:
<snip>
proto 17: 79973066 pkt/s
proto 17: 80018911 pkt/s
proto 17: 80004654 pkt/s
proto 17: 79992395 pkt/s
proto 17: 79975162 pkt/s
proto 17: 79955054 pkt/s
proto 17: 79869168 pkt/s
proto 17: 79823947 pkt/s
proto 17: 79636971 pkt/s
</snip>
As that sample reports the Rx'ed frames, let's look at sar output.
It says that what we Rx'ed we do actually Tx, no noticeable drops.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: ens4f1 79842324.00 79842310.40 4678261.17 4678260.38 0.00 0.00 0.00 38.32
with tx_busy staying calm.
When compared to a state before:
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: ens4f1 90919711.60 42233822.60 5327326.85 2474638.04 0.00 0.00 0.00 43.64
it can be observed that the amount of txpck/s is almost doubled, meaning
that the performance is improved by around 90%. All of this due to the
drops in the driver, previously the tx_busy stat was bumped at a 7mpps
rate.
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Tested-by: George Kuruvinakunnel <george.kuruvinakunnel@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-08-19 14:00:02 +02:00
|
|
|
bool wd;
|
|
|
|
|
|
|
|
if (tx_ring->xsk_pool)
|
2022-09-01 12:40:39 +02:00
|
|
|
wd = ice_xmit_zc(tx_ring);
|
ice: optimize XDP_TX workloads
Optimize Tx descriptor cleaning for XDP. Current approach doesn't
really scale and chokes when multiple flows are handled.
Introduce two ring fields, @next_dd and @next_rs that will keep track of
descriptor that should be looked at when the need for cleaning arise and
the descriptor that should have the RS bit set, respectively.
Note that at this point the threshold is a constant (32), but it is
something that we could make configurable.
First thing is to get away from setting RS bit on each descriptor. Let's
do this only once NTU is higher than the currently @next_rs value. In
such case, grab the tx_desc[next_rs], set the RS bit in descriptor and
advance the @next_rs by a 32.
Second thing is to clean the Tx ring only when there are less than 32
free entries. For that case, look up the tx_desc[next_dd] for a DD bit.
This bit is written back by HW to let the driver know that xmit was
successful. It will happen only for those descriptors that had RS bit
set. Clean only 32 descriptors and advance the DD bit.
Actual cleaning routine is moved from ice_napi_poll() down to the
ice_xmit_xdp_ring(). It is safe to do so as XDP ring will not get any
SKBs in there that would rely on interrupts for the cleaning. Nice side
effect is that for rare case of Tx fallback path (that next patch is
going to introduce) we don't have to trigger the SW irq to clean the
ring.
With those two concepts, ring is kept at being almost full, but it is
guaranteed that driver will be able to produce Tx descriptors.
This approach seems to work out well even though the Tx descriptors are
produced in one-by-one manner. Test was conducted with the ice HW
bombarded with packets from HW generator, configured to generate 30
flows.
Xdp2 sample yields the following results:
<snip>
proto 17: 79973066 pkt/s
proto 17: 80018911 pkt/s
proto 17: 80004654 pkt/s
proto 17: 79992395 pkt/s
proto 17: 79975162 pkt/s
proto 17: 79955054 pkt/s
proto 17: 79869168 pkt/s
proto 17: 79823947 pkt/s
proto 17: 79636971 pkt/s
</snip>
As that sample reports the Rx'ed frames, let's look at sar output.
It says that what we Rx'ed we do actually Tx, no noticeable drops.
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: ens4f1 79842324.00 79842310.40 4678261.17 4678260.38 0.00 0.00 0.00 38.32
with tx_busy staying calm.
When compared to a state before:
Average: IFACE rxpck/s txpck/s rxkB/s txkB/s rxcmp/s txcmp/s rxmcst/s %ifutil
Average: ens4f1 90919711.60 42233822.60 5327326.85 2474638.04 0.00 0.00 0.00 43.64
it can be observed that the amount of txpck/s is almost doubled, meaning
that the performance is improved by around 90%. All of this due to the
drops in the driver, previously the tx_busy stat was bumped at a 7mpps
rate.
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Tested-by: George Kuruvinakunnel <george.kuruvinakunnel@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-08-19 14:00:02 +02:00
|
|
|
else if (ice_ring_is_xdp(tx_ring))
|
|
|
|
wd = true;
|
|
|
|
else
|
|
|
|
wd = ice_clean_tx_irq(tx_ring, budget);
|
2019-11-04 09:38:56 -08:00
|
|
|
|
|
|
|
if (!wd)
|
2018-03-20 07:58:14 -07:00
|
|
|
clean_complete = false;
|
2019-11-04 09:38:56 -08:00
|
|
|
}
|
2018-03-20 07:58:14 -07:00
|
|
|
|
|
|
|
/* Handle case where we are called by netpoll with a budget of 0 */
|
2019-08-08 07:39:37 -07:00
|
|
|
if (unlikely(budget <= 0))
|
2018-03-20 07:58:14 -07:00
|
|
|
return budget;
|
|
|
|
|
2019-07-25 01:55:29 -07:00
|
|
|
/* normally we have 1 Rx ring per q_vector */
|
|
|
|
if (unlikely(q_vector->num_ring_rx > 1))
|
|
|
|
/* We attempt to distribute budget to each Rx queue fairly, but
|
|
|
|
* don't allow the budget to go below 1 because that would exit
|
|
|
|
* polling early.
|
|
|
|
*/
|
2020-05-07 17:41:05 -07:00
|
|
|
budget_per_ring = max_t(int, budget / q_vector->num_ring_rx, 1);
|
2019-07-25 01:55:29 -07:00
|
|
|
else
|
|
|
|
/* Max of 1 Rx ring in this q_vector so give it the budget */
|
|
|
|
budget_per_ring = budget;
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2021-08-19 13:59:58 +02:00
|
|
|
ice_for_each_rx_ring(rx_ring, q_vector->rx) {
|
2018-03-20 07:58:14 -07:00
|
|
|
int cleaned;
|
|
|
|
|
2019-11-04 09:38:56 -08:00
|
|
|
/* A dedicated path for zero-copy allows making a single
|
|
|
|
* comparison in the irq context instead of many inside the
|
|
|
|
* ice_clean_rx_irq function and makes the codebase cleaner.
|
|
|
|
*/
|
2021-08-19 13:59:58 +02:00
|
|
|
cleaned = rx_ring->xsk_pool ?
|
|
|
|
ice_clean_rx_irq_zc(rx_ring, budget_per_ring) :
|
|
|
|
ice_clean_rx_irq(rx_ring, budget_per_ring);
|
2018-03-20 07:58:14 -07:00
|
|
|
work_done += cleaned;
|
|
|
|
/* if we clean as many as budgeted, we must not be done */
|
|
|
|
if (cleaned >= budget_per_ring)
|
|
|
|
clean_complete = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If work not completed, return budget and polling will return */
|
2020-11-20 16:39:34 -08:00
|
|
|
if (!clean_complete) {
|
|
|
|
/* Set the writeback on ITR so partial completions of
|
|
|
|
* cache-lines will still continue even if we're polling.
|
|
|
|
*/
|
|
|
|
ice_set_wb_on_itr(q_vector);
|
2018-03-20 07:58:14 -07:00
|
|
|
return budget;
|
2020-11-20 16:39:34 -08:00
|
|
|
}
|
2018-03-20 07:58:14 -07:00
|
|
|
|
2018-11-08 14:55:32 -08:00
|
|
|
/* Exit the polling mode, but don't re-enable interrupts if stack might
|
|
|
|
* poll us due to busy-polling
|
|
|
|
*/
|
2022-01-25 17:04:39 +01:00
|
|
|
if (napi_complete_done(napi, work_done)) {
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to rate limit of 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
ice_net_dim(q_vector);
|
|
|
|
ice_enable_interrupt(q_vector);
|
|
|
|
} else {
|
2019-08-08 07:39:35 -07:00
|
|
|
ice_set_wb_on_itr(q_vector);
|
ice: update dim usage and moderation
The driver was having trouble with unreliable latency when doing single
threaded ping-pong tests. This was root caused to the DIM algorithm
landing on a too slow interrupt value, which caused high latency, and it
was especially present when queues were being switched frequently by the
scheduler as happens on default setups today.
In attempting to improve this, we allow the upper rate limit for
interrupts to move to rate limit of 4 microseconds as a max, which means
that no vector can generate more than 250,000 interrupts per second. The
old config was up to 100,000. The driver previously tried to program the
rate limit too frequently and if the receive and transmit side were both
active on the same vector, the INTRL would be set incorrectly, and this
change fixes that issue as a side effect of the redesign.
This driver will operate from now on with a slightly changed DIM table
with more emphasis towards latency sensitivity by having more table
entries with lower latency than with high latency (high being >= 64
microseconds).
The driver also resets the DIM algorithm state with a new stats set when
there is no work done and the data becomes stale (older than 1 second),
for the respective receive or transmit portion of the interrupt.
Add a new helper for setting rate limit, which will be used more
in a followup patch.
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Gurucharan G <gurucharanx.g@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-09-20 12:30:12 -07:00
|
|
|
}
|
2018-10-26 11:44:43 -07:00
|
|
|
|
2019-02-08 12:50:35 -08:00
|
|
|
return min_t(int, work_done, budget - 1);
|
2018-03-20 07:58:14 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
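/* Illustrative sketch (not part of the driver): the Rx budget split used in
 * ice_napi_poll() above, pulled out into a standalone helper so the integer
 * rounding behavior is easy to see. The helper name is hypothetical.
 */
static inline int example_budget_per_ring(int budget, int num_ring_rx)
{
	/* Integer division truncates, so guard against handing a ring a
	 * budget of 0, which would make the poll routine exit early.
	 */
	int per_ring = budget / num_ring_rx;

	return per_ring > 0 ? per_ring : 1;
}

/* Examples:
 *   example_budget_per_ring(64, 3) == 21  (64 / 3, truncated)
 *   example_budget_per_ring(2, 4)  == 1   (clamped up from 0)
 */
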
/**
 * __ice_maybe_stop_tx - 2nd level check for Tx stop conditions
 * @tx_ring: the ring to be checked
 * @size: the size buffer we want to assure is available
 *
 * Returns -EBUSY if a stop is needed, else 0
 */
static int __ice_maybe_stop_tx(struct ice_tx_ring *tx_ring, unsigned int size)
{
	netif_tx_stop_queue(txring_txq(tx_ring));
	/* Memory barrier before checking head and tail */
	smp_mb();

	/* Check again in a case another CPU has just made room available. */
	if (likely(ICE_DESC_UNUSED(tx_ring) < size))
		return -EBUSY;

	/* A reprieve! - use start_queue because it doesn't call schedule */
	netif_tx_start_queue(txring_txq(tx_ring));
	++tx_ring->ring_stats->tx_stats.restart_q;
	return 0;
}

/**
 * ice_maybe_stop_tx - 1st level check for Tx stop conditions
 * @tx_ring: the ring to be checked
 * @size: the size buffer we want to assure is available
 *
 * Returns 0 if stop is not needed
 */
static int ice_maybe_stop_tx(struct ice_tx_ring *tx_ring, unsigned int size)
{
	if (likely(ICE_DESC_UNUSED(tx_ring) >= size))
		return 0;

	return __ice_maybe_stop_tx(tx_ring, size);
}

/**
 * ice_tx_map - Build the Tx descriptor
 * @tx_ring: ring to send buffer on
 * @first: first buffer info buffer to use
 * @off: pointer to struct that holds offload parameters
 *
 * This function loops over the skb data pointed to by *first
 * and gets a physical address for each memory location and programs
 * it and the length into the transmit descriptor.
 */
static void
ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
	   struct ice_tx_offload_params *off)
{
	u64 td_offset, td_tag, td_cmd;
	u16 i = tx_ring->next_to_use;
	unsigned int data_len, size;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_buf *tx_buf;
	struct sk_buff *skb;
	skb_frag_t *frag;
	dma_addr_t dma;
	bool kick;

	td_tag = off->td_l2tag1;
	td_cmd = off->td_cmd;
	td_offset = off->td_offset;
	skb = first->skb;

	data_len = skb->data_len;
	size = skb_headlen(skb);

	tx_desc = ICE_TX_DESC(tx_ring, i);

	if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) {
		td_cmd |= (u64)ICE_TX_DESC_CMD_IL2TAG1;
		td_tag = first->vid;
	}

	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);

	tx_buf = first;

	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
		unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;

		if (dma_mapping_error(tx_ring->dev, dma))
			goto dma_error;

		/* record length, and DMA address */
		dma_unmap_len_set(tx_buf, len, size);
		dma_unmap_addr_set(tx_buf, dma, dma);

		/* align size to end of page */
		max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1);
		tx_desc->buf_addr = cpu_to_le64(dma);

		/* account for data chunks larger than the hardware
		 * can handle
		 */
		while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
			tx_desc->cmd_type_offset_bsz =
				ice_build_ctob(td_cmd, td_offset, max_data,
					       td_tag);

			tx_desc++;
			i++;

			if (i == tx_ring->count) {
				tx_desc = ICE_TX_DESC(tx_ring, 0);
				i = 0;
			}

			dma += max_data;
			size -= max_data;

			max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
			tx_desc->buf_addr = cpu_to_le64(dma);
		}

		if (likely(!data_len))
			break;

		tx_desc->cmd_type_offset_bsz = ice_build_ctob(td_cmd, td_offset,
							      size, td_tag);

		tx_desc++;
		i++;

		if (i == tx_ring->count) {
			tx_desc = ICE_TX_DESC(tx_ring, 0);
			i = 0;
		}

		size = skb_frag_size(frag);
		data_len -= size;

		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
				       DMA_TO_DEVICE);

		tx_buf = &tx_ring->tx_buf[i];
		tx_buf->type = ICE_TX_BUF_FRAG;
	}

	/* record SW timestamp if HW timestamp is not available */
	skb_tx_timestamp(first->skb);

	i++;
	if (i == tx_ring->count)
		i = 0;

	/* write last descriptor with RS and EOP bits */
	td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD;
	tx_desc->cmd_type_offset_bsz =
			ice_build_ctob(td_cmd, td_offset, size, td_tag);

	/* Force memory writes to complete before letting h/w know there
	 * are new descriptors to fetch.
	 *
	 * We also use this memory barrier to make certain all of the
	 * status bits have been updated before next_to_watch is written.
	 */
	wmb();

	/* set next_to_watch value indicating a packet is present */
	first->next_to_watch = tx_desc;

	tx_ring->next_to_use = i;

	ice_maybe_stop_tx(tx_ring, DESC_NEEDED);

	/* notify HW of packet */
	kick = __netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount,
				      netdev_xmit_more());
	if (kick)
		/* notify HW of packet */
		writel(i, tx_ring->tail);

	return;

dma_error:
	/* clear DMA mappings for failed tx_buf map */
	for (;;) {
		tx_buf = &tx_ring->tx_buf[i];
		ice_unmap_and_free_tx_buf(tx_ring, tx_buf);
		if (tx_buf == first)
			break;
		if (i == 0)
			i = tx_ring->count;
		i--;
	}

	tx_ring->next_to_use = i;
}

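/* Illustrative sketch (not part of the driver): how one DMA-mapped buffer is
 * chopped into Tx descriptors by the loop in ice_tx_map() above. The EX_*
 * constants mirror ICE_MAX_DATA_PER_TXD, ICE_MAX_DATA_PER_TXD_ALIGNED and
 * ICE_MAX_READ_REQ_SIZE; treat the exact values as assumptions made for this
 * example, and the helper name as hypothetical.
 */
#define EX_MAX_DATA_PER_TXD		(16 * 1024 - 1)	/* assumed */
#define EX_MAX_READ_REQ_SIZE		4096		/* assumed */
#define EX_MAX_DATA_PER_TXD_ALIGNED \
	(~(EX_MAX_READ_REQ_SIZE - 1) & EX_MAX_DATA_PER_TXD)

static inline unsigned int example_descs_for_buf(unsigned long long dma,
						 unsigned int size)
{
	unsigned int max_data = EX_MAX_DATA_PER_TXD_ALIGNED;
	unsigned int descs = 1;

	/* The first chunk is stretched so that later chunks start on a
	 * 4K read-request boundary, exactly like the driver loop does.
	 */
	max_data += -dma & (EX_MAX_READ_REQ_SIZE - 1);

	while (size > EX_MAX_DATA_PER_TXD) {
		size -= max_data;
		max_data = EX_MAX_DATA_PER_TXD_ALIGNED;
		descs++;
	}

	return descs;
}

/* Example: a 32 KB buffer mapped at dma 0x1000 (already 4K aligned) is split
 * into three descriptors of 12 KB + 12 KB + 8 KB under this chunking.
 */
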
/**
 * ice_tx_csum - Enable Tx checksum offloads
 * @first: pointer to the first descriptor
 * @off: pointer to struct that holds offload parameters
 *
 * Returns 0 or error (negative) if checksum offload can't happen, 1 otherwise.
 */
static
int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
{
	u32 l4_len = 0, l3_len = 0, l2_len = 0;
	struct sk_buff *skb = first->skb;
	union {
		struct iphdr *v4;
		struct ipv6hdr *v6;
		unsigned char *hdr;
	} ip;
	union {
		struct tcphdr *tcp;
		unsigned char *hdr;
	} l4;
	__be16 frag_off, protocol;
	unsigned char *exthdr;
	u32 offset, cmd = 0;
	u8 l4_proto = 0;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	protocol = vlan_get_protocol(skb);

	if (eth_p_mpls(protocol)) {
		ip.hdr = skb_inner_network_header(skb);
		l4.hdr = skb_checksum_start(skb);
	} else {
		ip.hdr = skb_network_header(skb);
		l4.hdr = skb_transport_header(skb);
	}

	/* compute outer L2 header size */
	l2_len = ip.hdr - skb->data;
	offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S;

	/* set the tx_flags to indicate the IP protocol type. this is
	 * required so that checksum header computation below is accurate.
	 */
	if (ip.v4->version == 4)
		first->tx_flags |= ICE_TX_FLAGS_IPV4;
	else if (ip.v6->version == 6)
		first->tx_flags |= ICE_TX_FLAGS_IPV6;

	if (skb->encapsulation) {
		bool gso_ena = false;
		u32 tunnel = 0;

		/* define outer network header type */
		if (first->tx_flags & ICE_TX_FLAGS_IPV4) {
			tunnel |= (first->tx_flags & ICE_TX_FLAGS_TSO) ?
				  ICE_TX_CTX_EIPT_IPV4 :
				  ICE_TX_CTX_EIPT_IPV4_NO_CSUM;
			l4_proto = ip.v4->protocol;
		} else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
			int ret;

			tunnel |= ICE_TX_CTX_EIPT_IPV6;
			exthdr = ip.hdr + sizeof(*ip.v6);
			l4_proto = ip.v6->nexthdr;
			ret = ipv6_skip_exthdr(skb, exthdr - skb->data,
					       &l4_proto, &frag_off);
			if (ret < 0)
				return -1;
		}

		/* define outer transport */
		switch (l4_proto) {
		case IPPROTO_UDP:
			tunnel |= ICE_TXD_CTX_UDP_TUNNELING;
			first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
			break;
		case IPPROTO_GRE:
			tunnel |= ICE_TXD_CTX_GRE_TUNNELING;
			first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
			break;
		case IPPROTO_IPIP:
		case IPPROTO_IPV6:
			first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
			l4.hdr = skb_inner_network_header(skb);
			break;
		default:
			if (first->tx_flags & ICE_TX_FLAGS_TSO)
				return -1;

			skb_checksum_help(skb);
			return 0;
		}

		/* compute outer L3 header size */
		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
			  ICE_TXD_CTX_QW0_EIPLEN_S;

		/* switch IP header pointer from outer to inner header */
		ip.hdr = skb_inner_network_header(skb);

		/* compute tunnel header size */
		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
			  ICE_TXD_CTX_QW0_NATLEN_S;

		gso_ena = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL;
		/* indicate if we need to offload outer UDP header */
		if ((first->tx_flags & ICE_TX_FLAGS_TSO) && !gso_ena &&
		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
			tunnel |= ICE_TXD_CTX_QW0_L4T_CS_M;

		/* record tunnel offload values */
		off->cd_tunnel_params |= tunnel;

		/* set DTYP=1 to indicate that it's a Tx context descriptor
		 * in IPsec tunnel mode with Tx offloads in Quad word 1
		 */
		off->cd_qw1 |= (u64)ICE_TX_DESC_DTYPE_CTX;

		/* switch L4 header pointer from outer to inner */
		l4.hdr = skb_inner_transport_header(skb);
		l4_proto = 0;

		/* reset type as we transition from outer to inner headers */
		first->tx_flags &= ~(ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6);
		if (ip.v4->version == 4)
			first->tx_flags |= ICE_TX_FLAGS_IPV4;
		if (ip.v6->version == 6)
			first->tx_flags |= ICE_TX_FLAGS_IPV6;
	}

	/* Enable IP checksum offloads */
	if (first->tx_flags & ICE_TX_FLAGS_IPV4) {
		l4_proto = ip.v4->protocol;
		/* the stack computes the IP header already, the only time we
		 * need the hardware to recompute it is in the case of TSO.
		 */
		if (first->tx_flags & ICE_TX_FLAGS_TSO)
			cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM;
		else
			cmd |= ICE_TX_DESC_CMD_IIPT_IPV4;

	} else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
		cmd |= ICE_TX_DESC_CMD_IIPT_IPV6;
		exthdr = ip.hdr + sizeof(*ip.v6);
		l4_proto = ip.v6->nexthdr;
		if (l4.hdr != exthdr)
			ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto,
					 &frag_off);
	} else {
		return -1;
	}

	/* compute inner L3 header size */
	l3_len = l4.hdr - ip.hdr;
	offset |= (l3_len / 4) << ICE_TX_DESC_LEN_IPLEN_S;

	/* Enable L4 checksum offloads */
	switch (l4_proto) {
	case IPPROTO_TCP:
		/* enable checksum offloads */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP;
		l4_len = l4.tcp->doff;
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;
	case IPPROTO_UDP:
		/* enable UDP checksum offload */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP;
		l4_len = (sizeof(struct udphdr) >> 2);
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;
	case IPPROTO_SCTP:
		/* enable SCTP checksum offload */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP;
		l4_len = sizeof(struct sctphdr) >> 2;
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;

	default:
		if (first->tx_flags & ICE_TX_FLAGS_TSO)
			return -1;
		skb_checksum_help(skb);
		return 0;
	}

	off->td_cmd |= cmd;
	off->td_offset |= offset;
	return 1;
}

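/* Illustrative sketch (not part of the driver): how the td_offset word built
 * by ice_tx_csum() packs the header lengths. MACLEN is counted in 2-byte
 * words, IPLEN and L4LEN in 4-byte words. The EX_* shift values mirror
 * ICE_TX_DESC_LEN_{MACLEN,IPLEN,L4_LEN}_S but are assumptions for this
 * example, as is the helper name; the helper takes byte lengths, whereas the
 * driver already has l4.tcp->doff in 4-byte words.
 */
#define EX_LEN_MACLEN_S	0	/* assumed */
#define EX_LEN_IPLEN_S	7	/* assumed */
#define EX_LEN_L4_LEN_S	14	/* assumed */

static inline unsigned int example_td_offset(unsigned int l2_len,
					     unsigned int l3_len,
					     unsigned int l4_len)
{
	return ((l2_len / 2) << EX_LEN_MACLEN_S) |
	       ((l3_len / 4) << EX_LEN_IPLEN_S) |
	       ((l4_len / 4) << EX_LEN_L4_LEN_S);
}

/* Example: a plain TCP/IPv4 frame with a 14-byte Ethernet header, a 20-byte
 * IP header and a 20-byte TCP header packs as MACLEN = 7, IPLEN = 5,
 * L4LEN = 5 in their respective fields.
 */
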
/**
 * ice_tx_prepare_vlan_flags - prepare generic Tx VLAN tagging flags for HW
 * @tx_ring: ring to send buffer on
 * @first: pointer to struct ice_tx_buf
 *
 * Checks the skb and sets up the corresponding generic transmit flags
 * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
 */
static void
ice_tx_prepare_vlan_flags(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first)
{
	struct sk_buff *skb = first->skb;

	/* nothing left to do, software offloaded VLAN */
	if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol))
		return;

	/* the VLAN ethertype/tpid is determined by VSI configuration and netdev
	 * feature flags, which the driver only allows either 802.1Q or 802.1ad
	 * VLAN offloads exclusively so we only care about the VLAN ID here
	 */
	if (skb_vlan_tag_present(skb)) {
		first->vid = skb_vlan_tag_get(skb);
		if (tx_ring->flags & ICE_TX_FLAGS_RING_VLAN_L2TAG2)
			first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
		else
			first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
	}

	ice_tx_prepare_vlan_flags_dcb(tx_ring, first);
}

/**
 * ice_tso - computes mss and TSO length to prepare for TSO
 * @first: pointer to struct ice_tx_buf
 * @off: pointer to struct that holds offload parameters
 *
 * Returns 0 or error (negative) if TSO can't happen, 1 otherwise.
 */
static
int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
{
	struct sk_buff *skb = first->skb;
	union {
		struct iphdr *v4;
		struct ipv6hdr *v6;
		unsigned char *hdr;
	} ip;
	union {
		struct tcphdr *tcp;
		struct udphdr *udp;
		unsigned char *hdr;
	} l4;
	u64 cd_mss, cd_tso_len;
	__be16 protocol;
	u32 paylen;
	u8 l4_start;
	int err;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	if (!skb_is_gso(skb))
		return 0;

	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	protocol = vlan_get_protocol(skb);

	if (eth_p_mpls(protocol))
		ip.hdr = skb_inner_network_header(skb);
	else
		ip.hdr = skb_network_header(skb);
	l4.hdr = skb_checksum_start(skb);

	/* initialize outer IP header fields */
	if (ip.v4->version == 4) {
		ip.v4->tot_len = 0;
		ip.v4->check = 0;
	} else {
		ip.v6->payload_len = 0;
	}

	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
					 SKB_GSO_GRE_CSUM |
					 SKB_GSO_IPXIP4 |
					 SKB_GSO_IPXIP6 |
					 SKB_GSO_UDP_TUNNEL |
					 SKB_GSO_UDP_TUNNEL_CSUM)) {
		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
			l4.udp->len = 0;

			/* determine offset of outer transport header */
			l4_start = (u8)(l4.hdr - skb->data);

			/* remove payload length from outer checksum */
			paylen = skb->len - l4_start;
			csum_replace_by_diff(&l4.udp->check,
					     (__force __wsum)htonl(paylen));
		}

		/* reset pointers to inner headers */
		ip.hdr = skb_inner_network_header(skb);
		l4.hdr = skb_inner_transport_header(skb);

		/* initialize inner IP header fields */
		if (ip.v4->version == 4) {
			ip.v4->tot_len = 0;
			ip.v4->check = 0;
		} else {
			ip.v6->payload_len = 0;
		}
	}

	/* determine offset of transport header */
	l4_start = (u8)(l4.hdr - skb->data);

	/* remove payload length from checksum */
	paylen = skb->len - l4_start;

	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
		csum_replace_by_diff(&l4.udp->check,
				     (__force __wsum)htonl(paylen));
		/* compute length of UDP segmentation header */
		off->header_len = (u8)sizeof(l4.udp) + l4_start;
	} else {
		csum_replace_by_diff(&l4.tcp->check,
				     (__force __wsum)htonl(paylen));
		/* compute length of TCP segmentation header */
		off->header_len = (u8)((l4.tcp->doff * 4) + l4_start);
	}

	/* update gso_segs and bytecount */
	first->gso_segs = skb_shinfo(skb)->gso_segs;
	first->bytecount += (first->gso_segs - 1) * off->header_len;

	cd_tso_len = skb->len - off->header_len;
	cd_mss = skb_shinfo(skb)->gso_size;

	/* record cdesc_qw1 with TSO parameters */
	off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
			     (ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S) |
			     (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) |
			     (cd_mss << ICE_TXD_CTX_QW1_MSS_S));
	first->tx_flags |= ICE_TX_FLAGS_TSO;
	return 1;
}

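/* Illustrative sketch (not part of the driver): the byte accounting done at
 * the end of ice_tso() above. Every transmitted segment repeats the headers,
 * so the on-wire byte count grows by header_len for each segment after the
 * first, while the context descriptor only carries the payload length.
 * The helper name is hypothetical.
 */
static inline unsigned int example_tso_wire_bytes(unsigned int skb_len,
						  unsigned int header_len,
						  unsigned int gso_segs)
{
	return skb_len + (gso_segs - 1) * header_len;
}

/* Example: a 30,054-byte skb with 54 bytes of headers and gso_size 1500
 * yields 20 segments, so 30054 + 19 * 54 = 31080 bytes hit the wire, while
 * cd_tso_len is 30054 - 54 = 30000.
 */
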
/**
 * ice_txd_use_count - estimate the number of descriptors needed for Tx
 * @size: transmit request size in bytes
 *
 * Due to hardware alignment restrictions (4K alignment), we need to
 * assume that we can have no more than 12K of data per descriptor, even
 * though each descriptor can take up to 16K - 1 bytes of aligned memory.
 * Thus, we need to divide by 12K. But division is slow! Instead,
 * we decompose the operation into shifts and one relatively cheap
 * multiply operation.
 *
 * To divide by 12K, we first divide by 4K, then divide by 3:
 *     To divide by 4K, shift right by 12 bits
 *     To divide by 3, multiply by 85, then divide by 256
 *     (Divide by 256 is done by shifting right by 8 bits)
 * Finally, we add one to round up. Because 256 isn't an exact multiple of
 * 3, we'll underestimate near each multiple of 12K. This is actually more
 * accurate as we have 4K - 1 of wiggle room that we can fit into the last
 * segment. For our purposes this is accurate out to 1M which is orders of
 * magnitude greater than our largest possible GSO size.
 *
 * This would then be implemented as:
 *     return (((size >> 12) * 85) >> 8) + ICE_DESCS_FOR_SKB_DATA_PTR;
 *
 * Since multiplication and division are commutative, we can reorder
 * operations into:
 *     return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
 */
static unsigned int ice_txd_use_count(unsigned int size)
{
	return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
}

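/* Illustrative sketch (not part of the driver): comparing the (size * 85) >> 20
 * approximation used by ice_txd_use_count() above against a straight
 * divide-by-12K round-up. As the comment above explains, the approximation
 * can undercount by one just past multiples of 12K, which the 4K - 1 of slack
 * per descriptor absorbs. The helper names are hypothetical.
 */
static inline unsigned int example_exact_12k_chunks(unsigned int size)
{
	return (size + (12 * 1024 - 1)) / (12 * 1024);
}

static inline unsigned int example_approx_12k_chunks(unsigned int size)
{
	return (size * 85) >> 20;
}

/* Examples (driver count = approx + 1 for the skb data pointer descriptor):
 *   size = 12288 (12K):     exact 1, approx 0, driver count 1
 *   size = 12289 (12K + 1): exact 2, approx 0, driver count 1
 *                           (undercounts, but one descriptor can still hold
 *                            up to 16K - 1 bytes, so the estimate is safe)
 *   size = 65536 (64K):     exact 6, approx 5, driver count 6
 */
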
/**
 * ice_xmit_desc_count - calculate number of Tx descriptors needed
 * @skb: send buffer
 *
 * Returns number of data descriptors needed for this skb.
 */
static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
{
	const skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
	unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
	unsigned int count = 0, size = skb_headlen(skb);

	for (;;) {
		count += ice_txd_use_count(size);

		if (!nr_frags--)
			break;

		size = skb_frag_size(frag++);
	}

	return count;
}

/**
 * __ice_chk_linearize - Check if there are more than 8 buffers per packet
 * @skb: send buffer
 *
 * Note: This HW can't DMA more than 8 buffers to build a packet on the wire
 * and so we need to figure out the cases where we need to linearize the skb.
 *
 * For TSO we need to count the TSO header and segment payload separately.
 * As such we need to check cases where we have 7 fragments or more as we
 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
 * the segment payload in the first descriptor, and another 7 for the
 * fragments.
 */
static bool __ice_chk_linearize(struct sk_buff *skb)
{
	const skb_frag_t *frag, *stale;
	int nr_frags, sum;

	/* no need to check if number of frags is less than 7 */
	nr_frags = skb_shinfo(skb)->nr_frags;
	if (nr_frags < (ICE_MAX_BUF_TXD - 1))
		return false;

	/* We need to walk through the list and validate that each group
	 * of 6 fragments totals at least gso_size.
	 */
	nr_frags -= ICE_MAX_BUF_TXD - 2;
	frag = &skb_shinfo(skb)->frags[0];

	/* Initialize size to the negative value of gso_size minus 1. We
	 * use this as the worst case scenario in which the frag ahead
	 * of us only provides one byte which is why we are limited to 6
	 * descriptors for a single transmit as the header and previous
	 * fragment are already consuming 2 descriptors.
	 */
	sum = 1 - skb_shinfo(skb)->gso_size;

	/* Add size of frags 0 through 4 to create our initial sum */
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);

	/* Walk through fragments adding latest fragment, testing it, and
	 * then removing stale fragments from the sum.
	 */
	for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
		int stale_size = skb_frag_size(stale);

		sum += skb_frag_size(frag++);

		/* The stale fragment may present us with a smaller
		 * descriptor than the actual fragment size. To account
		 * for that we need to remove all the data on the front and
		 * figure out what the remainder would be in the last
		 * descriptor associated with the fragment.
		 */
		if (stale_size > ICE_MAX_DATA_PER_TXD) {
			int align_pad = -(skb_frag_off(stale)) &
					(ICE_MAX_READ_REQ_SIZE - 1);

			sum -= align_pad;
			stale_size -= align_pad;

			do {
				sum -= ICE_MAX_DATA_PER_TXD_ALIGNED;
				stale_size -= ICE_MAX_DATA_PER_TXD_ALIGNED;
			} while (stale_size > ICE_MAX_DATA_PER_TXD);
		}

		/* if sum is negative we failed to make sufficient progress */
		if (sum < 0)
			return true;

		if (!nr_frags--)
			break;

		sum -= stale_size;
	}

	return false;
}

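The sliding-window arithmetic above is easier to follow with concrete numbers. Below is a stripped-down sketch of the same six-fragment check (illustration only, not driver code: it uses plain ints, hard-codes the 7-fragment threshold, and ignores the 12K-per-descriptor splitting the real loop performs):

/* Minimal sketch of the six-fragment window test. */
static bool six_frag_window_too_small(const int *frag_sz, int nr_frags,
				      int gso_size)
{
	int sum, i;

	/* fewer than 7 frags always fits within the 8-buffer limit */
	if (nr_frags < 7)
		return false;

	/* same bias as the driver: each 6-frag window must cover gso_size */
	sum = 1 - gso_size;
	for (i = 0; i < 5; i++)
		sum += frag_sz[i];

	for (i = 5; i < nr_frags; i++) {
		sum += frag_sz[i];	/* newest fragment enters the window */
		if (sum < 0)		/* window smaller than one TSO segment */
			return true;
		sum -= frag_sz[i - 5];	/* oldest fragment leaves the window */
	}
	return false;
}

/* e.g. seven 500-byte frags with gso_size = 6000: the first window of
 * six frags sums to 3000 < 6000, so the skb would need linearizing.
 */
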
/**
 * ice_chk_linearize - Check if there are more than 8 fragments per packet
 * @skb: send buffer
 * @count: number of buffers used
 *
 * Note: Our HW can't scatter-gather more than 8 fragments to build
 * a packet on the wire and so we need to figure out the cases where we
 * need to linearize the skb.
 */
static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
{
	/* Both TSO and single send will work if count is less than 8 */
	if (likely(count < ICE_MAX_BUF_TXD))
		return false;

	if (skb_is_gso(skb))
		return __ice_chk_linearize(skb);

	/* we can support up to 8 data buffers for a single send */
	return count != ICE_MAX_BUF_TXD;
}

ice: enable transmit timestamps for E810 devices
Add support for enabling Tx timestamp requests for outgoing packets on
E810 devices.
The ice hardware can support multiple outstanding Tx timestamp requests.
When sending a descriptor to hardware, a Tx timestamp request is made by
setting a request bit, and assigning an index that represents which Tx
timestamp index to store the timestamp in.
Hardware makes no effort to synchronize the index use, so it is up to
software to ensure that Tx timestamp indexes are not re-used before the
timestamp is reported back.
To do this, introduce a Tx timestamp tracker which will keep track of
currently in-use indexes.
In the hot path, if a packet has a timestamp request, an index will be
requested from the tracker. Unfortunately, this does require a lock as
the indexes are shared across all queues on a PHY. There are not enough
indexes to reliably assign only 1 to each queue.
For the E810 devices, the timestamp indexes are not shared across PHYs,
so each port can have its own tracking.
Once hardware captures a timestamp, an interrupt is fired. In this
interrupt, trigger a new work item that will figure out which timestamp
was completed, and report the timestamp back to the stack.
This function loops through the Tx timestamp indexes and checks whether
there is now a valid timestamp. If so, it clears the PHY timestamp
indication in the PHY memory, locks and removes the SKB and bit in the
tracker, then reports the timestamp to the stack.
It is possible in some cases that a timestamp request will be initiated
but never completed. This might occur if the packet is dropped by
software or hardware before it reaches the PHY.
Add a task to the periodic work function that will check whether
a timestamp request is more than a few seconds old. If so, the timestamp
index is cleared in the PHY, and the SKB is released.
Just as with Rx timestamps, the Tx timestamps are only 40 bits wide, and
use the same overall logic for extending to 64 bits of nanoseconds.
With this change, E810 devices should be able to perform basic PTP
functionality.
Future changes will extend the support to cover the E822-based devices.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
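For context on what ultimately sets SKBTX_HW_TSTAMP on an skb checked by ice_tstamp() below: the request originates from a socket opted in to hardware TX timestamping, per the kernel's timestamping documentation. A minimal userspace sketch (assumptions: the interface has already been placed in HWTSTAMP_TX_ON mode via the SIOCSHWTSTAMP ioctl, and error handling is omitted):

#include <linux/net_tstamp.h>
#include <sys/socket.h>

/* Opt a socket in to hardware TX timestamps; completed timestamps are
 * later read back from the socket error queue (MSG_ERRQUEUE).
 */
static int enable_hw_tx_timestamps(int sock_fd)
{
	int flags = SOF_TIMESTAMPING_TX_HARDWARE |
		    SOF_TIMESTAMPING_RAW_HARDWARE;

	return setsockopt(sock_fd, SOL_SOCKET, SO_TIMESTAMPING,
			  &flags, sizeof(flags));
}
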
/**
 * ice_tstamp - set up context descriptor for hardware timestamp
 * @tx_ring: pointer to the Tx ring to send buffer on
 * @skb: pointer to the SKB we're sending
 * @first: Tx buffer
 * @off: Tx offload parameters
 */
static void
ice_tstamp(struct ice_tx_ring *tx_ring, struct sk_buff *skb,
	   struct ice_tx_buf *first, struct ice_tx_offload_params *off)
{
	s8 idx;

	/* only timestamp the outbound packet if the user has requested it */
	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
		return;

	/* Tx timestamps cannot be sampled when doing TSO */
	if (first->tx_flags & ICE_TX_FLAGS_TSO)
		return;

	/* Grab an open timestamp slot */
	idx = ice_ptp_request_ts(tx_ring->tx_tstamps, skb);
	if (idx < 0) {
		tx_ring->vsi->back->ptp.tx_hwtstamp_skipped++;
		return;
	}

	off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
			     (ICE_TX_CTX_DESC_TSYN << ICE_TXD_CTX_QW1_CMD_S) |
			     ((u64)idx << ICE_TXD_CTX_QW1_TSO_LEN_S));
	first->tx_flags |= ICE_TX_FLAGS_TSYN;
}

/**
 * ice_xmit_frame_ring - Sends buffer on Tx ring
 * @skb: send buffer
 * @tx_ring: ring to send buffer on
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 */
static netdev_tx_t
ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
{
	struct ice_tx_offload_params offload = { 0 };
	struct ice_vsi *vsi = tx_ring->vsi;
	struct ice_tx_buf *first;
	struct ethhdr *eth;
	unsigned int count;
	int tso, csum;

	ice_trace(xmit_frame_ring, tx_ring, skb);

	if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
		goto out_drop;

	count = ice_xmit_desc_count(skb);
	if (ice_chk_linearize(skb, count)) {
		if (__skb_linearize(skb))
			goto out_drop;
		count = ice_txd_use_count(skb->len);
		tx_ring->ring_stats->tx_stats.tx_linearize++;
	}

	/* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD,
	 *       + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD,
	 *       + 4 desc gap to avoid the cache line where head is,
	 *       + 1 desc for context descriptor,
	 * otherwise try next time
	 */
	if (ice_maybe_stop_tx(tx_ring, count + ICE_DESCS_PER_CACHE_LINE +
			      ICE_DESCS_FOR_CTX_DESC)) {
		tx_ring->ring_stats->tx_stats.tx_busy++;
		return NETDEV_TX_BUSY;
	}

	/* prefetch for bql data which is infrequently used */
	netdev_txq_bql_enqueue_prefetchw(txring_txq(tx_ring));

	offload.tx_ring = tx_ring;

	/* record the location of the first descriptor for this packet */
	first = &tx_ring->tx_buf[tx_ring->next_to_use];
	first->skb = skb;
	first->type = ICE_TX_BUF_SKB;
	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
	first->gso_segs = 1;
	first->tx_flags = 0;

	/* prepare the VLAN tagging flags for Tx */
	ice_tx_prepare_vlan_flags(tx_ring, first);
	if (first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) {
		offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
					(ICE_TX_CTX_DESC_IL2TAG2 <<
					 ICE_TXD_CTX_QW1_CMD_S));
		offload.cd_l2tag2 = first->vid;
	}

	/* set up TSO offload */
	tso = ice_tso(first, &offload);
	if (tso < 0)
		goto out_drop;

	/* always set up Tx checksum offload */
	csum = ice_tx_csum(first, &offload);
	if (csum < 0)
		goto out_drop;

	/* allow CONTROL frames egress from main VSI if FW LLDP disabled */
	eth = (struct ethhdr *)skb_mac_header(skb);
	if (unlikely((skb->priority == TC_PRIO_CONTROL ||
		      eth->h_proto == htons(ETH_P_LLDP)) &&
		     vsi->type == ICE_VSI_PF &&
		     vsi->port_info->qos_cfg.is_sw_lldp))
		offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
					ICE_TX_CTX_DESC_SWTCH_UPLINK <<
					ICE_TXD_CTX_QW1_CMD_S);

	ice_tstamp(tx_ring, skb, first, &offload);

	if (ice_is_switchdev_running(vsi->back))
		ice_eswitch_set_target_vsi(skb, &offload);

	if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) {
		struct ice_tx_ctx_desc *cdesc;
		u16 i = tx_ring->next_to_use;

		/* grab the next descriptor */
		cdesc = ICE_TX_CTX_DESC(tx_ring, i);
		i++;
		tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

		/* setup context descriptor */
		cdesc->tunneling_params = cpu_to_le32(offload.cd_tunnel_params);
		cdesc->l2tag2 = cpu_to_le16(offload.cd_l2tag2);
		cdesc->rsvd = cpu_to_le16(0);
		cdesc->qw1 = cpu_to_le64(offload.cd_qw1);
	}

	ice_tx_map(tx_ring, first, &offload);
	return NETDEV_TX_OK;

out_drop:
	ice_trace(xmit_frame_ring_drop, tx_ring, skb);
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}

/**
 * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
 * @skb: send buffer
 * @netdev: network interface device structure
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 */
netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
{
	struct ice_netdev_priv *np = netdev_priv(netdev);
	struct ice_vsi *vsi = np->vsi;
	struct ice_tx_ring *tx_ring;

	tx_ring = vsi->tx_rings[skb->queue_mapping];

	/* hardware can't handle really short frames, hardware padding works
	 * beyond this point
	 */
	if (skb_put_padto(skb, ICE_MIN_TX_LEN))
		return NETDEV_TX_OK;

	return ice_xmit_frame_ring(skb, tx_ring);
}

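ice_start_xmit() above is the driver's transmit entry point. It is wired up as the netdev .ndo_start_xmit callback; the real net_device_ops table lives elsewhere in the driver (ice_main.c), so the snippet below is an illustrative sketch only, not the driver's actual table (ice_select_queue is defined just below):

/* illustrative wiring only -- not the driver's actual ops table */
static const struct net_device_ops example_ice_netdev_ops = {
	.ndo_start_xmit = ice_start_xmit,
	.ndo_select_queue = ice_select_queue,
	/* ... remaining callbacks elided ... */
};
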
/**
 * ice_get_dscp_up - return the UP/TC value for a SKB
 * @dcbcfg: DCB config that contains DSCP to UP/TC mapping
 * @skb: SKB to query for info to determine UP/TC
 *
 * This function is to only be called when the PF is in L3 DSCP PFC mode
 */
static u8 ice_get_dscp_up(struct ice_dcbx_cfg *dcbcfg, struct sk_buff *skb)
{
	u8 dscp = 0;

	if (skb->protocol == htons(ETH_P_IP))
		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
	else if (skb->protocol == htons(ETH_P_IPV6))
		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;

	return dcbcfg->dscp_map[dscp];
}

u16
ice_select_queue(struct net_device *netdev, struct sk_buff *skb,
		 struct net_device *sb_dev)
{
	struct ice_pf *pf = ice_netdev_to_pf(netdev);
	struct ice_dcbx_cfg *dcbcfg;

	dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
	if (dcbcfg->pfc_mode == ICE_QOS_MODE_DSCP)
		skb->priority = ice_get_dscp_up(dcbcfg, skb);

	return netdev_pick_tx(netdev, skb, sb_dev);
}

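To make the DSCP lookup above concrete (illustration only; the resulting UP/TC depends entirely on the DCB configuration in dscp_map):

/* e.g. an IPv4 packet with dsfield/TOS 0xb8 carries DSCP 46 (EF):
 *   dscp = 0xb8 >> 2 = 46
 * so skb->priority becomes dcbcfg->dscp_map[46], i.e. whatever UP/TC
 * the DCB config maps EF traffic to.
 */
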
/**
 * ice_clean_ctrl_tx_irq - interrupt handler for flow director Tx queue
 * @tx_ring: tx_ring to clean
 */
void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
{
	struct ice_vsi *vsi = tx_ring->vsi;
	s16 i = tx_ring->next_to_clean;
	int budget = ICE_DFLT_IRQ_WORK;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_buf *tx_buf;

	tx_buf = &tx_ring->tx_buf[i];
	tx_desc = ICE_TX_DESC(tx_ring, i);
	i -= tx_ring->count;

	do {
		struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no pending work */
		if (!eop_desc)
			break;

		/* prevent any other reads prior to eop_desc */
		smp_rmb();

		/* if the descriptor isn't done, no work to do */
		if (!(eop_desc->cmd_type_offset_bsz &
		      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;
		tx_desc->buf_addr = 0;
		tx_desc->cmd_type_offset_bsz = 0;

		/* move past filter desc */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_buf;
			tx_desc = ICE_TX_DESC(tx_ring, 0);
		}

		/* unmap the data header */
		if (dma_unmap_len(tx_buf, len))
			dma_unmap_single(tx_ring->dev,
					 dma_unmap_addr(tx_buf, dma),
					 dma_unmap_len(tx_buf, len),
					 DMA_TO_DEVICE);
		if (tx_buf->type == ICE_TX_BUF_DUMMY)
			devm_kfree(tx_ring->dev, tx_buf->raw_buf);

		/* clear next_to_watch to prevent false hangs */
		tx_buf->type = ICE_TX_BUF_EMPTY;
		tx_buf->tx_flags = 0;
		tx_buf->next_to_watch = NULL;
		dma_unmap_len_set(tx_buf, len, 0);
		tx_desc->buf_addr = 0;
		tx_desc->cmd_type_offset_bsz = 0;

		/* move past eop_desc for start of next FD desc */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_buf;
			tx_desc = ICE_TX_DESC(tx_ring, 0);
		}

		budget--;
	} while (likely(budget));

	i += tx_ring->count;
	tx_ring->next_to_clean = i;

	/* re-enable interrupt if needed */
	ice_irq_dynamic_ena(&vsi->back->hw, vsi, vsi->q_vectors[0]);
}