linux/drivers/net/ethernet/intel/ice/ice_base.c

1187 lines
31 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019, Intel Corporation. */
#include <net/xdp_sock_drv.h>
#include "ice_base.h"
#include "ice_lib.h"
#include "ice_dcb_lib.h"
ice: rename ice_virtchnl_pf.c to ice_sriov.c The ice_virtchnl_pf.c and ice_virtchnl_pf.h files are where most of the code for implementing Single Root IOV virtualization resides. This code includes support for bringing up and tearing down VFs, hooks into the kernel SR-IOV netdev operations, and for handling virtchnl messages from VFs. In the future, we plan to support Scalable IOV in addition to Single Root IOV as an alternative virtualization scheme. This implementation will re-use some but not all of the code in ice_virtchnl_pf.c To prepare for this future, we want to refactor and split up the code in ice_virtchnl_pf.c into the following scheme: * ice_vf_lib.[ch] Basic VF structures and accessors. This is where scheme-independent code will reside. * ice_virtchnl.[ch] Virtchnl message handling. This is where the bulk of the logic for processing messages from VFs using the virtchnl messaging scheme will reside. This is separated from ice_vf_lib.c because it is distinct and has a bulk of the processing code. * ice_sriov.[ch] Single Root IOV implementation, including initialization and the routines for interacting with SR-IOV based netdev operations. * (future) ice_siov.[ch] Scalable IOV implementation. As a first step, lets assume that all of the code in ice_virtchnl_pf.[ch] is for Single Root IOV. Rename this file to ice_sriov.c and its header to ice_sriov.h Future changes will further split out the code in these files following the plan outlined here. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-22 16:26:49 -08:00
#include "ice_sriov.h"
/**
* __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI
* @qs_cfg: gathered variables needed for PF->VSI queues assignment
*
* Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap
*/
static int __ice_vsi_get_qs_contig(struct ice_qs_cfg *qs_cfg)
{
unsigned int offset, i;
mutex_lock(qs_cfg->qs_mutex);
offset = bitmap_find_next_zero_area(qs_cfg->pf_map, qs_cfg->pf_map_size,
0, qs_cfg->q_count, 0);
if (offset >= qs_cfg->pf_map_size) {
mutex_unlock(qs_cfg->qs_mutex);
return -ENOMEM;
}
bitmap_set(qs_cfg->pf_map, offset, qs_cfg->q_count);
for (i = 0; i < qs_cfg->q_count; i++)
qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = (u16)(i + offset);
mutex_unlock(qs_cfg->qs_mutex);
return 0;
}
/**
* __ice_vsi_get_qs_sc - Assign a scattered queues from PF to VSI
* @qs_cfg: gathered variables needed for pf->vsi queues assignment
*
* Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap
*/
static int __ice_vsi_get_qs_sc(struct ice_qs_cfg *qs_cfg)
{
unsigned int i, index = 0;
mutex_lock(qs_cfg->qs_mutex);
for (i = 0; i < qs_cfg->q_count; i++) {
index = find_next_zero_bit(qs_cfg->pf_map,
qs_cfg->pf_map_size, index);
if (index >= qs_cfg->pf_map_size)
goto err_scatter;
set_bit(index, qs_cfg->pf_map);
qs_cfg->vsi_map[i + qs_cfg->vsi_map_offset] = (u16)index;
}
mutex_unlock(qs_cfg->qs_mutex);
return 0;
err_scatter:
for (index = 0; index < i; index++) {
clear_bit(qs_cfg->vsi_map[index], qs_cfg->pf_map);
qs_cfg->vsi_map[index + qs_cfg->vsi_map_offset] = 0;
}
mutex_unlock(qs_cfg->qs_mutex);
return -ENOMEM;
}
/**
* ice_pf_rxq_wait - Wait for a PF's Rx queue to be enabled or disabled
* @pf: the PF being configured
* @pf_q: the PF queue
* @ena: enable or disable state of the queue
*
* This routine will wait for the given Rx queue of the PF to reach the
* enabled or disabled state.
* Returns -ETIMEDOUT in case of failing to reach the requested state after
* multiple retries; else will return 0 in case of success.
*/
static int ice_pf_rxq_wait(struct ice_pf *pf, int pf_q, bool ena)
{
int i;
for (i = 0; i < ICE_Q_WAIT_MAX_RETRY; i++) {
if (ena == !!(rd32(&pf->hw, QRX_CTRL(pf_q)) &
QRX_CTRL_QENA_STAT_M))
return 0;
usleep_range(20, 40);
}
return -ETIMEDOUT;
}
/**
* ice_vsi_alloc_q_vector - Allocate memory for a single interrupt vector
* @vsi: the VSI being configured
* @v_idx: index of the vector in the VSI struct
*
* We allocate one q_vector and set default value for ITR setting associated
* with this q_vector. If allocation fails we return -ENOMEM.
*/
static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, u16 v_idx)
{
struct ice_pf *pf = vsi->back;
struct ice_q_vector *q_vector;
int err;
/* allocate q_vector */
q_vector = kzalloc(sizeof(*q_vector), GFP_KERNEL);
if (!q_vector)
return -ENOMEM;
q_vector->vsi = vsi;
q_vector->v_idx = v_idx;
q_vector->tx.itr_setting = ICE_DFLT_TX_ITR;
q_vector->rx.itr_setting = ICE_DFLT_RX_ITR;
q_vector->tx.itr_mode = ITR_DYNAMIC;
q_vector->rx.itr_mode = ITR_DYNAMIC;
q_vector->tx.type = ICE_TX_CONTAINER;
q_vector->rx.type = ICE_RX_CONTAINER;
q_vector->irq.index = -ENOENT;
if (vsi->type == ICE_VSI_VF) {
ice: store VF relative MSI-X index in q_vector->vf_reg_idx The ice physical function driver needs to configure the association of queues and interrupts on behalf of its virtual functions. This is done over virtchnl by the VF sending messages during its initialization phase. These messages contain a vector_id which the VF wants to associate with a given queue. This ID is relative to the VF space, where 0 indicates the control IRQ for non-queue interrupts. When programming the mapping, the PF driver currently passes this vector_id directly to the low level functions for programming. This works for SR-IOV, because the hardware uses the VF-based indexing for interrupts. This won't work for Scalable IOV, which uses PF-based indexing for programming its VSIs. To handle this, the driver needs to be able to look up the proper index to use for programming. For typical IRQs, this would be the q_vector->reg_idx field. The q_vector->reg_idx can't be set to a VF relative value, because it is used when the PF needs to control the interrupt, such as when triggering a software interrupt on stopping the Tx queue. Thus, introduce a new q_vector->vf_reg_idx which can store the VF relative index for registers which expect this. Use this in ice_cfg_interrupt to look up the VF index from the q_vector. This allows removing the vector ID parameter of ice_cfg_interrupt. Also notice that this function returns an int, but then is cast to the virtchnl error enumeration, virtchnl_status_code. Update the return type to indicate it does not return an integer error code. We can't use normal error codes here because the return values are passed across the virtchnl interface. This will allow the future Scalable IOV VFs to correctly look up the index needed for programming the VF queues without breaking SR-IOV. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Rafal Romanowski <rafal.romanowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2024-03-22 14:44:45 -07:00
ice_calc_vf_reg_idx(vsi->vf, q_vector);
goto out;
} else if (vsi->type == ICE_VSI_CTRL && vsi->vf) {
struct ice_vsi *ctrl_vsi = ice_get_vf_ctrl_vsi(pf, vsi);
if (ctrl_vsi) {
if (unlikely(!ctrl_vsi->q_vectors)) {
err = -ENOENT;
goto err_free_q_vector;
}
q_vector->irq = ctrl_vsi->q_vectors[0]->irq;
goto skip_alloc;
}
}
q_vector->irq = ice_alloc_irq(pf, vsi->irq_dyn_alloc);
if (q_vector->irq.index < 0) {
err = -ENOMEM;
goto err_free_q_vector;
}
skip_alloc:
q_vector->reg_idx = q_vector->irq.index;
ice: store VF relative MSI-X index in q_vector->vf_reg_idx The ice physical function driver needs to configure the association of queues and interrupts on behalf of its virtual functions. This is done over virtchnl by the VF sending messages during its initialization phase. These messages contain a vector_id which the VF wants to associate with a given queue. This ID is relative to the VF space, where 0 indicates the control IRQ for non-queue interrupts. When programming the mapping, the PF driver currently passes this vector_id directly to the low level functions for programming. This works for SR-IOV, because the hardware uses the VF-based indexing for interrupts. This won't work for Scalable IOV, which uses PF-based indexing for programming its VSIs. To handle this, the driver needs to be able to look up the proper index to use for programming. For typical IRQs, this would be the q_vector->reg_idx field. The q_vector->reg_idx can't be set to a VF relative value, because it is used when the PF needs to control the interrupt, such as when triggering a software interrupt on stopping the Tx queue. Thus, introduce a new q_vector->vf_reg_idx which can store the VF relative index for registers which expect this. Use this in ice_cfg_interrupt to look up the VF index from the q_vector. This allows removing the vector ID parameter of ice_cfg_interrupt. Also notice that this function returns an int, but then is cast to the virtchnl error enumeration, virtchnl_status_code. Update the return type to indicate it does not return an integer error code. We can't use normal error codes here because the return values are passed across the virtchnl interface. This will allow the future Scalable IOV VFs to correctly look up the index needed for programming the VF queues without breaking SR-IOV. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Rafal Romanowski <rafal.romanowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2024-03-22 14:44:45 -07:00
q_vector->vf_reg_idx = q_vector->irq.index;
/* only set affinity_mask if the CPU is online */
if (cpu_online(v_idx))
cpumask_set_cpu(v_idx, &q_vector->affinity_mask);
/* This will not be called in the driver load path because the netdev
* will not be created yet. All other cases with register the NAPI
* handler here (i.e. resume, reset/rebuild, etc.)
*/
if (vsi->netdev)
netif_napi_add(vsi->netdev, &q_vector->napi, ice_napi_poll);
out:
/* tie q_vector and VSI together */
vsi->q_vectors[v_idx] = q_vector;
return 0;
err_free_q_vector:
kfree(q_vector);
return err;
}
/**
* ice_free_q_vector - Free memory allocated for a specific interrupt vector
* @vsi: VSI having the memory freed
* @v_idx: index of the vector to be freed
*/
static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx)
{
struct ice_q_vector *q_vector;
struct ice_pf *pf = vsi->back;
struct ice_tx_ring *tx_ring;
struct ice_rx_ring *rx_ring;
struct device *dev;
dev = ice_pf_to_dev(pf);
if (!vsi->q_vectors[v_idx]) {
dev_dbg(dev, "Queue vector at index %d not found\n", v_idx);
return;
}
q_vector = vsi->q_vectors[v_idx];
ice_for_each_tx_ring(tx_ring, q_vector->tx) {
ice_queue_set_napi(vsi, tx_ring->q_index, NETDEV_QUEUE_TYPE_TX,
NULL);
tx_ring->q_vector = NULL;
}
ice_for_each_rx_ring(rx_ring, q_vector->rx) {
ice_queue_set_napi(vsi, rx_ring->q_index, NETDEV_QUEUE_TYPE_RX,
NULL);
rx_ring->q_vector = NULL;
}
/* only VSI with an associated netdev is set up with NAPI */
if (vsi->netdev)
netif_napi_del(&q_vector->napi);
/* release MSIX interrupt if q_vector had interrupt allocated */
if (q_vector->irq.index < 0)
goto free_q_vector;
/* only free last VF ctrl vsi interrupt */
if (vsi->type == ICE_VSI_CTRL && vsi->vf &&
ice_get_vf_ctrl_vsi(pf, vsi))
goto free_q_vector;
ice_free_irq(pf, q_vector->irq);
free_q_vector:
kfree(q_vector);
vsi->q_vectors[v_idx] = NULL;
}
/**
* ice_cfg_itr_gran - set the ITR granularity to 2 usecs if not already set
* @hw: board specific structure
*/
static void ice_cfg_itr_gran(struct ice_hw *hw)
{
u32 regval = rd32(hw, GLINT_CTL);
/* no need to update global register if ITR gran is already set */
if (!(regval & GLINT_CTL_DIS_AUTOMASK_M) &&
(FIELD_GET(GLINT_CTL_ITR_GRAN_200_M, regval) == ICE_ITR_GRAN_US) &&
(FIELD_GET(GLINT_CTL_ITR_GRAN_100_M, regval) == ICE_ITR_GRAN_US) &&
(FIELD_GET(GLINT_CTL_ITR_GRAN_50_M, regval) == ICE_ITR_GRAN_US) &&
(FIELD_GET(GLINT_CTL_ITR_GRAN_25_M, regval) == ICE_ITR_GRAN_US))
return;
regval = FIELD_PREP(GLINT_CTL_ITR_GRAN_200_M, ICE_ITR_GRAN_US) |
FIELD_PREP(GLINT_CTL_ITR_GRAN_100_M, ICE_ITR_GRAN_US) |
FIELD_PREP(GLINT_CTL_ITR_GRAN_50_M, ICE_ITR_GRAN_US) |
FIELD_PREP(GLINT_CTL_ITR_GRAN_25_M, ICE_ITR_GRAN_US);
wr32(hw, GLINT_CTL, regval);
}
/**
* ice_calc_txq_handle - calculate the queue handle
* @vsi: VSI that ring belongs to
* @ring: ring to get the absolute queue index
* @tc: traffic class number
*/
static u16 ice_calc_txq_handle(struct ice_vsi *vsi, struct ice_tx_ring *ring, u8 tc)
{
WARN_ONCE(ice_ring_is_xdp(ring) && tc, "XDP ring can't belong to TC other than 0\n");
if (ring->ch)
return ring->q_index - ring->ch->base_q;
/* Idea here for calculation is that we subtract the number of queue
* count from TC that ring belongs to from it's absolute queue index
* and as a result we get the queue's index within TC.
*/
return ring->q_index - vsi->tc_cfg.tc_info[tc].qoffset;
}
/**
* ice_cfg_xps_tx_ring - Configure XPS for a Tx ring
* @ring: The Tx ring to configure
*
* This enables/disables XPS for a given Tx descriptor ring
* based on the TCs enabled for the VSI that ring belongs to.
*/
static void ice_cfg_xps_tx_ring(struct ice_tx_ring *ring)
{
if (!ring->q_vector || !ring->netdev)
return;
/* We only initialize XPS once, so as not to overwrite user settings */
if (test_and_set_bit(ICE_TX_XPS_INIT_DONE, ring->xps_state))
return;
netif_set_xps_queue(ring->netdev, &ring->q_vector->affinity_mask,
ring->q_index);
}
/**
* ice_setup_tx_ctx - setup a struct ice_tlan_ctx instance
* @ring: The Tx ring to configure
* @tlan_ctx: Pointer to the Tx LAN queue context structure to be initialized
* @pf_q: queue index in the PF space
*
* Configure the Tx descriptor ring in TLAN context.
*/
static void
ice_setup_tx_ctx(struct ice_tx_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q)
{
struct ice_vsi *vsi = ring->vsi;
struct ice_hw *hw = &vsi->back->hw;
tlan_ctx->base = ring->dma >> ICE_TLAN_CTX_BASE_S;
tlan_ctx->port_num = vsi->port_info->lport;
/* Transmit Queue Length */
tlan_ctx->qlen = ring->count;
ice_set_cgd_num(tlan_ctx, ring->dcb_tc);
/* PF number */
tlan_ctx->pf_num = hw->pf_id;
/* queue belongs to a specific VSI type
* VF / VM index should be programmed per vmvf_type setting:
* for vmvf_type = VF, it is VF number between 0-256
* for vmvf_type = VM, it is VM number between 0-767
* for PF or EMP this field should be set to zero
*/
switch (vsi->type) {
case ICE_VSI_LB:
case ICE_VSI_CTRL:
case ICE_VSI_PF:
if (ring->ch)
tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ;
else
tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_PF;
break;
case ICE_VSI_VF:
/* Firmware expects vmvf_num to be absolute VF ID */
ice: store VF pointer instead of VF ID The VSI structure contains a vf_id field used to associate a VSI with a VF. This is used mainly for ICE_VSI_VF as well as partially for ICE_VSI_CTRL associated with the VFs. This API was designed with the idea that VFs are stored in a simple array that was expected to be static throughout most of the driver's life. We plan on refactoring VF storage in a few key ways: 1) converting from a simple static array to a hash table 2) using krefs to track VF references obtained from the hash table 3) use RCU to delay release of VF memory until after all references are dropped This is motivated by the goal to ensure that the lifetime of VF structures is accounted for, and prevent various use-after-free bugs. With the existing vsi->vf_id, the reference tracking for VFs would become somewhat convoluted, because each VSI maintains a vf_id field which will then require performing a look up. This means all these flows will require reference tracking and proper usage of rcu_read_lock, etc. We know that the VF VSI will always be backed by a valid VF structure, because the VSI is created during VF initialization and removed before the VF is destroyed. Rely on this and store a reference to the VF in the VSI structure instead of storing a VF ID. This will simplify the usage and avoid the need to perform lookups on the hash table in the future. For ICE_VSI_VF, it is expected that vsi->vf is always non-NULL after ice_vsi_alloc succeeds. Because of this, use WARN_ON when checking if a vsi->vf pointer is valid when dealing with VF VSIs. This will aid in debugging code which violates this assumption and avoid more disastrous panics. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:29 -08:00
tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf->vf_id;
tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF;
break;
default:
return;
}
/* make sure the context is associated with the right VSI */
if (ring->ch)
tlan_ctx->src_vsi = ring->ch->vsi_num;
else
tlan_ctx->src_vsi = ice_get_hw_vsi_num(hw, vsi->idx);
ice: enable transmit timestamps for E810 devices Add support for enabling Tx timestamp requests for outgoing packets on E810 devices. The ice hardware can support multiple outstanding Tx timestamp requests. When sending a descriptor to hardware, a Tx timestamp request is made by setting a request bit, and assigning an index that represents which Tx timestamp index to store the timestamp in. Hardware makes no effort to synchronize the index use, so it is up to software to ensure that Tx timestamp indexes are not re-used before the timestamp is reported back. To do this, introduce a Tx timestamp tracker which will keep track of currently in-use indexes. In the hot path, if a packet has a timestamp request, an index will be requested from the tracker. Unfortunately, this does require a lock as the indexes are shared across all queues on a PHY. There are not enough indexes to reliably assign only 1 to each queue. For the E810 devices, the timestamp indexes are not shared across PHYs, so each port can have its own tracking. Once hardware captures a timestamp, an interrupt is fired. In this interrupt, trigger a new work item that will figure out which timestamp was completed, and report the timestamp back to the stack. This function loops through the Tx timestamp indexes and checks whether there is now a valid timestamp. If so, it clears the PHY timestamp indication in the PHY memory, locks and removes the SKB and bit in the tracker, then reports the timestamp to the stack. It is possible in some cases that a timestamp request will be initiated but never completed. This might occur if the packet is dropped by software or hardware before it reaches the PHY. Add a task to the periodic work function that will check whether a timestamp request is more than a few seconds old. If so, the timestamp index is cleared in the PHY, and the SKB is released. Just as with Rx timestamps, the Tx timestamps are only 40 bits wide, and use the same overall logic for extending to 64 bits of nanoseconds. With this change, E810 devices should be able to perform basic PTP functionality. Future changes will extend the support to cover the E822-based devices. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Tony Brelinski <tonyx.brelinski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-06-09 09:39:53 -07:00
/* Restrict Tx timestamps to the PF VSI */
switch (vsi->type) {
case ICE_VSI_PF:
tlan_ctx->tsyn_ena = 1;
break;
default:
break;
}
tlan_ctx->tso_ena = ICE_TX_LEGACY;
tlan_ctx->tso_qnum = pf_q;
/* Legacy or Advanced Host Interface:
* 0: Advanced Host Interface
* 1: Legacy Host Interface
*/
tlan_ctx->legacy_int = ICE_TX_LEGACY;
}
/**
* ice_rx_offset - Return expected offset into page to access data
* @rx_ring: Ring we are requesting offset of
*
* Returns the offset value for ring into the data buffer.
*/
static unsigned int ice_rx_offset(struct ice_rx_ring *rx_ring)
{
if (ice_ring_uses_build_skb(rx_ring))
return ICE_SKB_PAD;
return 0;
}
/**
* ice_setup_rx_ctx - Configure a receive ring context
* @ring: The Rx ring to configure
*
* Configure the Rx descriptor ring in RLAN context.
*/
static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
{
struct ice_vsi *vsi = ring->vsi;
u32 rxdid = ICE_RXDID_FLEX_NIC;
struct ice_rlan_ctx rlan_ctx;
struct ice_hw *hw;
u16 pf_q;
int err;
hw = &vsi->back->hw;
/* what is Rx queue number in global space of 2K Rx queues */
pf_q = vsi->rxq_map[ring->q_index];
/* clear the context structure first */
memset(&rlan_ctx, 0, sizeof(rlan_ctx));
/* Receive Queue Base Address.
* Indicates the starting address of the descriptor queue defined in
* 128 Byte units.
*/
rlan_ctx.base = ring->dma >> ICE_RLAN_BASE_S;
rlan_ctx.qlen = ring->count;
/* Receive Packet Data Buffer Size.
* The Packet Data Buffer Size is defined in 128 byte units.
*/
rlan_ctx.dbuf = DIV_ROUND_UP(ring->rx_buf_len,
BIT_ULL(ICE_RLAN_CTX_DBUF_S));
/* use 32 byte descriptors */
rlan_ctx.dsize = 1;
/* Strip the Ethernet CRC bytes before the packet is posted to host
* memory.
*/
rlan_ctx.crcstrip = !(ring->flags & ICE_RX_FLAGS_CRC_STRIP_DIS);
ice: Add hot path support for 802.1Q and 802.1ad VLAN offloads Currently the driver only supports 802.1Q VLAN insertion and stripping. However, once Double VLAN Mode (DVM) is fully supported, then both 802.1Q and 802.1ad VLAN insertion and stripping will be supported. Unfortunately the VSI context parameters only allow for one VLAN ethertype at a time for VLAN offloads so only one or the other VLAN ethertype offload can be supported at once. To support this, multiple changes are needed. Rx path changes: [1] In DVM, the Rx queue context l2tagsel field needs to be cleared so the outermost tag shows up in the l2tag2_2nd field of the Rx flex descriptor. In Single VLAN Mode (SVM), the l2tagsel field should remain 1 to support SVM configurations. [2] Modify the ice_test_staterr() function to take a __le16 instead of the ice_32b_rx_flex_desc union pointer so this function can be used for both rx_desc->wb.status_error0 and rx_desc->wb.status_error1. [3] Add the new inline function ice_get_vlan_tag_from_rx_desc() that checks if there is a VLAN tag in l2tag1 or l2tag2_2nd. [4] In ice_receive_skb(), add a check to see if NETIF_F_HW_VLAN_STAG_RX is enabled in netdev->features. If it is, then this is the VLAN ethertype that needs to be added to the stripping VLAN tag. Since ice_fix_features() prevents CTAG_RX and STAG_RX from being enabled simultaneously, the VLAN ethertype will only ever be 802.1Q or 802.1ad. Tx path changes: [1] In DVM, the VLAN tag needs to be placed in the l2tag2 field of the Tx context descriptor. The new define ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN was added to the list of tx_flags to handle this case. [2] When the stack requests the VLAN tag to be offloaded on Tx, the driver needs to set either ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN or ICE_TX_FLAGS_HW_VLAN, so the tag is inserted in l2tag2 or l2tag1 respectively. To determine which location to use, set a bit in the Tx ring flags field during ring allocation that can be used to determine which field to use in the Tx descriptor. In DVM, always use l2tag2, and in SVM, always use l2tag1. Signed-off-by: Brett Creeley <brett.creeley@intel.com> Tested-by: Gurucharan G <gurucharanx.g@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-12-02 08:38:47 -08:00
/* L2TSEL flag defines the reported L2 Tags in the receive descriptor
* and it needs to remain 1 for non-DVM capable configurations to not
* break backward compatibility for VF drivers. Setting this field to 0
* will cause the single/outer VLAN tag to be stripped to the L2TAG2_2ND
* field in the Rx descriptor. Setting it to 1 allows the VLAN tag to
* be stripped in L2TAG1 of the Rx descriptor, which is where VFs will
* check for the tag
*/
if (ice_is_dvm_ena(hw))
if (vsi->type == ICE_VSI_VF &&
ice: store VF pointer instead of VF ID The VSI structure contains a vf_id field used to associate a VSI with a VF. This is used mainly for ICE_VSI_VF as well as partially for ICE_VSI_CTRL associated with the VFs. This API was designed with the idea that VFs are stored in a simple array that was expected to be static throughout most of the driver's life. We plan on refactoring VF storage in a few key ways: 1) converting from a simple static array to a hash table 2) using krefs to track VF references obtained from the hash table 3) use RCU to delay release of VF memory until after all references are dropped This is motivated by the goal to ensure that the lifetime of VF structures is accounted for, and prevent various use-after-free bugs. With the existing vsi->vf_id, the reference tracking for VFs would become somewhat convoluted, because each VSI maintains a vf_id field which will then require performing a look up. This means all these flows will require reference tracking and proper usage of rcu_read_lock, etc. We know that the VF VSI will always be backed by a valid VF structure, because the VSI is created during VF initialization and removed before the VF is destroyed. Rely on this and store a reference to the VF in the VSI structure instead of storing a VF ID. This will simplify the usage and avoid the need to perform lookups on the hash table in the future. For ICE_VSI_VF, it is expected that vsi->vf is always non-NULL after ice_vsi_alloc succeeds. Because of this, use WARN_ON when checking if a vsi->vf pointer is valid when dealing with VF VSIs. This will aid in debugging code which violates this assumption and avoid more disastrous panics. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:29 -08:00
ice_vf_is_port_vlan_ena(vsi->vf))
ice: Add hot path support for 802.1Q and 802.1ad VLAN offloads Currently the driver only supports 802.1Q VLAN insertion and stripping. However, once Double VLAN Mode (DVM) is fully supported, then both 802.1Q and 802.1ad VLAN insertion and stripping will be supported. Unfortunately the VSI context parameters only allow for one VLAN ethertype at a time for VLAN offloads so only one or the other VLAN ethertype offload can be supported at once. To support this, multiple changes are needed. Rx path changes: [1] In DVM, the Rx queue context l2tagsel field needs to be cleared so the outermost tag shows up in the l2tag2_2nd field of the Rx flex descriptor. In Single VLAN Mode (SVM), the l2tagsel field should remain 1 to support SVM configurations. [2] Modify the ice_test_staterr() function to take a __le16 instead of the ice_32b_rx_flex_desc union pointer so this function can be used for both rx_desc->wb.status_error0 and rx_desc->wb.status_error1. [3] Add the new inline function ice_get_vlan_tag_from_rx_desc() that checks if there is a VLAN tag in l2tag1 or l2tag2_2nd. [4] In ice_receive_skb(), add a check to see if NETIF_F_HW_VLAN_STAG_RX is enabled in netdev->features. If it is, then this is the VLAN ethertype that needs to be added to the stripping VLAN tag. Since ice_fix_features() prevents CTAG_RX and STAG_RX from being enabled simultaneously, the VLAN ethertype will only ever be 802.1Q or 802.1ad. Tx path changes: [1] In DVM, the VLAN tag needs to be placed in the l2tag2 field of the Tx context descriptor. The new define ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN was added to the list of tx_flags to handle this case. [2] When the stack requests the VLAN tag to be offloaded on Tx, the driver needs to set either ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN or ICE_TX_FLAGS_HW_VLAN, so the tag is inserted in l2tag2 or l2tag1 respectively. To determine which location to use, set a bit in the Tx ring flags field during ring allocation that can be used to determine which field to use in the Tx descriptor. In DVM, always use l2tag2, and in SVM, always use l2tag1. Signed-off-by: Brett Creeley <brett.creeley@intel.com> Tested-by: Gurucharan G <gurucharanx.g@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-12-02 08:38:47 -08:00
rlan_ctx.l2tsel = 1;
else
rlan_ctx.l2tsel = 0;
else
rlan_ctx.l2tsel = 1;
rlan_ctx.dtype = ICE_RX_DTYPE_NO_SPLIT;
rlan_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_NO_SPLIT;
rlan_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_NO_SPLIT;
/* This controls whether VLAN is stripped from inner headers
* The VLAN in the inner L2 header is stripped to the receive
* descriptor if enabled by this flag.
*/
rlan_ctx.showiv = 0;
/* Max packet size for this queue - must not be set to a larger value
* than 5 x DBUF
*/
rlan_ctx.rxmax = min_t(u32, vsi->max_frame,
ICE_MAX_CHAINED_RX_BUFS * ring->rx_buf_len);
/* Rx queue threshold in units of 64 */
rlan_ctx.lrxqthresh = 1;
/* PF acts as uplink for switchdev; set flex descriptor with src_vsi
* metadata and flags to allow redirecting to PR netdev
*/
if (ice_is_eswitch_mode_switchdev(vsi->back)) {
ring->flags |= ICE_RX_FLAGS_MULTIDEV;
rxdid = ICE_RXDID_FLEX_NIC_2;
}
/* Enable Flexible Descriptors in the queue context which
* allows this driver to select a specific receive descriptor format
* increasing context priority to pick up profile ID; default is 0x01;
* setting to 0x03 to ensure profile is programming if prev context is
* of same priority
*/
if (vsi->type != ICE_VSI_VF)
ice: enable receive hardware timestamping Add SIOCGHWTSTAMP and SIOCSHWTSTAMP ioctl handlers to respond to requests to enable timestamping support. If the request is for enabling Rx timestamps, set a bit in the Rx descriptors to indicate that receive timestamps should be reported. Hardware captures receive timestamps in the PHY which only captures part of the timer, and reports only 40 bits into the Rx descriptor. The upper 32 bits represent the contents of GLTSYN_TIME_L at the point of packet reception, while the lower 8 bits represent the upper 8 bits of GLTSYN_TIME_0. The networking and PTP stack expect 64 bit timestamps in nanoseconds. To support this, implement some logic to extend the timestamps by using the full PHC time. If the Rx timestamp was captured prior to the PHC time, then the real timestamp is PHC - (lower_32_bits(PHC) - timestamp) If the Rx timestamp was captured after the PHC time, then the real timestamp is PHC + (timestamp - lower_32_bits(PHC)) These calculations are correct as long as neither the PHC timestamp nor the Rx timestamps are more than 2^32-1 nanseconds old. Further, we can detect when the Rx timestamp is before or after the PHC as long as the PHC timestamp is no more than 2^31-1 nanoseconds old. In that case, we calculate the delta between the lower 32 bits of the PHC and the Rx timestamp. If it's larger than 2^31-1 then the Rx timestamp must have been captured in the past. If it's smaller, then the Rx timestamp must have been captured after PHC time. Add an ice_ptp_extend_32b_ts function that relies on a cached copy of the PHC time and implements this algorithm to calculate the proper upper 32bits of the Rx timestamps. Cache the PHC time periodically in all of the Rx rings. This enables each Rx ring to simply call the extension function with a recent copy of the PHC time. By ensuring that the PHC time is kept up to date periodically, we ensure this algorithm doesn't use stale data and produce incorrect results. To cache the time, introduce a kworker and a kwork item to periodically store the Rx time. It might seem like we should use the .do_aux_work interface of the PTP clock. This doesn't work because all PFs must cache this time, but only one PF owns the PTP clock device. Thus, the ice driver will manage its own kthread instead of relying on the PTP do_aux_work handler. With this change, the driver can now report Rx timestamps on all incoming packets. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Tony Brelinski <tonyx.brelinski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-06-09 09:39:52 -07:00
ice_write_qrxflxp_cntxt(hw, pf_q, rxdid, 0x3, true);
else
ice: enable receive hardware timestamping Add SIOCGHWTSTAMP and SIOCSHWTSTAMP ioctl handlers to respond to requests to enable timestamping support. If the request is for enabling Rx timestamps, set a bit in the Rx descriptors to indicate that receive timestamps should be reported. Hardware captures receive timestamps in the PHY which only captures part of the timer, and reports only 40 bits into the Rx descriptor. The upper 32 bits represent the contents of GLTSYN_TIME_L at the point of packet reception, while the lower 8 bits represent the upper 8 bits of GLTSYN_TIME_0. The networking and PTP stack expect 64 bit timestamps in nanoseconds. To support this, implement some logic to extend the timestamps by using the full PHC time. If the Rx timestamp was captured prior to the PHC time, then the real timestamp is PHC - (lower_32_bits(PHC) - timestamp) If the Rx timestamp was captured after the PHC time, then the real timestamp is PHC + (timestamp - lower_32_bits(PHC)) These calculations are correct as long as neither the PHC timestamp nor the Rx timestamps are more than 2^32-1 nanseconds old. Further, we can detect when the Rx timestamp is before or after the PHC as long as the PHC timestamp is no more than 2^31-1 nanoseconds old. In that case, we calculate the delta between the lower 32 bits of the PHC and the Rx timestamp. If it's larger than 2^31-1 then the Rx timestamp must have been captured in the past. If it's smaller, then the Rx timestamp must have been captured after PHC time. Add an ice_ptp_extend_32b_ts function that relies on a cached copy of the PHC time and implements this algorithm to calculate the proper upper 32bits of the Rx timestamps. Cache the PHC time periodically in all of the Rx rings. This enables each Rx ring to simply call the extension function with a recent copy of the PHC time. By ensuring that the PHC time is kept up to date periodically, we ensure this algorithm doesn't use stale data and produce incorrect results. To cache the time, introduce a kworker and a kwork item to periodically store the Rx time. It might seem like we should use the .do_aux_work interface of the PTP clock. This doesn't work because all PFs must cache this time, but only one PF owns the PTP clock device. Thus, the ice driver will manage its own kthread instead of relying on the PTP do_aux_work handler. With this change, the driver can now report Rx timestamps on all incoming packets. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Tony Brelinski <tonyx.brelinski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-06-09 09:39:52 -07:00
ice_write_qrxflxp_cntxt(hw, pf_q, ICE_RXDID_LEGACY_1, 0x3,
false);
/* Absolute queue number out of 2K needs to be passed */
err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q);
if (err) {
dev_err(ice_pf_to_dev(vsi->back), "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n",
pf_q, err);
return -EIO;
}
if (vsi->type == ICE_VSI_VF)
return 0;
/* configure Rx buffer alignment */
if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags))
ice_clear_ring_build_skb_ena(ring);
else
ice_set_ring_build_skb_ena(ring);
ring->rx_offset = ice_rx_offset(ring);
/* init queue specific tail register */
ring->tail = hw->hw_addr + QRX_TAIL(pf_q);
writel(0, ring->tail);
return 0;
}
static void ice_xsk_pool_fill_cb(struct ice_rx_ring *ring)
{
void *ctx_ptr = &ring->pkt_ctx;
struct xsk_cb_desc desc = {};
XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
desc.src = &ctx_ptr;
desc.off = offsetof(struct ice_xdp_buff, pkt_ctx) -
sizeof(struct xdp_buff);
desc.bytes = sizeof(ctx_ptr);
xsk_pool_fill_cb(ring->xsk_pool, &desc);
}
/**
* ice_vsi_cfg_rxq - Configure an Rx queue
* @ring: the ring being configured
*
* Return 0 on success and a negative value on error.
*/
static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
{
struct device *dev = ice_pf_to_dev(ring->vsi->back);
ice: Add support for XDP multi-buffer on Rx side Ice driver needs to be a bit reworked on Rx data path in order to support multi-buffer XDP. For skb path, it currently works in a way that Rx ring carries pointer to skb so if driver didn't manage to combine fragmented frame at current NAPI instance, it can restore the state on next instance and keep looking for last fragment (so descriptor with EOP bit set). What needs to be achieved is that xdp_buff needs to be combined in such way (linear + frags part) in the first place. Then skb will be ready to go in case of XDP_PASS or BPF program being not present on interface. If BPF program is there, it would work on multi-buffer XDP. At this point xdp_buff resides directly on Rx ring, so given the fact that skb will be built straight from xdp_buff, there will be no further need to carry skb on Rx ring. Besides removing skb pointer from Rx ring, lots of members have been moved around within ice_rx_ring. First and foremost reason was to place rx_buf with xdp_buff on the same cacheline. This means that once we touch rx_buf (which is a preceding step before touching xdp_buff), xdp_buff will already be hot in cache. Second thing was that xdp_rxq is used rather rarely and it occupies a separate cacheline, so maybe it is better to have it at the end of ice_rx_ring. Other change that affects ice_rx_ring is the introduction of ice_rx_ring::first_desc. Its purpose is twofold - first is to propagate rx_buf->act to all the parts of current xdp_buff after running XDP program, so that ice_put_rx_buf() that got moved out of the main Rx processing loop will be able to tak an appriopriate action on each buffer. Second is for ice_construct_skb(). ice_construct_skb() has a copybreak mechanism which had an explicit impact on xdp_buff->skb conversion in the new approach when legacy Rx flag is toggled. It works in a way that linear part is 256 bytes long, if frame is bigger than that, remaining bytes are going as a frag to skb_shared_info. This means while memcpying frags from xdp_buff to newly allocated skb, care needs to be taken when picking the destination frag array entry. Upon the time ice_construct_skb() is called, when dealing with fragmented frame, current rx_buf points to the *last* fragment, but copybreak needs to be done against the first one. That's where ice_rx_ring::first_desc helps. When frame building spans across NAPI polls (DD bit is not set on current descriptor and xdp->data is not NULL) with current Rx buffer handling state there might be some problems. Since calls to ice_put_rx_buf() were pulled out of the main Rx processing loop and were scoped from cached_ntc to current ntc, remember that now mentioned function relies on rx_buf->act, which is set within ice_run_xdp(). ice_run_xdp() is called when EOP bit was found, so currently we could put Rx buffer with rx_buf->act being *uninitialized*. To address this, change scoping to rely on first_desc on both boundaries instead. This also implies that cleaned_count which is used as an input to ice_alloc_rx_buffers() and tells how many new buffers should be refilled has to be adjusted. If it stayed as is, what could happen is a case where ntc would go over ntu. Therefore, remove cleaned_count altogether and use against allocing routine newly introduced ICE_RX_DESC_UNUSED() macro which is an equivalent of ICE_DESC_UNUSED() dedicated for Rx side and based on struct ice_rx_ring::first_desc instead of next_to_clean. Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com> Link: https://lore.kernel.org/bpf/20230131204506.219292-11-maciej.fijalkowski@intel.com
2023-01-31 21:45:03 +01:00
u32 num_bufs = ICE_RX_DESC_UNUSED(ring);
int err;
ring->rx_buf_len = ring->vsi->rx_buf_len;
if (ring->vsi->type == ICE_VSI_PF) {
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
ring->q_index,
ring->q_vector->napi.napi_id,
ring->rx_buf_len);
if (err)
return err;
}
ring->xsk_pool = ice_xsk_pool(ring);
if (ring->xsk_pool) {
xdp_rxq_info_unreg(&ring->xdp_rxq);
ring->rx_buf_len =
xsk_pool_get_rx_frame_size(ring->xsk_pool);
err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
ring->q_index,
ring->q_vector->napi.napi_id,
ring->rx_buf_len);
if (err)
return err;
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_XSK_BUFF_POOL,
NULL);
if (err)
return err;
xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
ice_xsk_pool_fill_cb(ring);
dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->q_index);
} else {
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
ring->q_index,
ring->q_vector->napi.napi_id,
ring->rx_buf_len);
if (err)
return err;
}
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED,
NULL);
if (err)
return err;
}
}
xdp_init_buff(&ring->xdp, ice_rx_pg_size(ring) / 2, &ring->xdp_rxq);
ice: Add support for XDP multi-buffer on Rx side Ice driver needs to be a bit reworked on Rx data path in order to support multi-buffer XDP. For skb path, it currently works in a way that Rx ring carries pointer to skb so if driver didn't manage to combine fragmented frame at current NAPI instance, it can restore the state on next instance and keep looking for last fragment (so descriptor with EOP bit set). What needs to be achieved is that xdp_buff needs to be combined in such way (linear + frags part) in the first place. Then skb will be ready to go in case of XDP_PASS or BPF program being not present on interface. If BPF program is there, it would work on multi-buffer XDP. At this point xdp_buff resides directly on Rx ring, so given the fact that skb will be built straight from xdp_buff, there will be no further need to carry skb on Rx ring. Besides removing skb pointer from Rx ring, lots of members have been moved around within ice_rx_ring. First and foremost reason was to place rx_buf with xdp_buff on the same cacheline. This means that once we touch rx_buf (which is a preceding step before touching xdp_buff), xdp_buff will already be hot in cache. Second thing was that xdp_rxq is used rather rarely and it occupies a separate cacheline, so maybe it is better to have it at the end of ice_rx_ring. Other change that affects ice_rx_ring is the introduction of ice_rx_ring::first_desc. Its purpose is twofold - first is to propagate rx_buf->act to all the parts of current xdp_buff after running XDP program, so that ice_put_rx_buf() that got moved out of the main Rx processing loop will be able to tak an appriopriate action on each buffer. Second is for ice_construct_skb(). ice_construct_skb() has a copybreak mechanism which had an explicit impact on xdp_buff->skb conversion in the new approach when legacy Rx flag is toggled. It works in a way that linear part is 256 bytes long, if frame is bigger than that, remaining bytes are going as a frag to skb_shared_info. This means while memcpying frags from xdp_buff to newly allocated skb, care needs to be taken when picking the destination frag array entry. Upon the time ice_construct_skb() is called, when dealing with fragmented frame, current rx_buf points to the *last* fragment, but copybreak needs to be done against the first one. That's where ice_rx_ring::first_desc helps. When frame building spans across NAPI polls (DD bit is not set on current descriptor and xdp->data is not NULL) with current Rx buffer handling state there might be some problems. Since calls to ice_put_rx_buf() were pulled out of the main Rx processing loop and were scoped from cached_ntc to current ntc, remember that now mentioned function relies on rx_buf->act, which is set within ice_run_xdp(). ice_run_xdp() is called when EOP bit was found, so currently we could put Rx buffer with rx_buf->act being *uninitialized*. To address this, change scoping to rely on first_desc on both boundaries instead. This also implies that cleaned_count which is used as an input to ice_alloc_rx_buffers() and tells how many new buffers should be refilled has to be adjusted. If it stayed as is, what could happen is a case where ntc would go over ntu. Therefore, remove cleaned_count altogether and use against allocing routine newly introduced ICE_RX_DESC_UNUSED() macro which is an equivalent of ICE_DESC_UNUSED() dedicated for Rx side and based on struct ice_rx_ring::first_desc instead of next_to_clean. Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com> Link: https://lore.kernel.org/bpf/20230131204506.219292-11-maciej.fijalkowski@intel.com
2023-01-31 21:45:03 +01:00
ring->xdp.data = NULL;
ring->xdp_ext.pkt_ctx = &ring->pkt_ctx;
err = ice_setup_rx_ctx(ring);
if (err) {
dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n",
ring->q_index, err);
return err;
}
if (ring->xsk_pool) {
bool ok;
if (!xsk_buff_can_alloc(ring->xsk_pool, num_bufs)) {
dev_warn(dev, "XSK buffer pool does not provide enough addresses to fill %d buffers on Rx ring %d\n",
num_bufs, ring->q_index);
dev_warn(dev, "Change Rx ring/fill queue size to avoid performance issues\n");
return 0;
}
ok = ice_alloc_rx_bufs_zc(ring, num_bufs);
if (!ok) {
u16 pf_q = ring->vsi->rxq_map[ring->q_index];
dev_info(dev, "Failed to allocate some buffers on XSK buffer pool enabled Rx ring %d (pf_q %d)\n",
ring->q_index, pf_q);
}
return 0;
}
ice_alloc_rx_bufs(ring, num_bufs);
return 0;
}
int ice_vsi_cfg_single_rxq(struct ice_vsi *vsi, u16 q_idx)
{
if (q_idx >= vsi->num_rxq)
return -EINVAL;
return ice_vsi_cfg_rxq(vsi->rx_rings[q_idx]);
}
/**
* ice_vsi_cfg_frame_size - setup max frame size and Rx buffer length
* @vsi: VSI
*/
static void ice_vsi_cfg_frame_size(struct ice_vsi *vsi)
{
if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) {
vsi->max_frame = ICE_MAX_FRAME_LEGACY_RX;
vsi->rx_buf_len = ICE_RXBUF_1664;
#if (PAGE_SIZE < 8192)
} else if (!ICE_2K_TOO_SMALL_WITH_PADDING &&
(vsi->netdev->mtu <= ETH_DATA_LEN)) {
vsi->max_frame = ICE_RXBUF_1536 - NET_IP_ALIGN;
vsi->rx_buf_len = ICE_RXBUF_1536 - NET_IP_ALIGN;
#endif
} else {
vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX;
vsi->rx_buf_len = ICE_RXBUF_3072;
}
}
/**
* ice_vsi_cfg_rxqs - Configure the VSI for Rx
* @vsi: the VSI being configured
*
* Return 0 on success and a negative value on error
* Configure the Rx VSI for operation.
*/
int ice_vsi_cfg_rxqs(struct ice_vsi *vsi)
{
u16 i;
if (vsi->type == ICE_VSI_VF)
goto setup_rings;
ice_vsi_cfg_frame_size(vsi);
setup_rings:
/* set up individual rings */
ice_for_each_rxq(vsi, i) {
int err = ice_vsi_cfg_rxq(vsi->rx_rings[i]);
if (err)
return err;
}
return 0;
}
/**
* __ice_vsi_get_qs - helper function for assigning queues from PF to VSI
* @qs_cfg: gathered variables needed for pf->vsi queues assignment
*
* This function first tries to find contiguous space. If it is not successful,
* it tries with the scatter approach.
*
* Return 0 on success and -ENOMEM in case of no left space in PF queue bitmap
*/
int __ice_vsi_get_qs(struct ice_qs_cfg *qs_cfg)
{
int ret = 0;
ret = __ice_vsi_get_qs_contig(qs_cfg);
if (ret) {
/* contig failed, so try with scatter approach */
qs_cfg->mapping_mode = ICE_VSI_MAP_SCATTER;
qs_cfg->q_count = min_t(unsigned int, qs_cfg->q_count,
qs_cfg->scatter_count);
ret = __ice_vsi_get_qs_sc(qs_cfg);
}
return ret;
}
/**
* ice_vsi_ctrl_one_rx_ring - start/stop VSI's Rx ring with no busy wait
* @vsi: the VSI being configured
* @ena: start or stop the Rx ring
* @rxq_idx: 0-based Rx queue index for the VSI passed in
* @wait: wait or don't wait for configuration to finish in hardware
*
* Return 0 on success and negative on error.
*/
int
ice_vsi_ctrl_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx, bool wait)
{
int pf_q = vsi->rxq_map[rxq_idx];
struct ice_pf *pf = vsi->back;
struct ice_hw *hw = &pf->hw;
u32 rx_reg;
rx_reg = rd32(hw, QRX_CTRL(pf_q));
/* Skip if the queue is already in the requested state */
if (ena == !!(rx_reg & QRX_CTRL_QENA_STAT_M))
return 0;
/* turn on/off the queue */
if (ena)
rx_reg |= QRX_CTRL_QENA_REQ_M;
else
rx_reg &= ~QRX_CTRL_QENA_REQ_M;
wr32(hw, QRX_CTRL(pf_q), rx_reg);
if (!wait)
return 0;
ice_flush(hw);
return ice_pf_rxq_wait(pf, pf_q, ena);
}
/**
* ice_vsi_wait_one_rx_ring - wait for a VSI's Rx ring to be stopped/started
* @vsi: the VSI being configured
* @ena: true/false to verify Rx ring has been enabled/disabled respectively
* @rxq_idx: 0-based Rx queue index for the VSI passed in
*
* This routine will wait for the given Rx queue of the VSI to reach the
* enabled or disabled state. Returns -ETIMEDOUT in case of failing to reach
* the requested state after multiple retries; else will return 0 in case of
* success.
*/
int ice_vsi_wait_one_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx)
{
int pf_q = vsi->rxq_map[rxq_idx];
struct ice_pf *pf = vsi->back;
return ice_pf_rxq_wait(pf, pf_q, ena);
}
/**
* ice_vsi_alloc_q_vectors - Allocate memory for interrupt vectors
* @vsi: the VSI being configured
*
* We allocate one q_vector per queue interrupt. If allocation fails we
* return -ENOMEM.
*/
int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi)
{
struct device *dev = ice_pf_to_dev(vsi->back);
u16 v_idx;
int err;
if (vsi->q_vectors[0]) {
dev_dbg(dev, "VSI %d has existing q_vectors\n", vsi->vsi_num);
return -EEXIST;
}
for (v_idx = 0; v_idx < vsi->num_q_vectors; v_idx++) {
err = ice_vsi_alloc_q_vector(vsi, v_idx);
if (err)
goto err_out;
}
return 0;
err_out:
while (v_idx--)
ice_free_q_vector(vsi, v_idx);
dev_err(dev, "Failed to allocate %d q_vector for VSI %d, ret=%d\n",
vsi->num_q_vectors, vsi->vsi_num, err);
vsi->num_q_vectors = 0;
return err;
}
/**
* ice_vsi_map_rings_to_vectors - Map VSI rings to interrupt vectors
* @vsi: the VSI being configured
*
* This function maps descriptor rings to the queue-specific vectors allotted
* through the MSI-X enabling code. On a constrained vector budget, we map Tx
* and Rx rings to the vector as "efficiently" as possible.
*/
void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi)
{
int q_vectors = vsi->num_q_vectors;
u16 tx_rings_rem, rx_rings_rem;
int v_id;
/* initially assigning remaining rings count to VSIs num queue value */
tx_rings_rem = vsi->num_txq;
rx_rings_rem = vsi->num_rxq;
for (v_id = 0; v_id < q_vectors; v_id++) {
struct ice_q_vector *q_vector = vsi->q_vectors[v_id];
u8 tx_rings_per_v, rx_rings_per_v;
u16 q_id, q_base;
/* Tx rings mapping to vector */
tx_rings_per_v = (u8)DIV_ROUND_UP(tx_rings_rem,
q_vectors - v_id);
q_vector->num_ring_tx = tx_rings_per_v;
q_vector->tx.tx_ring = NULL;
q_vector->tx.itr_idx = ICE_TX_ITR;
q_base = vsi->num_txq - tx_rings_rem;
for (q_id = q_base; q_id < (q_base + tx_rings_per_v); q_id++) {
struct ice_tx_ring *tx_ring = vsi->tx_rings[q_id];
tx_ring->q_vector = q_vector;
tx_ring->next = q_vector->tx.tx_ring;
q_vector->tx.tx_ring = tx_ring;
}
tx_rings_rem -= tx_rings_per_v;
/* Rx rings mapping to vector */
rx_rings_per_v = (u8)DIV_ROUND_UP(rx_rings_rem,
q_vectors - v_id);
q_vector->num_ring_rx = rx_rings_per_v;
q_vector->rx.rx_ring = NULL;
q_vector->rx.itr_idx = ICE_RX_ITR;
q_base = vsi->num_rxq - rx_rings_rem;
for (q_id = q_base; q_id < (q_base + rx_rings_per_v); q_id++) {
struct ice_rx_ring *rx_ring = vsi->rx_rings[q_id];
rx_ring->q_vector = q_vector;
rx_ring->next = q_vector->rx.rx_ring;
q_vector->rx.rx_ring = rx_ring;
}
rx_rings_rem -= rx_rings_per_v;
}
}
/**
* ice_vsi_free_q_vectors - Free memory allocated for interrupt vectors
* @vsi: the VSI having memory freed
*/
void ice_vsi_free_q_vectors(struct ice_vsi *vsi)
{
int v_idx;
ice_for_each_q_vector(vsi, v_idx)
ice_free_q_vector(vsi, v_idx);
ice: prevent NULL pointer deref during reload Calling ethtool during reload can lead to call trace, because VSI isn't configured for some time, but netdev is alive. To fix it add rtnl lock for VSI deconfig and config. Set ::num_q_vectors to 0 after freeing and add a check for ::tx/rx_rings in ring related ethtool ops. Add proper unroll of filters in ice_start_eth(). Reproduction: $watch -n 0.1 -d 'ethtool -g enp24s0f0np0' $devlink dev reload pci/0000:18:00.0 action driver_reinit Call trace before fix: [66303.926205] BUG: kernel NULL pointer dereference, address: 0000000000000000 [66303.926259] #PF: supervisor read access in kernel mode [66303.926286] #PF: error_code(0x0000) - not-present page [66303.926311] PGD 0 P4D 0 [66303.926332] Oops: 0000 [#1] PREEMPT SMP PTI [66303.926358] CPU: 4 PID: 933821 Comm: ethtool Kdump: loaded Tainted: G OE 6.4.0-rc5+ #1 [66303.926400] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.00.01.0014.070920180847 07/09/2018 [66303.926446] RIP: 0010:ice_get_ringparam+0x22/0x50 [ice] [66303.926649] Code: 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 48 8b 87 c0 09 00 00 c7 46 04 e0 1f 00 00 c7 46 10 e0 1f 00 00 48 8b 50 20 <48> 8b 12 0f b7 52 3a 89 56 14 48 8b 40 28 48 8b 00 0f b7 40 58 48 [66303.926722] RSP: 0018:ffffad40472f39c8 EFLAGS: 00010246 [66303.926749] RAX: ffff98a8ada05828 RBX: ffff98a8c46dd060 RCX: ffffad40472f3b48 [66303.926781] RDX: 0000000000000000 RSI: ffff98a8c46dd068 RDI: ffff98a8b23c4000 [66303.926811] RBP: ffffad40472f3b48 R08: 00000000000337b0 R09: 0000000000000000 [66303.926843] R10: 0000000000000001 R11: 0000000000000100 R12: ffff98a8b23c4000 [66303.926874] R13: ffff98a8c46dd060 R14: 000000000000000f R15: ffffad40472f3a50 [66303.926906] FS: 00007f6397966740(0000) GS:ffff98b390900000(0000) knlGS:0000000000000000 [66303.926941] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [66303.926967] CR2: 0000000000000000 CR3: 000000011ac20002 CR4: 00000000007706e0 [66303.926999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [66303.927029] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [66303.927060] PKRU: 55555554 [66303.927075] Call Trace: [66303.927094] <TASK> [66303.927111] ? __die+0x23/0x70 [66303.927140] ? page_fault_oops+0x171/0x4e0 [66303.927176] ? exc_page_fault+0x7f/0x180 [66303.927209] ? asm_exc_page_fault+0x26/0x30 [66303.927244] ? ice_get_ringparam+0x22/0x50 [ice] [66303.927433] rings_prepare_data+0x62/0x80 [66303.927469] ethnl_default_doit+0xe2/0x350 [66303.927501] genl_family_rcv_msg_doit.isra.0+0xe3/0x140 [66303.927538] genl_rcv_msg+0x1b1/0x2c0 [66303.927561] ? __pfx_ethnl_default_doit+0x10/0x10 [66303.927590] ? __pfx_genl_rcv_msg+0x10/0x10 [66303.927615] netlink_rcv_skb+0x58/0x110 [66303.927644] genl_rcv+0x28/0x40 [66303.927665] netlink_unicast+0x19e/0x290 [66303.927691] netlink_sendmsg+0x254/0x4d0 [66303.927717] sock_sendmsg+0x93/0xa0 [66303.927743] __sys_sendto+0x126/0x170 [66303.927780] __x64_sys_sendto+0x24/0x30 [66303.928593] do_syscall_64+0x5d/0x90 [66303.929370] ? __count_memcg_events+0x60/0xa0 [66303.930146] ? count_memcg_events.constprop.0+0x1a/0x30 [66303.930920] ? handle_mm_fault+0x9e/0x350 [66303.931688] ? do_user_addr_fault+0x258/0x740 [66303.932452] ? exc_page_fault+0x7f/0x180 [66303.933193] entry_SYSCALL_64_after_hwframe+0x72/0xdc Fixes: 5b246e533d01 ("ice: split probe into smaller functions") Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com> Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com> Reviewed-by: Simon Horman <simon.horman@corigine.com> Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel) Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2023-07-06 08:25:51 +02:00
vsi->num_q_vectors = 0;
}
/**
* ice_vsi_cfg_txq - Configure single Tx queue
* @vsi: the VSI that queue belongs to
* @ring: Tx ring to be configured
* @qg_buf: queue group buffer
*/
static int
ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring,
struct ice_aqc_add_tx_qgrp *qg_buf)
{
u8 buf_len = struct_size(qg_buf, txqs, 1);
struct ice_tlan_ctx tlan_ctx = { 0 };
struct ice_aqc_add_txqs_perq *txq;
struct ice_channel *ch = ring->ch;
struct ice_pf *pf = vsi->back;
struct ice_hw *hw = &pf->hw;
int status;
u16 pf_q;
u8 tc;
/* Configure XPS */
ice_cfg_xps_tx_ring(ring);
pf_q = ring->reg_idx;
ice_setup_tx_ctx(ring, &tlan_ctx, pf_q);
/* copy context contents into the qg_buf */
qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q);
ice_set_ctx(hw, (u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx,
ice_tlan_ctx_info);
/* init queue specific tail reg. It is referred as
* transmit comm scheduler queue doorbell.
*/
ring->tail = hw->hw_addr + QTX_COMM_DBELL(pf_q);
if (IS_ENABLED(CONFIG_DCB))
tc = ring->dcb_tc;
else
tc = 0;
/* Add unique software queue handle of the Tx queue per
* TC into the VSI Tx ring
*/
ring->q_handle = ice_calc_txq_handle(vsi, ring, tc);
if (ch)
status = ice_ena_vsi_txq(vsi->port_info, ch->ch_vsi->idx, 0,
ring->q_handle, 1, qg_buf, buf_len,
NULL);
else
status = ice_ena_vsi_txq(vsi->port_info, vsi->idx, tc,
ring->q_handle, 1, qg_buf, buf_len,
NULL);
if (status) {
dev_err(ice_pf_to_dev(pf), "Failed to set LAN Tx queue context, error: %d\n",
status);
return status;
}
/* Add Tx Queue TEID into the VSI Tx ring from the
* response. This will complete configuring and
* enabling the queue.
*/
txq = &qg_buf->txqs[0];
if (pf_q == le16_to_cpu(txq->txq_id))
ring->txq_teid = le32_to_cpu(txq->q_teid);
return 0;
}
int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_tx_ring **tx_rings,
u16 q_idx)
{
DEFINE_RAW_FLEX(struct ice_aqc_add_tx_qgrp, qg_buf, txqs, 1);
if (q_idx >= vsi->alloc_txq || !tx_rings || !tx_rings[q_idx])
return -EINVAL;
qg_buf->num_txqs = 1;
return ice_vsi_cfg_txq(vsi, tx_rings[q_idx], qg_buf);
}
/**
* ice_vsi_cfg_txqs - Configure the VSI for Tx
* @vsi: the VSI being configured
* @rings: Tx ring array to be configured
* @count: number of Tx ring array elements
*
* Return 0 on success and a negative value on error
* Configure the Tx VSI for operation.
*/
static int
ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_tx_ring **rings, u16 count)
{
DEFINE_RAW_FLEX(struct ice_aqc_add_tx_qgrp, qg_buf, txqs, 1);
int err = 0;
u16 q_idx;
qg_buf->num_txqs = 1;
for (q_idx = 0; q_idx < count; q_idx++) {
err = ice_vsi_cfg_txq(vsi, rings[q_idx], qg_buf);
if (err)
break;
}
return err;
}
/**
* ice_vsi_cfg_lan_txqs - Configure the VSI for Tx
* @vsi: the VSI being configured
*
* Return 0 on success and a negative value on error
* Configure the Tx VSI for operation.
*/
int ice_vsi_cfg_lan_txqs(struct ice_vsi *vsi)
{
return ice_vsi_cfg_txqs(vsi, vsi->tx_rings, vsi->num_txq);
}
/**
* ice_vsi_cfg_xdp_txqs - Configure Tx queues dedicated for XDP in given VSI
* @vsi: the VSI being configured
*
* Return 0 on success and a negative value on error
* Configure the Tx queues dedicated for XDP in given VSI for operation.
*/
int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi)
{
int ret;
int i;
ret = ice_vsi_cfg_txqs(vsi, vsi->xdp_rings, vsi->num_xdp_txq);
if (ret)
return ret;
ice_for_each_rxq(vsi, i)
ice_tx_xsk_pool(vsi, i);
return 0;
}
/**
* ice_cfg_itr - configure the initial interrupt throttle values
* @hw: pointer to the HW structure
* @q_vector: interrupt vector that's being configured
*
* Configure interrupt throttling values for the ring containers that are
* associated with the interrupt vector passed in.
*/
void ice_cfg_itr(struct ice_hw *hw, struct ice_q_vector *q_vector)
{
ice_cfg_itr_gran(hw);
if (q_vector->num_ring_rx)
ice_write_itr(&q_vector->rx, q_vector->rx.itr_setting);
if (q_vector->num_ring_tx)
ice_write_itr(&q_vector->tx, q_vector->tx.itr_setting);
ice_write_intrl(q_vector, q_vector->intrl);
}
/**
* ice_cfg_txq_interrupt - configure interrupt on Tx queue
* @vsi: the VSI being configured
* @txq: Tx queue being mapped to MSI-X vector
* @msix_idx: MSI-X vector index within the function
* @itr_idx: ITR index of the interrupt cause
*
* Configure interrupt on Tx queue by associating Tx queue to MSI-X vector
* within the function space.
*/
void
ice_cfg_txq_interrupt(struct ice_vsi *vsi, u16 txq, u16 msix_idx, u16 itr_idx)
{
struct ice_pf *pf = vsi->back;
struct ice_hw *hw = &pf->hw;
u32 val;
itr_idx = FIELD_PREP(QINT_TQCTL_ITR_INDX_M, itr_idx);
val = QINT_TQCTL_CAUSE_ENA_M | itr_idx |
FIELD_PREP(QINT_TQCTL_MSIX_INDX_M, msix_idx);
wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), val);
if (ice_is_xdp_ena_vsi(vsi)) {
u32 xdp_txq = txq + vsi->num_xdp_txq;
wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]),
val);
}
ice_flush(hw);
}
/**
* ice_cfg_rxq_interrupt - configure interrupt on Rx queue
* @vsi: the VSI being configured
* @rxq: Rx queue being mapped to MSI-X vector
* @msix_idx: MSI-X vector index within the function
* @itr_idx: ITR index of the interrupt cause
*
* Configure interrupt on Rx queue by associating Rx queue to MSI-X vector
* within the function space.
*/
void
ice_cfg_rxq_interrupt(struct ice_vsi *vsi, u16 rxq, u16 msix_idx, u16 itr_idx)
{
struct ice_pf *pf = vsi->back;
struct ice_hw *hw = &pf->hw;
u32 val;
itr_idx = FIELD_PREP(QINT_RQCTL_ITR_INDX_M, itr_idx);
val = QINT_RQCTL_CAUSE_ENA_M | itr_idx |
FIELD_PREP(QINT_RQCTL_MSIX_INDX_M, msix_idx);
wr32(hw, QINT_RQCTL(vsi->rxq_map[rxq]), val);
ice_flush(hw);
}
/**
* ice_trigger_sw_intr - trigger a software interrupt
* @hw: pointer to the HW structure
* @q_vector: interrupt vector to trigger the software interrupt for
*/
void ice_trigger_sw_intr(struct ice_hw *hw, const struct ice_q_vector *q_vector)
{
wr32(hw, GLINT_DYN_CTL(q_vector->reg_idx),
(ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) |
GLINT_DYN_CTL_SWINT_TRIG_M |
GLINT_DYN_CTL_INTENA_M);
}
/**
* ice_vsi_stop_tx_ring - Disable single Tx ring
* @vsi: the VSI being configured
* @rst_src: reset source
* @rel_vmvf_num: Relative ID of VF/VM
* @ring: Tx ring to be stopped
* @txq_meta: Meta data of Tx ring to be stopped
*/
int
ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src,
u16 rel_vmvf_num, struct ice_tx_ring *ring,
struct ice_txq_meta *txq_meta)
{
struct ice_pf *pf = vsi->back;
struct ice_q_vector *q_vector;
struct ice_hw *hw = &pf->hw;
int status;
u32 val;
/* clear cause_ena bit for disabled queues */
val = rd32(hw, QINT_TQCTL(ring->reg_idx));
val &= ~QINT_TQCTL_CAUSE_ENA_M;
wr32(hw, QINT_TQCTL(ring->reg_idx), val);
/* software is expected to wait for 100 ns */
ndelay(100);
/* trigger a software interrupt for the vector
* associated to the queue to schedule NAPI handler
*/
q_vector = ring->q_vector;
if (q_vector && !(vsi->vf && ice_is_vf_disabled(vsi->vf)))
ice_trigger_sw_intr(hw, q_vector);
status = ice_dis_vsi_txq(vsi->port_info, txq_meta->vsi_idx,
txq_meta->tc, 1, &txq_meta->q_handle,
&txq_meta->q_id, &txq_meta->q_teid, rst_src,
rel_vmvf_num, NULL);
/* if the disable queue command was exercised during an
* active reset flow, -EBUSY is returned.
* This is not an error as the reset operation disables
* queues at the hardware level anyway.
*/
if (status == -EBUSY) {
dev_dbg(ice_pf_to_dev(vsi->back), "Reset in progress. LAN Tx queues already disabled\n");
} else if (status == -ENOENT) {
dev_dbg(ice_pf_to_dev(vsi->back), "LAN Tx queues do not exist, nothing to disable\n");
} else if (status) {
dev_dbg(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %d\n",
status);
return status;
}
return 0;
}
/**
* ice_fill_txq_meta - Prepare the Tx queue's meta data
* @vsi: VSI that ring belongs to
* @ring: ring that txq_meta will be based on
* @txq_meta: a helper struct that wraps Tx queue's information
*
* Set up a helper struct that will contain all the necessary fields that
* are needed for stopping Tx queue
*/
void
ice_fill_txq_meta(const struct ice_vsi *vsi, struct ice_tx_ring *ring,
struct ice_txq_meta *txq_meta)
{
struct ice_channel *ch = ring->ch;
u8 tc;
if (IS_ENABLED(CONFIG_DCB))
tc = ring->dcb_tc;
else
tc = 0;
txq_meta->q_id = ring->reg_idx;
txq_meta->q_teid = ring->txq_teid;
txq_meta->q_handle = ring->q_handle;
if (ch) {
txq_meta->vsi_idx = ch->ch_vsi->idx;
txq_meta->tc = 0;
} else {
txq_meta->vsi_idx = vsi->idx;
txq_meta->tc = tc;
}
}