linux/drivers/net/ethernet/intel/ice/ice_repr.c

468 lines
11 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2019-2021, Intel Corporation. */
#include "ice.h"
#include "ice_eswitch.h"
#include "ice_devlink.h"
ice: rename ice_virtchnl_pf.c to ice_sriov.c The ice_virtchnl_pf.c and ice_virtchnl_pf.h files are where most of the code for implementing Single Root IOV virtualization resides. This code includes support for bringing up and tearing down VFs, hooks into the kernel SR-IOV netdev operations, and for handling virtchnl messages from VFs. In the future, we plan to support Scalable IOV in addition to Single Root IOV as an alternative virtualization scheme. This implementation will re-use some but not all of the code in ice_virtchnl_pf.c. To prepare for this future, we want to refactor and split up the code in ice_virtchnl_pf.c into the following scheme: * ice_vf_lib.[ch] Basic VF structures and accessors. This is where scheme-independent code will reside. * ice_virtchnl.[ch] Virtchnl message handling. This is where the bulk of the logic for processing messages from VFs using the virtchnl messaging scheme will reside. This is separated from ice_vf_lib.c because it is distinct and has a bulk of the processing code. * ice_sriov.[ch] Single Root IOV implementation, including initialization and the routines for interacting with SR-IOV based netdev operations. * (future) ice_siov.[ch] Scalable IOV implementation. As a first step, let's assume that all of the code in ice_virtchnl_pf.[ch] is for Single Root IOV. Rename this file to ice_sriov.c and its header to ice_sriov.h. Future changes will further split out the code in these files following the plan outlined here. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-22 16:26:49 -08:00
#include "ice_sriov.h"
#include "ice_tc_lib.h"
/**
 * ice_repr_get_sw_port_id - get port ID associated with representor
 * @repr: pointer to port representor
 *
 * Return: the lport value read from the port info of the PF that the
 * representor's VF points to.
 */
static int ice_repr_get_sw_port_id(struct ice_repr *repr)
{
	const int lport = repr->vf->pf->hw.port_info->lport;

	return lport;
}
/**
 * ice_repr_get_phys_port_name - get phys port name
 * @netdev: pointer to port representor netdev
 * @buf: write here port name
 * @len: max length of buf
 *
 * Formats the name as "pf<port id>vfr<VF id>" into @buf, unless the VF's
 * devlink port is registered, in which case the devlink core formats the name.
 *
 * Return: 0 on success, -EOPNOTSUPP if the devlink port is registered, if
 * nothing could be formatted, or if the name does not fit into @buf.
 */
static int
ice_repr_get_phys_port_name(struct net_device *netdev, char *buf, size_t len)
{
	struct ice_netdev_priv *np = netdev_priv(netdev);
	struct ice_repr *repr = np->repr;
	int res;

	/* Devlink port is registered and devlink core is taking care of name formatting. */
	if (repr->vf->devlink_port.devlink)
		return -EOPNOTSUPP;

	res = snprintf(buf, len, "pf%dvfr%d", ice_repr_get_sw_port_id(repr),
		       repr->vf->vf_id);

	/* snprintf() returns the length the full string needs (without the
	 * trailing NUL), so res >= len means the name was truncated.
	 */
	if (res <= 0 || (size_t)res >= len)
		return -EOPNOTSUPP;

	return 0;
}
/**
 * ice_repr_get_stats64 - get VF stats for VFPR use
 * @netdev: pointer to port representor netdev
 * @stats: pointer to struct where stats can be stored
 *
 * Refreshes the stats of the representor's source VSI and copies its ethernet
 * counters into @stats. @stats is left untouched when the VF is disabled.
 */
static void
ice_repr_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
{
	struct ice_netdev_priv *priv = netdev_priv(netdev);
	struct ice_eth_stats *es;
	struct ice_vsi *src_vsi;

	if (ice_is_vf_disabled(priv->repr->vf))
		return;

	src_vsi = priv->repr->src_vsi;
	ice_update_vsi_stats(src_vsi);
	es = &src_vsi->eth_stats;

	/* packet counters are the sum of unicast, broadcast and multicast */
	stats->rx_packets = es->rx_unicast + es->rx_broadcast +
			    es->rx_multicast;
	stats->tx_packets = es->tx_unicast + es->tx_broadcast +
			    es->tx_multicast;

	stats->rx_bytes = es->rx_bytes;
	stats->tx_bytes = es->tx_bytes;

	stats->multicast = es->rx_multicast;

	stats->rx_dropped = es->rx_discards;
	stats->tx_dropped = es->tx_discards;
	stats->tx_errors = es->tx_errors;
}
/**
 * ice_netdev_to_repr - Get port representor for given netdevice
 * @netdev: pointer to port representor netdev
 *
 * Return: the representor pointer stored in the netdev private data.
 */
struct ice_repr *ice_netdev_to_repr(struct net_device *netdev)
{
	struct ice_netdev_priv *priv;

	priv = netdev_priv(netdev);

	return priv->repr;
}
/**
* ice_repr_open - Enable port representor's network interface
* @netdev: network interface device structure
*
* The open entry point is called when a port representor's network
* interface is made active by the system (IFF_UP). Corresponding
* VF is notified about link status change.
*
* Returns 0 on success
*/
static int ice_repr_open(struct net_device *netdev)
{
struct ice_repr *repr = ice_netdev_to_repr(netdev);
struct ice_vf *vf;
vf = repr->vf;
vf->link_forced = true;
vf->link_up = true;
ice_vc_notify_vf_link_state(vf);
netif_carrier_on(netdev);
netif_tx_start_all_queues(netdev);
return 0;
}
/**
* ice_repr_stop - Disable port representor's network interface
* @netdev: network interface device structure
*
* The stop entry point is called when a port representor's network
* interface is de-activated by the system. Corresponding
* VF is notified about link status change.
*
* Returns 0 on success
*/
static int ice_repr_stop(struct net_device *netdev)
{
struct ice_repr *repr = ice_netdev_to_repr(netdev);
struct ice_vf *vf;
vf = repr->vf;
vf->link_forced = true;
vf->link_up = false;
ice_vc_notify_vf_link_state(vf);
netif_carrier_off(netdev);
netif_tx_stop_all_queues(netdev);
return 0;
}
static struct devlink_port *
ice_repr_get_devlink_port(struct net_device *netdev)
{
struct ice_repr *repr = ice_netdev_to_repr(netdev);
return &repr->vf->devlink_port;
}
/**
 * ice_repr_sp_stats64 - get slow path stats for port representor
 * @dev: network interface device structure
 * @stats: netlink stats structure
 *
 * RX/TX stats are being swapped here to be consistent with VF stats. In slow
 * path, port representor receives data when the corresponding VF is sending it
 * (and vice versa), TX and RX bytes/packets are effectively swapped on port
 * representor.
 *
 * The rings are looked up in the netdev's VSI using the VF ID as the index.
 *
 * Return: always 0.
 */
static int
ice_repr_sp_stats64(const struct net_device *dev,
		    struct rtnl_link_stats64 *stats)
{
	struct ice_netdev_priv *priv = netdev_priv(dev);
	int ring_idx = priv->repr->vf->vf_id;
	struct ice_tx_ring *txr;
	struct ice_rx_ring *rxr;
	u64 pkts, bytes;

	/* TX ring counters are reported as the representor's RX stats */
	txr = priv->vsi->tx_rings[ring_idx];
	ice_fetch_u64_stats_per_ring(&txr->syncp, txr->stats, &pkts, &bytes);
	stats->rx_packets = pkts;
	stats->rx_bytes = bytes;

	/* RX ring counters are reported as the representor's TX stats */
	rxr = priv->vsi->rx_rings[ring_idx];
	ice_fetch_u64_stats_per_ring(&rxr->syncp, rxr->stats, &pkts, &bytes);
	stats->tx_packets = pkts;
	stats->tx_bytes = bytes;

	/* RX buffer/page allocation failures count as TX drops after the swap */
	stats->tx_dropped = rxr->rx_stats.alloc_page_failed +
			    rxr->rx_stats.alloc_buf_failed;

	return 0;
}
/**
 * ice_repr_ndo_has_offload_stats - check if an offload stats attribute is supported
 * @dev: network interface device structure (unused)
 * @attr_id: offload stats attribute ID
 *
 * Return: true only for IFLA_OFFLOAD_XSTATS_CPU_HIT, false otherwise.
 */
static bool
ice_repr_ndo_has_offload_stats(const struct net_device *dev, int attr_id)
{
	switch (attr_id) {
	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
		return true;
	default:
		return false;
	}
}
/**
 * ice_repr_ndo_get_offload_stats - get offload stats for port representor
 * @attr_id: offload stats attribute ID
 * @dev: network interface device structure
 * @sp: buffer that is filled in as a struct rtnl_link_stats64
 *
 * Return: result of ice_repr_sp_stats64() for IFLA_OFFLOAD_XSTATS_CPU_HIT,
 * -EINVAL for any other attribute.
 */
static int
ice_repr_ndo_get_offload_stats(int attr_id, const struct net_device *dev,
			       void *sp)
{
	struct rtnl_link_stats64 *stats = sp;

	if (attr_id != IFLA_OFFLOAD_XSTATS_CPU_HIT)
		return -EINVAL;

	return ice_repr_sp_stats64(dev, stats);
}
/**
 * ice_repr_setup_tc_cls_flower - handle a flower classifier offload request
 * @repr: pointer to port representor
 * @flower: flower offload request
 *
 * Return: result of ice_add_cls_flower() for FLOW_CLS_REPLACE, result of
 * ice_del_cls_flower() for FLOW_CLS_DESTROY, -EINVAL for any other command.
 */
static int
ice_repr_setup_tc_cls_flower(struct ice_repr *repr,
			     struct flow_cls_offload *flower)
{
	if (flower->command == FLOW_CLS_REPLACE)
		return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower);

	if (flower->command == FLOW_CLS_DESTROY)
		return ice_del_cls_flower(repr->src_vsi, flower);

	return -EINVAL;
}
/**
 * ice_repr_setup_tc_block_cb - TC block callback for port representor
 * @type: TC setup type
 * @type_data: request data; passed on as a struct flow_cls_offload
 * @cb_priv: callback private data; the representor netdev's ice_netdev_priv
 *
 * Return: result of ice_repr_setup_tc_cls_flower() for TC_SETUP_CLSFLOWER,
 * -EOPNOTSUPP for any other type.
 */
static int
ice_repr_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
			   void *cb_priv)
{
	struct ice_netdev_priv *priv = cb_priv;
	struct flow_cls_offload *flower = type_data;

	if (type != TC_SETUP_CLSFLOWER)
		return -EOPNOTSUPP;

	return ice_repr_setup_tc_cls_flower(priv->repr, flower);
}
/* list handed to flow_block_cb_setup_simple() by ice_repr_setup_tc() */
static LIST_HEAD(ice_repr_block_cb_list);

/**
 * ice_repr_setup_tc - TC setup entry point for port representor
 * @netdev: pointer to port representor netdev
 * @type: TC setup type
 * @type_data: request data; passed on as a struct flow_block_offload
 *
 * Return: result of flow_block_cb_setup_simple() for TC_SETUP_BLOCK,
 * -EOPNOTSUPP for any other type.
 */
static int
ice_repr_setup_tc(struct net_device *netdev, enum tc_setup_type type,
		  void *type_data)
{
	struct ice_netdev_priv *priv = netdev_priv(netdev);
	struct flow_block_offload *block = type_data;

	if (type != TC_SETUP_BLOCK)
		return -EOPNOTSUPP;

	/* the netdev private data serves as both cb_priv and cb_ident */
	return flow_block_cb_setup_simple(block, &ice_repr_block_cb_list,
					  ice_repr_setup_tc_block_cb,
					  priv, priv, true);
}
/* netdev callbacks installed on port representor netdevs; the address of this
 * table is also what ice_is_port_repr_netdev() compares against.
 */
static const struct net_device_ops ice_repr_netdev_ops = {
	/* interface state and transmit */
	.ndo_open		= ice_repr_open,
	.ndo_stop		= ice_repr_stop,
	.ndo_start_xmit		= ice_eswitch_port_start_xmit,
	/* identification */
	.ndo_get_phys_port_name	= ice_repr_get_phys_port_name,
	.ndo_get_devlink_port	= ice_repr_get_devlink_port,
	/* statistics */
	.ndo_get_stats64	= ice_repr_get_stats64,
	.ndo_has_offload_stats	= ice_repr_ndo_has_offload_stats,
	.ndo_get_offload_stats	= ice_repr_ndo_get_offload_stats,
	/* TC offload */
	.ndo_setup_tc		= ice_repr_setup_tc,
};
/**
 * ice_is_port_repr_netdev - Check if a given netdevice is a port representor netdev
 * @netdev: pointer to netdev, may be NULL
 *
 * Return: true if @netdev is non-NULL and uses the port representor netdev
 * ops, false otherwise.
 */
bool ice_is_port_repr_netdev(struct net_device *netdev)
{
	if (!netdev)
		return false;

	return netdev->netdev_ops == &ice_repr_netdev_ops;
}
/**
 * ice_repr_reg_netdev - register port representor netdev
 * @netdev: pointer to port representor netdev
 *
 * Assigns a random MAC address, installs the representor netdev and ethtool
 * ops, advertises NETIF_F_HW_TC and registers the netdev.
 *
 * Return: result of register_netdev().
 */
static int
ice_repr_reg_netdev(struct net_device *netdev)
{
	int err;

	eth_hw_addr_random(netdev);
	netdev->netdev_ops = &ice_repr_netdev_ops;
	ice_set_ethtool_repr_ops(netdev);

	netdev->hw_features |= NETIF_F_HW_TC;

	/* register with carrier off and TX queues stopped; ice_repr_open()
	 * switches both on
	 */
	netif_carrier_off(netdev);
	netif_tx_stop_all_queues(netdev);

	err = register_netdev(netdev);

	return err;
}
/**
 * ice_repr_add - add representor for VF
 * @vf: pointer to VF structure
 *
 * Allocates the representor, its netdev and a q_vector, creates the VF
 * devlink port, registers the netdev and, as the last step, switches the VF
 * to the representor virtchnl ops. On failure everything acquired so far is
 * released and vf->repr is set to NULL.
 *
 * Return: 0 on success, -EINVAL if the VF has no VSI, -ENOMEM on allocation
 * failure, or the error returned by ice_devlink_create_vf_port() or
 * ice_repr_reg_netdev().
 */
static int ice_repr_add(struct ice_vf *vf)
{
	struct ice_q_vector *q_vector;
	struct ice_netdev_priv *np;
	struct ice_repr *repr;
	struct ice_vsi *vsi;
	int err;

	vsi = ice_get_vf_vsi(vf);
	if (!vsi)
		return -EINVAL;

	repr = kzalloc(sizeof(*repr), GFP_KERNEL);
	if (!repr)
		return -ENOMEM;

#ifdef CONFIG_ICE_SWITCHDEV
	repr->mac_rule = kzalloc(sizeof(*repr->mac_rule), GFP_KERNEL);
	if (!repr->mac_rule) {
		err = -ENOMEM;
		goto err_alloc_rule;
	}
#endif

	repr->netdev = alloc_etherdev(sizeof(struct ice_netdev_priv));
	if (!repr->netdev) {
		err = -ENOMEM;
		goto err_alloc;
	}

	/* link representor, VF and netdev private data to each other */
	repr->src_vsi = vsi;
	repr->vf = vf;
	vf->repr = repr;
	np = netdev_priv(repr->netdev);
	np->repr = repr;

	q_vector = kzalloc(sizeof(*q_vector), GFP_KERNEL);
	if (!q_vector) {
		err = -ENOMEM;
		goto err_alloc_q_vector;
	}
	repr->q_vector = q_vector;

	err = ice_devlink_create_vf_port(vf);
	if (err)
		goto err_devlink;

	repr->netdev->min_mtu = ETH_MIN_MTU;
	repr->netdev->max_mtu = ICE_MAX_MTU;

	SET_NETDEV_DEV(repr->netdev, ice_pf_to_dev(vf->pf));
	err = ice_repr_reg_netdev(repr->netdev);
	if (err)
		goto err_netdev;

	devlink_port_type_eth_set(&vf->devlink_port, repr->netdev);

	/* re-assigning the VC ops is the last step of adding a representor */
	ice_virtchnl_set_repr_ops(vf);

	return 0;

err_netdev:
	ice_devlink_destroy_vf_port(vf);
err_devlink:
	kfree(repr->q_vector);
	repr->q_vector = NULL;
err_alloc_q_vector:
	free_netdev(repr->netdev);
	repr->netdev = NULL;
err_alloc:
#ifdef CONFIG_ICE_SWITCHDEV
	kfree(repr->mac_rule);
	repr->mac_rule = NULL;
err_alloc_rule:
#endif
	kfree(repr);
	vf->repr = NULL;
	return err;
}
/**
* ice_repr_rem - remove representor from VF
* @vf: pointer to VF structure
*/
static void ice_repr_rem(struct ice_vf *vf)
{
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
if (!vf->repr)
return;
kfree(vf->repr->q_vector);
vf->repr->q_vector = NULL;
unregister_netdev(vf->repr->netdev);
ice_devlink_destroy_vf_port(vf);
free_netdev(vf->repr->netdev);
vf->repr->netdev = NULL;
#ifdef CONFIG_ICE_SWITCHDEV
kfree(vf->repr->mac_rule);
vf->repr->mac_rule = NULL;
#endif
kfree(vf->repr);
vf->repr = NULL;
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
ice: convert vf->vc_ops to a const pointer The vc_ops structure is used to allow different handlers for virtchnl commands when the driver is in representor mode. The current implementation uses a copy of the ops table in each VF, and modifies this copy dynamically. The usual practice in kernel code is to store the ops table in a constant structure and point to different versions. This has a number of advantages: 1. Reduced memory usage. Each VF merely points to the correct table, so they're able to re-use the same constant lookup table in memory. 2. Consistency. It becomes more difficult to accidentally update or edit only one op call. Instead, the code switches to the correct able by a single pointer write. In general this is atomic, either the pointer is updated or its not. 3. Code Layout. The VF structure can store a pointer to the table without needing to have the full structure definition defined prior to the VF structure definition. This will aid in future refactoring of code by allowing the VF pointer to be kept in ice_vf_lib.h while the virtchnl ops table can be maintained in ice_virtchnl.h There is one major downside in the case of the vc_ops structure. Most of the operations in the table are the same between the two current implementations. This can appear to lead to duplication since each implementation must now fill in the complete table. It could make spotting the differences in the representor mode more challenging. Unfortunately, methods to make this less error prone either add complexity overhead (macros using CPP token concatenation) or don't work on all compilers we support (constant initializer from another constant structure). The cost of maintaining two structures does not out weigh the benefits of the constant table model. While we're making these changes, go ahead and rename the structure and implementations with "virtchnl" instead of "vc_vf_". 
This will more closely align with the planned file renaming, and avoid similar names when we later introduce a "vf ops" table for separating Scalable IOV and Single Root IOV implementations. Leave the accessor/assignment functions in order to avoid issues with compiling with options disabled. The interface makes it easier to handle when CONFIG_PCI_IOV is disabled in the kernel. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-22 16:26:51 -08:00
ice_virtchnl_set_dflt_ops(vf);
}
/**
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
* ice_repr_rem_from_all_vfs - remove port representor for all VFs
* @pf: pointer to PF structure
*/
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
void ice_repr_rem_from_all_vfs(struct ice_pf *pf)
{
struct ice_vf *vf;
unsigned int bkt;
ice: convert VF storage to hash table with krefs and RCU The ice driver stores VF structures in a simple array which is allocated once at the time of VF creation. The VF structures are then accessed from the array by their VF ID. The ID must be between 0 and the number of allocated VFs. Multiple threads can access this table: * .ndo operations such as .ndo_get_vf_cfg or .ndo_set_vf_trust * interrupts, such as due to messages from the VF using the virtchnl communication * processing such as device reset * commands to add or remove VFs The current implementation does not keep track of when all threads are done operating on a VF and can potentially result in use-after-free issues caused by one thread accessing a VF structure after it has been released when removing VFs. Some of these are prevented with various state flags and checks. In addition, this structure is quite static and does not support a planned future where virtualization can be more dynamic. As we begin to look at supporting Scalable IOV with the ice driver (as opposed to just supporting Single Root IOV), this structure is not sufficient. In the future, VFs will be able to be added and removed individually and dynamically. To allow for this, and to better protect against a whole class of use-after-free bugs, replace the VF storage with a combination of a hash table and krefs to reference track all of the accesses to VFs through the hash table. A hash table still allows efficient look up of the VF given its ID, but also allows adding and removing VFs. It does not require contiguous VF IDs. The use of krefs allows the cleanup of the VF memory to be delayed until after all threads have released their reference (by calling ice_put_vf). To prevent corruption of the hash table, a combination of RCU and the mutex table_lock are used. Addition and removal from the hash table use the RCU-aware hash macros. This allows simple read-only look ups that iterate to locate a single VF can be fast using RCU. 
Accesses which modify the hash table, or which can't take RCU because they sleep, will hold the mutex lock. By using this design, we have a stronger guarantee that the VF structure can't be released until after all threads are finished operating on it. We also pave the way for the more dynamic Scalable IOV implementation in the future. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:38 -08:00
lockdep_assert_held(&pf->vfs.table_lock);
ice_for_each_vf(pf, bkt, vf)
ice_repr_rem(vf);
}
/**
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
* ice_repr_add_for_all_vfs - add port representor for all VFs
* @pf: pointer to PF structure
*/
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
int ice_repr_add_for_all_vfs(struct ice_pf *pf)
{
struct ice_vf *vf;
unsigned int bkt;
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
int err;
ice: convert VF storage to hash table with krefs and RCU The ice driver stores VF structures in a simple array which is allocated once at the time of VF creation. The VF structures are then accessed from the array by their VF ID. The ID must be between 0 and the number of allocated VFs. Multiple threads can access this table: * .ndo operations such as .ndo_get_vf_cfg or .ndo_set_vf_trust * interrupts, such as due to messages from the VF using the virtchnl communication * processing such as device reset * commands to add or remove VFs The current implementation does not keep track of when all threads are done operating on a VF and can potentially result in use-after-free issues caused by one thread accessing a VF structure after it has been released when removing VFs. Some of these are prevented with various state flags and checks. In addition, this structure is quite static and does not support a planned future where virtualization can be more dynamic. As we begin to look at supporting Scalable IOV with the ice driver (as opposed to just supporting Single Root IOV), this structure is not sufficient. In the future, VFs will be able to be added and removed individually and dynamically. To allow for this, and to better protect against a whole class of use-after-free bugs, replace the VF storage with a combination of a hash table and krefs to reference track all of the accesses to VFs through the hash table. A hash table still allows efficient look up of the VF given its ID, but also allows adding and removing VFs. It does not require contiguous VF IDs. The use of krefs allows the cleanup of the VF memory to be delayed until after all threads have released their reference (by calling ice_put_vf). To prevent corruption of the hash table, a combination of RCU and the mutex table_lock are used. Addition and removal from the hash table use the RCU-aware hash macros. This allows simple read-only look ups that iterate to locate a single VF can be fast using RCU. 
Accesses which modify the hash table, or which can't take RCU because they sleep, will hold the mutex lock. By using this design, we have a stronger guarantee that the VF structure can't be released until after all threads are finished operating on it. We also pave the way for the more dynamic Scalable IOV implementation in the future. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:38 -08:00
lockdep_assert_held(&pf->vfs.table_lock);
ice_for_each_vf(pf, bkt, vf) {
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
err = ice_repr_add(vf);
if (err)
goto err;
}
ice: refactor unwind cleanup in eswitch mode The code for supporting eswitch mode and port representors on VFs uses an unwind based cleanup flow when handling errors. These flows are used to cleanup and get everything back to the state prior to attempting to switch from legacy to representor mode or back. The unwind iterations make sense, but complicate a plan to refactor the VF array structure. In the future we won't have a clean method of reversing an iteration of the VFs. Instead, we can change the cleanup flow to just iterate over all VF structures and clean up appropriately. First notice that ice_repr_add_for_all_vfs and ice_repr_rem_from_all_vfs have an additional step of re-assigning the VC ops. There is no good reason to do this outside of ice_repr_add and ice_repr_rem. It can simply be done as the last step of these functions. Second, make sure ice_repr_rem is safe to call on a VF which does not have a representor. Check if vf->repr is NULL first and exit early if so. Move ice_repr_rem_from_all_vfs above ice_repr_add_for_all_vfs so that we can call it from the cleanup function. In ice_eswitch.c, replace the unwind iteration with a call to ice_eswitch_release_reprs. This will go through all of the VFs and revert the VF back to the standard model without the eswitch mode. To make this safe, ensure this function checks whether or not the represent or has been moved. Rely on the metadata destination in vf->repr->dst. This must be NULL if the representor has not been moved to eswitch mode. Ensure that we always re-assign this value back to NULL after freeing it, and move the ice_eswitch_release_reprs so that it can be called from the setup function. With these changes, eswitch cleanup no longer uses an unwind flow that is problematic for the planned VF data structure change. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-02-16 13:37:28 -08:00
return 0;
err:
ice_repr_rem_from_all_vfs(pf);
return err;
}
ice: set and release switchdev environment The switchdev environment has to be set up when the user creates VFs and the eswitch mode is switchdev. Release is done when the user deletes all VFs. The data path in this implementation is based on the control plane VSI. This VSI is used to pass traffic from port representors to the corresponding VFs and vice versa. A default TX rule has to be added to forward packets to the control plane VSI. This will redirect packets from VFs which don't match other rules to the control plane VSI. On the RX side, a default rule is added on the uplink VSI to receive all traffic that doesn't match other rules. When setting up the switchdev environment, all other rules from VFs should be removed. Packets to VFs will be forwarded by the control plane VSI. As a VF without any MAC rules can't send any packets because of the antispoof mechanism, VSI antispoof should be turned off on each VF. To send a packet from a representor to the correct VSI, the destination VSI field in the TX descriptor has to be filled. Allow that by setting the destination override bit in the control plane VSI security config. Packets from VFs will be received on the control plane VSI. The driver should decide to which netdev to forward the packet. The decision is made based on the src_vsi field from the descriptor. There is a target netdev list in the control plane VSI struct, which is used to choose the netdev based on the src_vsi number. Co-developed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com> Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com> Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-08-19 17:08:54 -07:00
/**
 * ice_repr_start_tx_queues - start Tx queues of port representor
 * @repr: pointer to repr structure
 *
 * Turns the carrier on for the representor's netdev and then starts all of
 * that netdev's Tx queues.
 */
void ice_repr_start_tx_queues(struct ice_repr *repr)
{
	struct net_device *netdev = repr->netdev;

	netif_carrier_on(netdev);
	netif_tx_start_all_queues(netdev);
}
/**
 * ice_repr_stop_tx_queues - stop Tx queues of port representor
 * @repr: pointer to repr structure
 *
 * Turns the carrier off for the representor's netdev and then stops all of
 * that netdev's Tx queues.
 */
void ice_repr_stop_tx_queues(struct ice_repr *repr)
{
	struct net_device *netdev = repr->netdev;

	netif_carrier_off(netdev);
	netif_tx_stop_all_queues(netdev);
}
ice: set and release switchdev environment The switchdev environment has to be set up when the user creates VFs and the eswitch mode is switchdev. Release is done when the user deletes all VFs. The data path in this implementation is based on the control plane VSI. This VSI is used to pass traffic from port representors to the corresponding VFs and vice versa. A default TX rule has to be added to forward packets to the control plane VSI. This will redirect packets from VFs which don't match other rules to the control plane VSI. On the RX side, a default rule is added on the uplink VSI to receive all traffic that doesn't match other rules. When setting up the switchdev environment, all other rules from VFs should be removed. Packets to VFs will be forwarded by the control plane VSI. As a VF without any MAC rules can't send any packets because of the antispoof mechanism, VSI antispoof should be turned off on each VF. To send a packet from a representor to the correct VSI, the destination VSI field in the TX descriptor has to be filled. Allow that by setting the destination override bit in the control plane VSI security config. Packets from VFs will be received on the control plane VSI. The driver should decide to which netdev to forward the packet. The decision is made based on the src_vsi field from the descriptor. There is a target netdev list in the control plane VSI struct, which is used to choose the netdev based on the src_vsi number. Co-developed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com> Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com> Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com> Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2021-08-19 17:08:54 -07:00
/**
 * ice_repr_set_traffic_vsi - set traffic VSI for port representor
 * @repr: repr on which the VSI will be set
 * @vsi: pointer to VSI that will be used by port representor to pass traffic
 *
 * Stores @vsi in the private data of the representor's netdev.
 */
void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi)
{
	struct ice_netdev_priv *priv;

	priv = netdev_priv(repr->netdev);
	priv->vsi = vsi;
}