linux/drivers/infiniband/hw/mlx5/ib_rep.c


// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
 */

#include <linux/mlx5/vport.h>
#include "ib_rep.h"
#include "srq.h"
static int
mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
		      struct mlx5_eswitch_rep *rep,
		      int vport_index)
{
	struct mlx5_ib_dev *ibdev;
	struct net_device *ndev;

	ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
	if (!ibdev)
		return -EINVAL;

	ibdev->port[vport_index].rep = rep;
	rep->rep_data[REP_IB].priv = ibdev;
	ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport);

	return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1);
}

static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);
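
/*
 * With a shared FDB the master IB device also exposes ports for the LAG
 * peers' vports.  Grow *num_ports by each peer's total vport count;
 * outside multiport eswitch mode one port is subtracted per peer, since
 * a single IB port represents all uplinks.
 */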
static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports)
{
	struct mlx5_core_dev *peer_dev;
	int i;

	mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
		u32 peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);

		if (mlx5_lag_is_mpesw(peer_dev))
			*num_ports += peer_num_ports;
		else
			/* Only 1 ib port is the representor for all uplinks */
			*num_ports += peer_num_ports - 1;
	}
}
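
/*
 * eswitch rep_ops ->load callback, invoked per vport representor.  The
 * uplink representor allocates a new IB device and adds it with the raw
 * Ethernet profile; all other representors are attached as extra ports of
 * that device via mlx5_ib_set_vport_rep().  Under a shared-FDB LAG only
 * the master creates the IB device, and vport indices are offset by the
 * peers' vport counts.
 */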
static int
mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
	u32 num_ports = mlx5_eswitch_get_total_vports(dev);
	struct mlx5_core_dev *lag_master = dev;
	const struct mlx5_ib_profile *profile;
	struct mlx5_core_dev *peer_dev;
	struct mlx5_ib_dev *ibdev;
	int new_uplink = false;
	int vport_index;
	int ret;
	int i;

	vport_index = rep->vport_index;
	if (mlx5_lag_is_shared_fdb(dev)) {
		if (mlx5_lag_is_master(dev)) {
			mlx5_ib_num_ports_update(dev, &num_ports);
		} else {
			if (rep->vport == MLX5_VPORT_UPLINK) {
				if (!mlx5_lag_is_mpesw(dev))
					return 0;
				new_uplink = true;
			}
			mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) {
				u32 peer_n_ports = mlx5_eswitch_get_total_vports(peer_dev);

				if (mlx5_lag_is_master(peer_dev))
					lag_master = peer_dev;
				else if (!mlx5_lag_is_mpesw(dev))
					/* Only 1 ib port is the representor for all uplinks */
					peer_n_ports--;

				if (mlx5_get_dev_index(peer_dev) < mlx5_get_dev_index(dev))
					vport_index += peer_n_ports;
			}
		}
	}

	if (rep->vport == MLX5_VPORT_UPLINK && !new_uplink)
		profile = &raw_eth_profile;
	else
		return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);

	ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
					 mlx5_core_net(lag_master));
	if (!ibdev)
		return -ENOMEM;

	ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port),
			      GFP_KERNEL);
	if (!ibdev->port) {
		ret = -ENOMEM;
		goto fail_port;
	}

	ibdev->is_rep = true;
	vport_index = rep->vport_index;
	ibdev->port[vport_index].rep = rep;
	ibdev->mdev = lag_master;
	ibdev->num_ports = num_ports;
	ibdev->ib_dev.phys_port_cnt = num_ports;
	ret = ib_device_set_netdev(&ibdev->ib_dev,
				   mlx5_ib_get_rep_netdev(lag_master->priv.eswitch,
							  rep->vport),
				   vport_index + 1);
	if (ret)
		goto fail_add;

	ret = __mlx5_ib_add(ibdev, profile);
	if (ret)
		goto fail_add;

	rep->rep_data[REP_IB].priv = ibdev;
	if (mlx5_lag_is_shared_fdb(lag_master))
		mlx5_ib_register_peer_vport_reps(lag_master);

	return 0;

fail_add:
	kfree(ibdev->port);
fail_port:
	ib_dealloc_device(&ibdev->ib_dev);
	return ret;
}

static void *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep)
{
	return rep->rep_data[REP_IB].priv;
}
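
/*
 * eswitch rep_ops ->unload callback.  Clears the port's rep and netdev
 * mapping; on a shared-FDB slave the port index is found by searching for
 * the rep, since rep->vport_index does not match its slot in the master
 * IB device.  Unloading the uplink representor also tears down the IB
 * device itself, after unregistering the peers' REP_IB reps when the FDB
 * is shared.
 */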
static void
mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
{
	struct mlx5_core_dev *mdev = mlx5_eswitch_get_core_dev(rep->esw);
	struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep);
	int vport_index = rep->vport_index;
	struct mlx5_ib_port *port;
	int i;

	if (WARN_ON(!mdev))
		return;

	if (!dev)
		return;

	if (mlx5_lag_is_shared_fdb(mdev) &&
	    !mlx5_lag_is_master(mdev)) {
		if (rep->vport == MLX5_VPORT_UPLINK && !mlx5_lag_is_mpesw(mdev))
			return;
		for (i = 0; i < dev->num_ports; i++) {
			if (dev->port[i].rep == rep)
				break;
		}
		if (WARN_ON(i == dev->num_ports))
			return;
		vport_index = i;
	}

	port = &dev->port[vport_index];
	ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1);
	rep->rep_data[REP_IB].priv = NULL;
	port->rep = NULL;

	if (rep->vport == MLX5_VPORT_UPLINK) {
		if (mlx5_lag_is_shared_fdb(mdev) && !mlx5_lag_is_master(mdev))
			return;

		if (mlx5_lag_is_shared_fdb(mdev)) {
			struct mlx5_core_dev *peer_mdev;
			struct mlx5_eswitch *esw;

			mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) {
				esw = peer_mdev->priv.eswitch;
				mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
			}
		}
		__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
	}
}

static const struct mlx5_eswitch_rep_ops rep_ops = {
	.load = mlx5_ib_vport_rep_load,
	.unload = mlx5_ib_vport_rep_unload,
	.get_proto_dev = mlx5_ib_rep_to_dev,
};
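
/*
 * With a shared FDB the master registers the REP_IB rep_ops on every LAG
 * peer's eswitch as well, so the peers' vports load as ports of the
 * single master IB device.
 */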
static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev)
{
	struct mlx5_core_dev *peer_mdev;
	struct mlx5_eswitch *esw;
	int i;

	mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) {
		esw = peer_mdev->priv.eswitch;
		mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
	}
}

struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
					   u16 vport_num)
{
	return mlx5_eswitch_get_proto_dev(esw, vport_num, REP_ETH);
}
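
/*
 * Steer traffic sent on a representor SQ to the vport this IB port
 * represents, by installing an eswitch send-to-vport rule keyed on the
 * SQ's underlying QP number.
 */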
struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_sq *sq,
						   u32 port)
{
	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
	struct mlx5_eswitch_rep *rep;

	if (!dev->is_rep || !port)
		return NULL;

	if (!dev->port[port - 1].rep)
		return ERR_PTR(-EINVAL);

	rep = dev->port[port - 1].rep;

	return mlx5_eswitch_add_send_to_vport_rule(esw, esw, rep, sq->base.mqp.qpn);
}
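
/*
 * mlx5 core exposes an "rdma-rep" auxiliary device for devices in
 * switchdev mode; probe/remove simply register and unregister the REP_IB
 * representor ops with that device's eswitch.
 */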
static int mlx5r_rep_probe(struct auxiliary_device *adev,
			   const struct auxiliary_device_id *id)
{
	struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev);
	struct mlx5_core_dev *mdev = idev->mdev;
	struct mlx5_eswitch *esw;

	esw = mdev->priv.eswitch;
	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);

	return 0;
}

static void mlx5r_rep_remove(struct auxiliary_device *adev)
{
	struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev);
	struct mlx5_core_dev *mdev = idev->mdev;
	struct mlx5_eswitch *esw;

	esw = mdev->priv.eswitch;
	mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
}

static const struct auxiliary_device_id mlx5r_rep_id_table[] = {
	{ .name = MLX5_ADEV_NAME ".rdma-rep", },
	{},
};

MODULE_DEVICE_TABLE(auxiliary, mlx5r_rep_id_table);

static struct auxiliary_driver mlx5r_rep_driver = {
	.name = "rep",
	.probe = mlx5r_rep_probe,
	.remove = mlx5r_rep_remove,
	.id_table = mlx5r_rep_id_table,
};

int mlx5r_rep_init(void)
{
	return auxiliary_driver_register(&mlx5r_rep_driver);
}

void mlx5r_rep_cleanup(void)
{
	auxiliary_driver_unregister(&mlx5r_rep_driver);
}