linux/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
Cosmin Ratiu 463e517696 IB/IPoIB: Replace vlan_rwsem with the netdev instance lock
vlan_rwsem was added more than a decade ago to work around a deadlock
involving the original mutex being acquired twice, once from the wq.
Subsequent changes then tweaked it to partially protect access to
ipoib_dev_priv->child_intfs together with the RTNL. Flushing the wq
synchronously was also since then refactored to happen separately.

This semaphore unfortunately prevents updating ipoib to work with
devices that require the netdev lock, because of lock ordering issues
between RTNL, vlan_rwsem and the netdev instance locks of parent and
child devices.

To uncomplicate things, this commit replaces vlan_rwsem with the netdev
instance lock of the parent device. Both parent child_intfs list and the
children's list membership in it require holding the parent netdev
instance lock.

All call paths were carefully reviewed and no-longer-needed ASSERT_RTNL
calls were dropped. Some non-trivial changes:
- ipoib_match_gid_pkey_addr() now only acquires the instance lock and
  iterates through child_intfs for the first level of recursion (the
  parent), as it's not possible to have multiple levels of nested
  subinterfaces.
- ipoib_open() and ipoib_stop() schedule tasks on the global workqueue
  to open/stop child interfaces to avoid potentially acquiring nested
  netdev instance locks. To avoid the device going away between the task
  scheduling and execution, netdev_hold/netdev_put are used.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1747829342-1018757-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-05-22 09:15:04 -07:00

298 lines
7.5 KiB
C

/*
* Copyright (c) 2004 Topspin Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/sched/signal.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include "ipoib.h"
static ssize_t parent_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_device *dev = to_net_dev(d);
struct ipoib_dev_priv *priv = ipoib_priv(dev);
return sysfs_emit(buf, "%s\n", priv->parent->name);
}
static DEVICE_ATTR_RO(parent);
static bool is_child_unique(struct ipoib_dev_priv *ppriv,
struct ipoib_dev_priv *priv)
{
struct ipoib_dev_priv *tpriv;
bool result = true;
/*
* Since the legacy sysfs interface uses pkey for deletion it cannot
* support more than one interface with the same pkey, it creates
* ambiguity. The RTNL interface deletes using the netdev so it does
* not have a problem to support duplicated pkeys.
*/
if (priv->child_type != IPOIB_LEGACY_CHILD)
return true;
/*
* First ensure this isn't a duplicate. We check the parent device and
* then all of the legacy child interfaces to make sure the Pkey
* doesn't match.
*/
if (ppriv->pkey == priv->pkey)
return false;
netdev_lock(ppriv->dev);
list_for_each_entry(tpriv, &ppriv->child_intfs, list) {
if (tpriv->pkey == priv->pkey &&
tpriv->child_type == IPOIB_LEGACY_CHILD) {
result = false;
break;
}
}
netdev_unlock(ppriv->dev);
return result;
}
/*
* NOTE: If this function fails then the priv->dev will remain valid, however
* priv will have been freed and must not be touched by caller in the error
* case.
*
* If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to
* free the net_device (just as rtnl_newlink does) otherwise the net_device
* will be freed when the rtnl is unlocked.
*/
int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
u16 pkey, int type)
{
struct net_device *ndev = priv->dev;
int result;
struct rdma_netdev *rn = netdev_priv(ndev);
/*
* We do not need to touch priv if register_netdevice fails, so just
* always use this flow.
*/
ndev->priv_destructor = ipoib_intf_free;
/*
* Racing with unregister of the parent must be prevented by the
* caller.
*/
WARN_ON(ppriv->dev->reg_state != NETREG_REGISTERED);
if (pkey == 0 || pkey == 0x8000) {
result = -EINVAL;
goto out_early;
}
rn->mtu = priv->mcast_mtu;
priv->parent = ppriv->dev;
priv->pkey = pkey;
priv->child_type = type;
if (!is_child_unique(ppriv, priv)) {
result = -ENOTUNIQ;
goto out_early;
}
result = register_netdevice(ndev);
if (result) {
ipoib_warn(priv, "failed to initialize; error %i", result);
/*
* register_netdevice sometimes calls priv_destructor,
* sometimes not. Make sure it was done.
*/
goto out_early;
}
/* RTNL childs don't need proprietary sysfs entries */
if (type == IPOIB_LEGACY_CHILD) {
if (ipoib_cm_add_mode_attr(ndev))
goto sysfs_failed;
if (ipoib_add_pkey_attr(ndev))
goto sysfs_failed;
if (ipoib_add_umcast_attr(ndev))
goto sysfs_failed;
if (device_create_file(&ndev->dev, &dev_attr_parent))
goto sysfs_failed;
}
return 0;
sysfs_failed:
unregister_netdevice(priv->dev);
return -ENOMEM;
out_early:
if (ndev->priv_destructor)
ndev->priv_destructor(ndev);
return result;
}
int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
{
struct ipoib_dev_priv *ppriv, *priv;
char intf_name[IFNAMSIZ];
struct net_device *ndev;
int result;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (!rtnl_trylock())
return restart_syscall();
if (pdev->reg_state != NETREG_REGISTERED) {
rtnl_unlock();
return -EPERM;
}
ppriv = ipoib_priv(pdev);
/* If you increase IFNAMSIZ, update snprintf below
* to allow longer names.
*/
BUILD_BUG_ON(IFNAMSIZ != 16);
snprintf(intf_name, sizeof(intf_name), "%.10s.%04x", ppriv->dev->name,
pkey);
ndev = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name);
if (IS_ERR(ndev)) {
result = PTR_ERR(ndev);
goto out;
}
priv = ipoib_priv(ndev);
ndev->rtnl_link_ops = ipoib_get_link_ops();
result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
if (result && ndev->reg_state == NETREG_UNINITIALIZED)
free_netdev(ndev);
out:
rtnl_unlock();
return result;
}
struct ipoib_vlan_delete_work {
struct work_struct work;
struct net_device *dev;
};
/*
* sysfs callbacks of a netdevice cannot obtain the rtnl lock as
* unregister_netdev ultimately deletes the sysfs files while holding the rtnl
* lock. This deadlocks the system.
*
* A callback can use rtnl_trylock to avoid the deadlock but it cannot call
* unregister_netdev as that internally takes and releases the rtnl_lock. So
* instead we find the netdev to unregister and then do the actual unregister
* from the global work queue where we can obtain the rtnl_lock safely.
*/
static void ipoib_vlan_delete_task(struct work_struct *work)
{
struct ipoib_vlan_delete_work *pwork =
container_of(work, struct ipoib_vlan_delete_work, work);
struct net_device *dev = pwork->dev;
rtnl_lock();
/* Unregistering tasks can race with another task or parent removal */
if (dev->reg_state == NETREG_REGISTERED) {
struct ipoib_dev_priv *priv = ipoib_priv(dev);
struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name);
unregister_netdevice(dev);
}
rtnl_unlock();
kfree(pwork);
}
int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
{
struct ipoib_dev_priv *ppriv, *priv, *tpriv;
int rc;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (!rtnl_trylock())
return restart_syscall();
if (pdev->reg_state != NETREG_REGISTERED) {
rtnl_unlock();
return -EPERM;
}
ppriv = ipoib_priv(pdev);
rc = -ENODEV;
netdev_lock(ppriv->dev);
list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
if (priv->pkey == pkey &&
priv->child_type == IPOIB_LEGACY_CHILD) {
struct ipoib_vlan_delete_work *work;
work = kmalloc(sizeof(*work), GFP_KERNEL);
if (!work) {
rc = -ENOMEM;
goto out;
}
list_del_init(&priv->list);
work->dev = priv->dev;
INIT_WORK(&work->work, ipoib_vlan_delete_task);
queue_work(ipoib_workqueue, &work->work);
rc = 0;
break;
}
}
out:
netdev_unlock(ppriv->dev);
rtnl_unlock();
return rc;
}