mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00
net-sysfs: remove rtnl_trylock from device attributes
There is an ABBA deadlock between net device unregistration and sysfs files being accessed[1][2]. To prevent this from happening all paths taking the rtnl lock after the sysfs one (actually kn->active refcount) use rtnl_trylock and return early (using restart_syscall)[3], which can make syscalls to spin for a long time when there is contention on the rtnl lock[4]. There are not many possibilities to improve the above: - Rework the entire net/ locking logic. - Invert two locks in one of the paths — not possible. But here it's actually possible to drop one of the locks safely: the kernfs_node refcount. More details in the code itself, which comes with lots of comments. Note that we check the device is alive in the added sysfs_rtnl_lock helper to disallow sysfs operations to run after device dismantle has started. This also help keeping the same behavior as before. Because of this calls to dev_isalive in sysfs ops were removed. [1] https://lore.kernel.org/netdev/49A4D5D5.5090602@trash.net/ [2] https://lore.kernel.org/netdev/m14oyhis31.fsf@fess.ebiederm.org/ [3] https://lore.kernel.org/netdev/20090226084924.16cb3e08@nehalam/ [4] https://lore.kernel.org/all/20210928125500.167943-1-atenart@kernel.org/T/ Signed-off-by: Antoine Tenart <atenart@kernel.org> Link: https://patch.msgid.link/20250204170314.146022-2-atenart@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
parent
0bea93fdba
commit
79c61899b5
3 changed files with 139 additions and 53 deletions
|
@ -43,6 +43,7 @@ extern void rtnl_lock(void);
|
|||
extern void rtnl_unlock(void);
|
||||
extern int rtnl_trylock(void);
|
||||
extern int rtnl_is_locked(void);
|
||||
extern int rtnl_lock_interruptible(void);
|
||||
extern int rtnl_lock_killable(void);
|
||||
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
|
||||
|
||||
|
|
|
@ -42,6 +42,87 @@ static inline int dev_isalive(const struct net_device *dev)
|
|||
return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
|
||||
}
|
||||
|
||||
/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active,
|
||||
* when unregistering a net device and accessing associated sysfs files. The
|
||||
* potential deadlock is as follow:
|
||||
*
|
||||
* CPU 0 CPU 1
|
||||
*
|
||||
* rtnl_lock vfs_read
|
||||
* unregister_netdevice_many kernfs_seq_start
|
||||
* device_del / kobject_put kernfs_get_active (kn->active++)
|
||||
* kernfs_drain sysfs_kf_seq_show
|
||||
* wait_event( rtnl_lock
|
||||
* kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release
|
||||
* -> waits on CPU 1 to decrease kn->active the rtnl lock.
|
||||
*
|
||||
* The historical fix was to use rtnl_trylock with restart_syscall to bail out
|
||||
* of sysfs operations when the lock couldn't be taken. This fixed the above
|
||||
* issue as it allowed CPU 1 to bail out of the ABBA situation.
|
||||
*
|
||||
* But it came with performances issues, as syscalls are being restarted in
|
||||
* loops when there was contention on the rtnl lock, with huge slow downs in
|
||||
* specific scenarios (e.g. lots of virtual interfaces created and userspace
|
||||
* daemons querying their attributes).
|
||||
*
|
||||
* The idea below is to bail out of the active kernfs_node protection
|
||||
* (kn->active) while trying to take the rtnl lock.
|
||||
*
|
||||
* This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The
|
||||
* net device is guaranteed to be alive if this returns successfully.
|
||||
*/
|
||||
static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr,
|
||||
struct net_device *ndev)
|
||||
{
|
||||
struct kernfs_node *kn;
|
||||
int ret = 0;
|
||||
|
||||
/* First, we hold a reference to the net device as the unregistration
|
||||
* path might run in parallel. This will ensure the net device and the
|
||||
* associated sysfs objects won't be freed while we try to take the rtnl
|
||||
* lock.
|
||||
*/
|
||||
dev_hold(ndev);
|
||||
/* sysfs_break_active_protection was introduced to allow self-removal of
|
||||
* devices and their associated sysfs files by bailing out of the
|
||||
* sysfs/kernfs protection. We do this here to allow the unregistration
|
||||
* path to complete in parallel. The following takes a reference on the
|
||||
* kobject and the kernfs_node being accessed.
|
||||
*
|
||||
* This works because we hold a reference onto the net device and the
|
||||
* unregistration path will wait for us eventually in netdev_run_todo
|
||||
* (outside an rtnl lock section).
|
||||
*/
|
||||
kn = sysfs_break_active_protection(kobj, attr);
|
||||
/* We can now try to take the rtnl lock. This can't deadlock us as the
|
||||
* unregistration path is able to drain sysfs files (kernfs_node) thanks
|
||||
* to the above dance.
|
||||
*/
|
||||
if (rtnl_lock_interruptible()) {
|
||||
ret = -ERESTARTSYS;
|
||||
goto unbreak;
|
||||
}
|
||||
/* Check dismantle on the device hasn't started, otherwise deny the
|
||||
* operation.
|
||||
*/
|
||||
if (!dev_isalive(ndev)) {
|
||||
rtnl_unlock();
|
||||
ret = -ENODEV;
|
||||
goto unbreak;
|
||||
}
|
||||
/* We are now sure the device dismantle hasn't started nor that it can
|
||||
* start before we exit the locking section as we hold the rtnl lock.
|
||||
* There's no need to keep unbreaking the sysfs protection nor to hold
|
||||
* a net device reference from that point; that was only needed to take
|
||||
* the rtnl lock.
|
||||
*/
|
||||
unbreak:
|
||||
sysfs_unbreak_active_protection(kn);
|
||||
dev_put(ndev);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* use same locking rules as GIF* ioctl's */
|
||||
static ssize_t netdev_show(const struct device *dev,
|
||||
struct device_attribute *attr, char *buf,
|
||||
|
@ -95,14 +176,14 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
|
|||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (dev_isalive(netdev)) {
|
||||
ret = (*set)(netdev, new);
|
||||
if (ret == 0)
|
||||
ret = len;
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
err:
|
||||
return ret;
|
||||
|
@ -220,7 +301,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
|
|||
struct net_device *netdev = to_net_dev(dev);
|
||||
|
||||
/* The check is also done in change_carrier; this helps returning early
|
||||
* without hitting the trylock/restart in netdev_store.
|
||||
* without hitting the locking section in netdev_store.
|
||||
*/
|
||||
if (!netdev->netdev_ops->ndo_change_carrier)
|
||||
return -EOPNOTSUPP;
|
||||
|
@ -234,8 +315,9 @@ static ssize_t carrier_show(struct device *dev,
|
|||
struct net_device *netdev = to_net_dev(dev);
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (netif_running(netdev)) {
|
||||
/* Synchronize carrier state with link watch,
|
||||
|
@ -245,8 +327,8 @@ static ssize_t carrier_show(struct device *dev,
|
|||
|
||||
ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
|
||||
}
|
||||
rtnl_unlock();
|
||||
|
||||
rtnl_unlock();
|
||||
return ret;
|
||||
}
|
||||
static DEVICE_ATTR_RW(carrier);
|
||||
|
@ -258,13 +340,14 @@ static ssize_t speed_show(struct device *dev,
|
|||
int ret = -EINVAL;
|
||||
|
||||
/* The check is also done in __ethtool_get_link_ksettings; this helps
|
||||
* returning early without hitting the trylock/restart below.
|
||||
* returning early without hitting the locking section below.
|
||||
*/
|
||||
if (!netdev->ethtool_ops->get_link_ksettings)
|
||||
return ret;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (netif_running(netdev)) {
|
||||
struct ethtool_link_ksettings cmd;
|
||||
|
@ -284,13 +367,14 @@ static ssize_t duplex_show(struct device *dev,
|
|||
int ret = -EINVAL;
|
||||
|
||||
/* The check is also done in __ethtool_get_link_ksettings; this helps
|
||||
* returning early without hitting the trylock/restart below.
|
||||
* returning early without hitting the locking section below.
|
||||
*/
|
||||
if (!netdev->ethtool_ops->get_link_ksettings)
|
||||
return ret;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (netif_running(netdev)) {
|
||||
struct ethtool_link_ksettings cmd;
|
||||
|
@ -490,16 +574,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
|
|||
if (len > 0 && buf[len - 1] == '\n')
|
||||
--count;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (dev_isalive(netdev)) {
|
||||
ret = dev_set_alias(netdev, buf, count);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
ret = len;
|
||||
netdev_state_change(netdev);
|
||||
}
|
||||
err:
|
||||
rtnl_unlock();
|
||||
|
||||
|
@ -551,24 +634,23 @@ static ssize_t phys_port_id_show(struct device *dev,
|
|||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct net_device *netdev = to_net_dev(dev);
|
||||
struct netdev_phys_item_id ppid;
|
||||
ssize_t ret = -EINVAL;
|
||||
|
||||
/* The check is also done in dev_get_phys_port_id; this helps returning
|
||||
* early without hitting the trylock/restart below.
|
||||
* early without hitting the locking section below.
|
||||
*/
|
||||
if (!netdev->netdev_ops->ndo_get_phys_port_id)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
|
||||
if (dev_isalive(netdev)) {
|
||||
struct netdev_phys_item_id ppid;
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = dev_get_phys_port_id(netdev, &ppid);
|
||||
if (!ret)
|
||||
ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
|
||||
return ret;
|
||||
|
@ -580,24 +662,23 @@ static ssize_t phys_port_name_show(struct device *dev,
|
|||
{
|
||||
struct net_device *netdev = to_net_dev(dev);
|
||||
ssize_t ret = -EINVAL;
|
||||
char name[IFNAMSIZ];
|
||||
|
||||
/* The checks are also done in dev_get_phys_port_name; this helps
|
||||
* returning early without hitting the trylock/restart below.
|
||||
* returning early without hitting the locking section below.
|
||||
*/
|
||||
if (!netdev->netdev_ops->ndo_get_phys_port_name &&
|
||||
!netdev->devlink_port)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
|
||||
if (dev_isalive(netdev)) {
|
||||
char name[IFNAMSIZ];
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = dev_get_phys_port_name(netdev, name, sizeof(name));
|
||||
if (!ret)
|
||||
ret = sysfs_emit(buf, "%s\n", name);
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
|
||||
return ret;
|
||||
|
@ -608,26 +689,25 @@ static ssize_t phys_switch_id_show(struct device *dev,
|
|||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct net_device *netdev = to_net_dev(dev);
|
||||
struct netdev_phys_item_id ppid = { };
|
||||
ssize_t ret = -EINVAL;
|
||||
|
||||
/* The checks are also done in dev_get_phys_port_name; this helps
|
||||
* returning early without hitting the trylock/restart below. This works
|
||||
* returning early without hitting the locking section below. This works
|
||||
* because recurse is false when calling dev_get_port_parent_id.
|
||||
*/
|
||||
if (!netdev->netdev_ops->ndo_get_port_parent_id &&
|
||||
!netdev->devlink_port)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!rtnl_trylock())
|
||||
return restart_syscall();
|
||||
|
||||
if (dev_isalive(netdev)) {
|
||||
struct netdev_phys_item_id ppid = { };
|
||||
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = dev_get_port_parent_id(netdev, &ppid, false);
|
||||
if (!ret)
|
||||
ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
|
||||
return ret;
|
||||
|
|
|
@ -80,6 +80,11 @@ void rtnl_lock(void)
|
|||
}
|
||||
EXPORT_SYMBOL(rtnl_lock);
|
||||
|
||||
int rtnl_lock_interruptible(void)
|
||||
{
|
||||
return mutex_lock_interruptible(&rtnl_mutex);
|
||||
}
|
||||
|
||||
int rtnl_lock_killable(void)
|
||||
{
|
||||
return mutex_lock_killable(&rtnl_mutex);
|
||||
|
|
Loading…
Add table
Reference in a new issue