fs: kill MNT_ONRB

Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist
instead of keeping it with mnt->mnt_list. This allows us to use
RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as
list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB.

This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't
set in @mnt->mnt_flags even though the mount was present in the mount
rbtree of the mount namespace.

The root cause is the following race. When a btrfs subvolume is mounted
a temporary mount is created:

btrfs_get_tree_subvol()
{
        mnt = fc_mount()
        // Register the newly allocated mount with sb->mounts:
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
}

and registered on sb->s_mounts. Later it is added to an anonymous mount
namespace via mount_subvol():

-> mount_subvol()
   -> mount_subtree()
      -> alloc_mnt_ns()
         mnt_add_to_ns()
         vfs_path_lookup()
         put_mnt_ns()

The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone
concurrently does a ro remount:

reconfigure_super()
-> sb_prepare_remount_readonly()
   {
           list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
   }

all mounts registered in sb->s_mounts are visited and first
MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally
MNT_WRITE_HOLD is removed again.

The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race
so MNT_ONRB might be lost.

Fixes: 2eea9ce431 ("mounts: keep list of mounts in an rbtree")
Cc: <stable@kernel.org> # v6.8+
Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org
Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2024-12-15 21:17:05 +01:00
parent 40384c840e
commit 344bac8f0d
No known key found for this signature in database
GPG key ID: 91C61BC06578DCA2
3 changed files with 16 additions and 16 deletions

View file

@ -38,6 +38,7 @@ struct mount {
struct dentry *mnt_mountpoint; struct dentry *mnt_mountpoint;
struct vfsmount mnt; struct vfsmount mnt;
union { union {
struct rb_node mnt_node; /* node in the ns->mounts rbtree */
struct rcu_head mnt_rcu; struct rcu_head mnt_rcu;
struct llist_node mnt_llist; struct llist_node mnt_llist;
}; };
@ -51,10 +52,7 @@ struct mount {
struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
union { struct list_head mnt_list;
struct rb_node mnt_node; /* Under ns->mounts */
struct list_head mnt_list;
};
struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave_list;/* list of slave mounts */
@ -145,11 +143,16 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0; return ns->seq == 0;
} }
static inline bool mnt_ns_attached(const struct mount *mnt)
{
return !RB_EMPTY_NODE(&mnt->mnt_node);
}
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{ {
WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); WARN_ON(!mnt_ns_attached(mnt));
mnt->mnt.mnt_flags &= ~MNT_ONRB;
rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
list_add_tail(&mnt->mnt_list, dt_list); list_add_tail(&mnt->mnt_list, dt_list);
} }

View file

@ -344,6 +344,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children); INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
RB_CLEAR_NODE(&mnt->mnt_node);
mnt->mnt.mnt_idmap = &nop_mnt_idmap; mnt->mnt.mnt_idmap = &nop_mnt_idmap;
} }
return mnt; return mnt;
@ -1124,7 +1125,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
struct rb_node **link = &ns->mounts.rb_node; struct rb_node **link = &ns->mounts.rb_node;
struct rb_node *parent = NULL; struct rb_node *parent = NULL;
WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); WARN_ON(mnt_ns_attached(mnt));
mnt->mnt_ns = ns; mnt->mnt_ns = ns;
while (*link) { while (*link) {
parent = *link; parent = *link;
@ -1135,7 +1136,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
} }
rb_link_node(&mnt->mnt_node, parent, link); rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts); rb_insert_color(&mnt->mnt_node, &ns->mounts);
mnt->mnt.mnt_flags |= MNT_ONRB;
} }
/* /*
@ -1305,7 +1305,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
} }
mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags = old->mnt.mnt_flags;
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active); atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@ -1763,7 +1763,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Gather the mounts to umount */ /* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) { for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT; p->mnt.mnt_flags |= MNT_UMOUNT;
if (p->mnt.mnt_flags & MNT_ONRB) if (mnt_ns_attached(p))
move_from_ns(p, &tmp_list); move_from_ns(p, &tmp_list);
else else
list_move(&p->mnt_list, &tmp_list); list_move(&p->mnt_list, &tmp_list);
@ -1912,16 +1912,14 @@ static int do_umount(struct mount *mnt, int flags)
event++; event++;
if (flags & MNT_DETACH) { if (flags & MNT_DETACH) {
if (mnt->mnt.mnt_flags & MNT_ONRB || if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE); umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0; retval = 0;
} else { } else {
shrink_submounts(mnt); shrink_submounts(mnt);
retval = -EBUSY; retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) { if (!propagate_mount_busy(mnt, 2)) {
if (mnt->mnt.mnt_flags & MNT_ONRB || if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0; retval = 0;
} }

View file

@ -50,7 +50,7 @@ struct path;
#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
#define MNT_INTERNAL 0x4000 #define MNT_INTERNAL 0x4000
@ -64,7 +64,6 @@ struct path;
#define MNT_SYNC_UMOUNT 0x2000000 #define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000 #define MNT_MARKED 0x4000000
#define MNT_UMOUNT 0x8000000 #define MNT_UMOUNT 0x8000000
#define MNT_ONRB 0x10000000
struct vfsmount { struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */ struct dentry *mnt_root; /* root of the mounted tree */