linux/include/linux/mount.h
Christian Brauner 344bac8f0d
fs: kill MNT_ONRB
Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist
instead of keeping it with mnt->mnt_list. This allows us to use
RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as
list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB.

This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't
set in @mnt->mnt_flags even though the mount was present in the mount
rbtree of the mount namespace.

The root cause is the following race. When a btrfs subvolume is mounted
a temporary mount is created:

btrfs_get_tree_subvol()
{
        mnt = fc_mount()
        // Register the newly allocated mount with sb->mounts:
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
}

and registered on sb->s_mounts. Later it is added to an anonymous mount
namespace via mount_subvol():

-> mount_subvol()
   -> mount_subtree()
      -> alloc_mnt_ns()
         mnt_add_to_ns()
         vfs_path_lookup()
         put_mnt_ns()

The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone
concurrently does a ro remount:

reconfigure_super()
-> sb_prepare_remount_readonly()
   {
           list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
   }

all mounts registered in sb->s_mounts are visited and first
MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally
MNT_WRITE_HOLD is removed again.

The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race
so MNT_ONRB might be lost.

Fixes: 2eea9ce431 ("mounts: keep list of mounts in an rbtree")
Cc: <stable@kernel.org> # v6.8+
Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org
Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-09 16:58:50 +01:00

126 lines
4.3 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
*
* Definitions for mount interface. This describes the in the kernel build
* linkedlist with mounted filesystems.
*
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
*/
#ifndef _LINUX_MOUNT_H
#define _LINUX_MOUNT_H
#include <linux/types.h>
#include <asm/barrier.h>
struct super_block;
struct dentry;
struct user_namespace;
struct mnt_idmap;
struct file_system_type;
struct fs_context;
struct file;
struct path;
#define MNT_NOSUID 0x01
#define MNT_NODEV 0x02
#define MNT_NOEXEC 0x04
#define MNT_NOATIME 0x08
#define MNT_NODIRATIME 0x10
#define MNT_RELATIME 0x20
#define MNT_READONLY 0x40 /* does the user want this to be r/o? */
#define MNT_NOSYMFOLLOW 0x80
#define MNT_SHRINKABLE 0x100
#define MNT_WRITE_HOLD 0x200
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
/*
* MNT_SHARED_MASK is the set of flags that should be cleared when a
* mount becomes shared. Currently, this is only the flag that says a
* mount cannot be bind mounted, since this is how we create a mount
* that shares events with another mount. If you add a new MNT_*
* flag, consider how it interacts with shared mounts.
*/
#define MNT_SHARED_MASK (MNT_UNBINDABLE)
#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
| MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
| MNT_READONLY | MNT_NOSYMFOLLOW)
#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
#define MNT_INTERNAL 0x4000
#define MNT_LOCK_ATIME 0x040000
#define MNT_LOCK_NOEXEC 0x080000
#define MNT_LOCK_NOSUID 0x100000
#define MNT_LOCK_NODEV 0x200000
#define MNT_LOCK_READONLY 0x400000
#define MNT_LOCKED 0x800000
#define MNT_DOOMED 0x1000000
#define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000
#define MNT_UMOUNT 0x8000000
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
struct mnt_idmap *mnt_idmap;
} __randomize_layout;
static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{
/* Pairs with smp_store_release() in do_idmap_mount(). */
return smp_load_acquire(&mnt->mnt_idmap);
}
extern int mnt_want_write(struct vfsmount *mnt);
extern int mnt_want_write_file(struct file *file);
extern void mnt_drop_write(struct vfsmount *mnt);
extern void mnt_drop_write_file(struct file *file);
extern void mntput(struct vfsmount *mnt);
extern struct vfsmount *mntget(struct vfsmount *mnt);
extern void mnt_make_shortterm(struct vfsmount *mnt);
extern struct vfsmount *mnt_clone_internal(const struct path *path);
extern bool __mnt_is_readonly(struct vfsmount *mnt);
extern bool mnt_may_suid(struct vfsmount *mnt);
extern struct vfsmount *clone_private_mount(const struct path *path);
int mnt_get_write_access(struct vfsmount *mnt);
void mnt_put_write_access(struct vfsmount *mnt);
extern struct vfsmount *fc_mount(struct fs_context *fc);
extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
int flags, const char *name,
void *data);
extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
struct file_system_type *type,
const char *name, void *data);
extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
extern void mark_mounts_for_expiry(struct list_head *mounts);
extern bool path_is_mountpoint(const struct path *path);
extern bool our_mnt(struct vfsmount *mnt);
extern struct vfsmount *kern_mount(struct file_system_type *);
extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(const char *, const char __user *,
const char *, unsigned long, void *);
extern struct vfsmount *collect_mounts(const struct path *);
extern void drop_collected_mounts(struct vfsmount *);
extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
struct vfsmount *);
extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
extern int cifs_root_data(char **dev, char **opts);
#endif /* _LINUX_MOUNT_H */