From a48bdf80ce6938f8c1de6a56fed7c4f6f46904e9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 16 Nov 2024 07:41:28 +0100 Subject: [PATCH 01/28] fs: delay sysctl_nr_open check in expand_files() Suppose a thread sharing the table started a resize, while sysctl_nr_open got lowered to a value which prohibits it. This is still going to go through with and without the patch, which is fine. Further suppose another thread shows up to do a matching expansion while resize_in_progress == true. It is going to error out since it performs the sysctl_nr_open check *before* finding out if there is an expansion in progress. But the aforementioned thread is going to succeed, so the error is spurious (and it would not happen if the thread showed up a little bit later). Checking the sysctl *after* we know there are no pending updates sorts it out. While here annotate the thing as unlikely. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241116064128.280870-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a..019fb9acf91b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -278,10 +278,6 @@ repeat: if (nr < fdt->max_fds) return 0; - /* Can we expand? */ - if (nr >= sysctl_nr_open) - return -EMFILE; - if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); @@ -289,6 +285,10 @@ repeat: goto repeat; } + /* Can we expand? */ + if (unlikely(nr >= sysctl_nr_open)) + return -EMFILE; + /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); From ea382199071931d19aac5f688b543e07360e2b64 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:34 +0100 Subject: [PATCH 02/28] vfs: support caching symlink lengths in inodes When utilized it dodges strlen() in vfs_readlink(), giving about 1.5% speed up when issuing readlink on /initrd.img on ext4. Filesystems opt in by calling inode_set_cached_link() when creating an inode. The size is stored in a new union utilizing the same space as i_devices, thus avoiding growing the struct or taking up any more space. Churn-wise the current readlink_copy() helper is patched to accept the size instead of calculating it.
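For illustration only, a rough sketch of the opt-in as a filesystem might do it when instantiating a fast symlink whose NUL-terminated body lives in the in-core inode. The foo_* names are hypothetical; inode_set_cached_link() is the helper added below and takes the link length without the trailing NUL:

    struct foo_inode_info *fi = FOO_I(inode);

    if (foo_inode_is_fast_symlink(inode)) {
        inode->i_op = &foo_fast_symlink_inode_operations;
        /* body is already NUL-terminated, length == i_size */
        inode_set_cached_link(inode, (char *)fi->i_data, inode->i_size);
    }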
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 34 +++++++++++++++++++--------------- fs/proc/namespaces.c | 2 +- include/linux/fs.h | 15 +++++++++++++-- security/apparmor/apparmorfs.c | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9d30c7aa9aa6..e56c29a22d26 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna getname(newname), 0); } -int readlink_copy(char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { - int len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; + int copylen; - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } /** @@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) const char *link; int res; + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); @@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(link)) return PTR_ERR(link); } - res = readlink_copy(buffer, buflen, link); + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } @@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { + const char *link; + int res; + DEFINE_DELAYED_CALL(done); - int res = readlink_copy(buffer, buflen, - page_get_link(dentry, d_inode(dentry), - &done)); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8e159fc78c0a..c610224faf10 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc98de5af43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for @@ -723,7 +724,10 @@ struct inode { }; struct file_lock_context *i_flctx; struct address_space i_data; - struct list_head i_devices; + union { + struct list_head i_devices; + int i_linklen; + }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; @@ -749,6 +753,13 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; 
+static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) +{ + inode->i_link = link; + inode->i_linklen = linklen; + inode->i_opflags |= IOP_CACHED_LINK; +} + /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. @@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2c0185ebc900..c07d150685d7 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer, res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, d_inode(dentry)->i_ino); if (res > 0 && res < sizeof(name)) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); else res = -ENOENT; From c7175957b28a69947dd1d36e8b19ac0d3c1a5d7d Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jul 2023 20:03:55 +0200 Subject: [PATCH 03/28] seqlock: annotate spinning as unlikely() in __read_seqcount_begin Annotation already used to be there, but got lost in 52ac39e5db5148f7 ("seqlock: seqcount_t: Implement all read APIs as statement expressions"). Does not look like it was intentional. Without it gcc 12 decides to compile the following in path_init: nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); into 2 cases of conditional jumps forward if the value is even, aka branch prediction miss by default in the common case on x86-64. With the patch jumps are only for odd values. 
before: [snip] mov 0x104fe96(%rip),%eax # 0xffffffff82409680 test $0x1,%al je 0xffffffff813b97fa pause mov 0x104fe8a(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b97ee mov %eax,0x48(%rbx) mov 0x104fdfd(%rip),%eax # 0xffffffff82409600 test $0x1,%al je 0xffffffff813b9813 pause mov 0x104fdf1(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b9807 [/snip] after: [snip] mov 0x104fec6(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b99af mov %eax,0x48(%rbx) mov 0x104fe35(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b999d [/snip] Interestingly .text gets slightly smaller (as reported by size(1)): before: 20702563 after: 20702429 Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20230727180355.813995-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..eb20dcaa51b5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s)) & 1) \ + while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ From bae80473f7b0b25772619e7692019b1549d4a82c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:35 +0100 Subject: [PATCH 04/28] ext4: use inode_set_cached_link() Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/ext4/inode.c | 3 ++- fs/ext4/namei.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89aade6f45f6..7c54ae5fcbd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bcf2737078b8..536d56d15072 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3418,7 +3418,6 @@ retry: inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; } } @@ -3434,6 +3433,9 @@ retry: disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) From 135ec43eb29c68ed26e2d10f221d43f7d9139a8f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Nov 2024 17:13:52 -0800 Subject: [PATCH 05/28] fiemap: use kernel-doc includes in fiemap docbook Add some kernel-doc notation to structs in fiemap header files then pull that into Documentation/filesystems/fiemap.rst instead of duplicating the header file structs in fiemap.rst. This helps to future-proof fiemap.rst against struct changes. 
Add missing flags documentation from header files into fiemap.rst for FIEMAP_FLAG_CACHE and FIEMAP_EXTENT_SHARED. Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20241121011352.201907-1-rdunlap@infradead.org Cc: Christoph Hellwig Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: Matthew Wilcox Signed-off-by: Christian Brauner --- Documentation/filesystems/fiemap.rst | 45 ++++++++------------------ include/linux/fiemap.h | 16 +++++++--- include/uapi/linux/fiemap.h | 47 +++++++++++++++++++--------- 3 files changed, 57 insertions(+), 51 deletions(-) diff --git a/Documentation/filesystems/fiemap.rst b/Documentation/filesystems/fiemap.rst index 93fc96f760aa..23b3ed229e49 100644 --- a/Documentation/filesystems/fiemap.rst +++ b/Documentation/filesystems/fiemap.rst @@ -12,21 +12,10 @@ returns a list of extents. Request Basics -------------- -A fiemap request is encoded within struct fiemap:: - - struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace cares about (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents; /* number of extents that were - * mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ - __u32 fm_reserved; - struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ - }; +A fiemap request is encoded within struct fiemap: +.. kernel-doc:: include/uapi/linux/fiemap.h + :identifiers: fiemap fm_start, and fm_length specify the logical range within the file which the process would like mappings for. Extents returned mirror @@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR If this flag is set, the extents returned will describe the inodes extended attribute lookup tree, instead of its data tree. +FIEMAP_FLAG_CACHE + This flag requests caching of the extents. Extent Mapping -------------- @@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST flag set (see the next section on extent flags). Each extent is described by a single fiemap_extent structure as -returned in fm_extents:: +returned in fm_extents: - struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent */ - __u64 fe_length; /* length in bytes for the extent */ - __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ - __u32 fe_reserved[3]; - }; +.. kernel-doc:: include/uapi/linux/fiemap.h + :identifiers: fiemap_extent All offsets and lengths are in bytes and mirror those on disk. It is valid for an extents logical offset to start before the request or its logical @@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED userspace would be highly inefficient, the kernel will try to merge most adjacent blocks into 'extents'. +FIEMAP_EXTENT_SHARED + This flag is set to request that space be shared with other files. 
VFS -> File System Implementation --------------------------------- @@ -191,14 +176,10 @@ each discovered extent:: u64 len); ->fiemap is passed struct fiemap_extent_info which describes the -fiemap request:: +fiemap request: - struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ - }; +.. kernel-doc:: include/linux/fiemap.h + :identifiers: fiemap_extent_info It is intended that the file system should not need to access any of this structure directly. Filesystem handlers should be tolerant to signals and return diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index c50882f19235..966092ffa89a 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -5,12 +5,18 @@ #include #include +/** + * struct fiemap_extent_info - fiemap request to a filesystem + * @fi_flags: Flags as passed from user + * @fi_extents_mapped: Number of mapped extents + * @fi_extents_max: Size of fiemap_extent array + * @fi_extents_start: Start of fiemap_extent array + */ struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ + unsigned int fi_flags; + unsigned int fi_extents_mapped; + unsigned int fi_extents_max; + struct fiemap_extent __user *fi_extents_start; }; int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 24ca0c00cae3..9d9e8ae32b41 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -14,37 +14,56 @@ #include +/** + * struct fiemap_extent - description of one fiemap extent + * @fe_logical: byte offset of the extent in the file + * @fe_physical: byte offset of extent on disk + * @fe_length: length in bytes for this extent + * @fe_flags: FIEMAP_EXTENT_* flags for this extent + */ struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent from the beginning of the file */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent from the beginning of the disk */ - __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_logical; + __u64 fe_physical; + __u64 fe_length; + /* private: */ __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + /* public: */ + __u32 fe_flags; + /* private: */ __u32 fe_reserved[3]; }; +/** + * struct fiemap - file extent mappings + * @fm_start: byte offset (inclusive) at which to start mapping (in) + * @fm_length: logical length of mapping which userspace wants (in) + * @fm_flags: FIEMAP_FLAG_* flags for request (in/out) + * @fm_mapped_extents: number of extents that were mapped (out) + * @fm_extent_count: size of fm_extents array (in) + * @fm_extents: array of mapped extents (out) + */ struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace wants (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ + 
__u64 fm_start; + __u64 fm_length; + __u32 fm_flags; + __u32 fm_mapped_extents; + __u32 fm_extent_count; + /* private: */ __u32 fm_reserved; - struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ + /* public: */ + struct fiemap_extent fm_extents[]; }; #define FIEMAP_MAX_OFFSET (~0ULL) +/* flags used in fm_flags: */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) +/* flags used in fe_flags: */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. From 657e726e0cb9ba4f583ae7d226100bc43cc43a41 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:36 +0100 Subject: [PATCH 06/28] tmpfs: use inode_set_cached_link() Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-4-mjguzik@gmail.com Signed-off-by: Christian Brauner --- mm/shmem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70..7beba4c1be5a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3914,6 +3914,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3935,12 +3936,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; From d727935cad9f6f52c8d184968f9720fdc966c669 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Sun, 24 Nov 2024 11:46:36 +0800 Subject: [PATCH 07/28] fs: fix proc_handler for sysctl_nr_open Use proc_douintvec_minmax() instead of proc_dointvec_minmax() to handle sysctl_nr_open, because its data type is unsigned int, not int. Fixes: 9b80a184eaad ("fs/file: more unsigned file descriptors") Signed-off-by: Jinliang Zheng Link: https://lore.kernel.org/r/20241124034636.325337-1-alexjlzheng@tencent.com Signed-off-by: Christian Brauner --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 976736be47cb..502b81f614d9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = { .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, From 1197867a5dc8924d83ce484b6fd361ca32423dac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 17:54:41 +0000 Subject: [PATCH 08/28] watch_queue: Use page->private instead of page->index We are attempting to eliminate page->index, so use page->private instead. 
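As an aside (not part of the patch): the same store and load can also be written with the generic page-private accessors from linux/mm.h, which some readers may find clearer:

    /* store the note offset when allocating the pages */
    set_page_private(pages[i], i * WATCH_QUEUE_NOTES_PER_PAGE);

    /* ...and read it back in the pipe_buf release hook */
    bit += page_private(page);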
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241125175443.2911738-1-willy@infradead.org Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 1895fbc32bcb..5267adeaa403 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; - bit += page->index; + bit += page->private; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); @@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; - pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); From 9b7da575f85962c44abe7dc245b0a58179ad2c45 Mon Sep 17 00:00:00 2001 From: shao mingyin Date: Wed, 23 Oct 2024 13:58:50 +0800 Subject: [PATCH 09/28] file: flush delayed work in delayed fput() The fput() of file rcS might not have completed causing issues when executing the file. rcS is opened in do_populate_rootfs before executed. At the end of do_populate_rootfs() flush_delayed_fput() is called. Now do_populate_rootfs() assumes that all fput()s caused by do_populate_rootfs() have completed. But flush_delayed_fput() can only ensure that fput() on the current delayed_fput_list has finished. Any file that has been removed from delayed_fput_list asynchronously in the meantime might not have completed causing the exec to fail. do_populate_rootfs delayed_fput_list delayed_fput execve fput() a fput() a->b fput() a->b->rcS __fput(a) fput() c fput() c->d __fput(b) flush_delayed_fput __fput(c) __fput(d) __fput(b) __fput(b) execve(rcS) Ensure that all delayed work is done by calling flush_delayed_work() in flush_delayed_fput() explicitly. Signed-off-by: Chen Lin Signed-off-by: Shao Mingyin Link: https://lore.kernel.org/r/20241023135850067m3w2R0UXESiVCYz_wdAoT@zte.com.cn Cc: Yang Yang Cc: Yang Tao Cc: Xu Xin [brauner: rewrite commit message] Signed-off-by: Christian Brauner --- fs/file_table.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 502b81f614d9..a32171d2b83f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work) __fput(container_of(work, struct file, f_task_work)); } +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we @@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work) void flush_delayed_fput(void) { delayed_fput(NULL); + flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); -static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); - void fput(struct file *file) { if (file_ref_put(&file->f_ref)) { From 3212a8f34021a16d13ace91d3ac5f451ef8d0103 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 30 Nov 2024 06:17:11 +0100 Subject: [PATCH 10/28] fs: use a consume fence in mnt_idmap() The routine is used in link_path_walk() for every path component. To my reading the entire point of the fence was to grab a fully populated mnt_idmap, but that's already going to happen with mere consume fence. Eliminates an actual fence on arm64. 
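A sketch of the ordering argument, not taken from the patch itself: the reader only ever dereferences the pointer it just loaded, so the address dependency alone orders those accesses against the writer's release store; READ_ONCE() is still needed so the compiler (and Alpha) cannot break the dependency:

    /* writer side, do_idmap_mount(): publish a fully initialised object */
    smp_store_release(&mnt->mnt_idmap, idmap);

    /* reader side: no acquire barrier needed, only dependency ordering */
    idmap = READ_ONCE(mnt->mnt_idmap);
    /* every later access is of the form idmap->..., i.e. a dependent load */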
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241130051712.1036527-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..33f17b6e8732 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ struct vfsmount { static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ - return smp_load_acquire(&mnt->mnt_idmap); + return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); From 4db9f52fa9b81addc412330957bb7a657d2f1ffb Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Mon, 2 Dec 2024 16:11:45 +0800 Subject: [PATCH 11/28] fs: fc_log replace magic number 7 with ARRAY_SIZE() Replace the hardcoded value `7` in `put_fc_log()` with `ARRAY_SIZE`. This improves maintainability by ensuring the loop adapts to changes in the buffer size. Signed-off-by: Guo Weikang Link: https://lore.kernel.org/r/20241202081146.1031780-1-guoweikang.kernel@gmail.com Signed-off-by: Christian Brauner --- fs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 98589aae5208..582d33e81117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc) if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; - for (i = 0; i <= 7; i++) + for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); From 175c6a216dda4c88f7050b67e75a6cf331086c75 Mon Sep 17 00:00:00 2001 From: Zhu Jun Date: Wed, 4 Dec 2024 00:12:18 -0800 Subject: [PATCH 12/28] fs: Fix grammar and spelling in propagate_umount() Fix grammar and spelling in the propagate_umount() function. Signed-off-by: Zhu Jun Link: https://lore.kernel.org/r/20241204081218.12141-1-zhujun2@cmss.chinamobile.com Signed-off-by: Christian Brauner --- fs/pnode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/pnode.c b/fs/pnode.c index a799e0315cc9..ef048f008bdd 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list) continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* - * We have come accross an partially unmounted - * mount in list that has not been visited yet. - * Remember it has been visited and continue - * about our merry way. + * We have come across a partially unmounted + * mount in a list that has not been visited + * yet. Remember it has been visited and + * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; From ec052fae814d467d6aa7e591b4b24531b87e65ec Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 5 Dec 2024 16:47:43 +0100 Subject: [PATCH 13/28] fs: sort out a stale comment about races between fd alloc and dup2 It claims the issue is only relevant for shared descriptor tables which is of no concern for POSIX (but then is POSIX of concern to anyone today?), which I presume predates standarized threading. 
The comment also mentions the following systems: - OpenBSD installing a larval file -- they moved away from it, file is installed late and EBUSY is returned on conflict - FreeBSD returning EBADF -- reworked to install the file early like OpenBSD used to do - NetBSD "deadlocks in amusing ways" -- their solution looks Solaris-inspired (not a compliment) and I would not be particularly surprised if it indeed deadlocked, in amusing ways or otherwise I don't believe mentioning any of these adds anything and the statement about the issue not being POSIX-relevant is outdated. dup2 description in POSIX still does not mention the problem. Just shorten the comment and be done with it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241205154743.1586584-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/file.c b/fs/file.c index 019fb9acf91b..d498715ef415 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1230,17 +1230,9 @@ __releases(&files->file_lock) /* * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * not finished descriptor. + * + * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); From af6505e5745b9f3a670de405b08b73573343c15c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: [PATCH 14/28] fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. 
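For context, a minimal userspace sketch of issuing an uncached buffered write with the new flag. It assumes fd, buf, len and offset already exist in the caller, and the fallback define mirrors the uapi value added in this series for libcs that do not know the flag yet:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <sys/uio.h>

    #ifndef RWF_DONTCACHE
    #define RWF_DONTCACHE 0x00000080	/* matches the uapi value added above */
    #endif

    struct iovec iov = { .iov_base = buf, .iov_len = len };
    ssize_t ret = pwritev2(fd, &iov, 1, offset, RWF_DONTCACHE);
    if (ret < 0 && errno == EOPNOTSUPP)	/* filesystem lacks FOP_DONTCACHE */
        ret = pwritev2(fd, &iov, 1, offset, 0);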
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 +++++++++++++- include/uapi/linux/fs.h | 6 +++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' From ab251dacfbae28772c897f068a4184f478189ff2 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 2 Jan 2025 09:22:56 +0100 Subject: [PATCH 15/28] fs/proc: do_task_stat: Fix ESP not readable during coredump The field "eip" (instruction pointer) and "esp" (stack pointer) of a task can be read from /proc/PID/stat. These fields can be interesting for coredump. However, these fields were disabled by commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat"), because it is generally unsafe to do so. But it is safe for a coredumping process, and therefore exceptions were made: - for a coredumping thread by commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping"). - for all other threads in a coredumping process by commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads"). 
The above two commits check the PF_DUMPCORE flag to determine a coredump thread and the PF_EXITING flag for the other threads. Unfortunately, commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") moved coredump to happen earlier and before PF_EXITING is set. Thus, checking PF_EXITING is no longer the correct way to determine threads in a coredumping process. Instead of PF_EXITING, use PF_POSTCOREDUMP to determine the other threads. Checking of PF_EXITING was added for coredumping, so it probably can now be removed. But it doesn't hurt to keep. Fixes: 92307383082d ("coredump: Don't perform any cleanups before dumping core") Cc: stable@vger.kernel.org Cc: Eric W. Biederman Acked-by: Oleg Nesterov Acked-by: Kees Cook Signed-off-by: Nam Cao Link: https://lore.kernel.org/r/d89af63d478d6c64cc46a01420b46fd6eb147d6f.1735805772.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ed3510d2bb..d6a0369caa93 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. */ - if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); From 15858da53542360931a457f32bcdc4287d13731f Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 2 Jan 2025 09:22:57 +0100 Subject: [PATCH 16/28] selftests: coredump: Add stackdump test Add a test which checks that the kstkesp field in /proc/pid/stat can be read for all threads of a coredumping process. For full details including the motivation for this test and how it works, see the README file added by this commit. Reviewed-by: John Ogness Signed-off-by: Nam Cao Link: https://lore.kernel.org/r/50e737b6576208566d14efcf1934fe840de6b1f4.1735805772.git.namcao@linutronix.de Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/Makefile | 7 + tools/testing/selftests/coredump/README.rst | 50 ++++++ tools/testing/selftests/coredump/stackdump | 14 ++ .../selftests/coredump/stackdump_test.c | 151 ++++++++++++++++++ 4 files changed, 222 insertions(+) create mode 100644 tools/testing/selftests/coredump/Makefile create mode 100644 tools/testing/selftests/coredump/README.rst create mode 100755 tools/testing/selftests/coredump/stackdump create mode 100644 tools/testing/selftests/coredump/stackdump_test.c diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile new file mode 100644 index 000000000000..ed210037b29d --- /dev/null +++ b/tools/testing/selftests/coredump/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS = $(KHDR_INCLUDES) + +TEST_GEN_PROGS := stackdump_test +TEST_FILES := stackdump + +include ../lib.mk diff --git a/tools/testing/selftests/coredump/README.rst b/tools/testing/selftests/coredump/README.rst new file mode 100644 index 000000000000..164a7aa181c8 --- /dev/null +++ b/tools/testing/selftests/coredump/README.rst @@ -0,0 +1,50 @@ +coredump selftest +================= + +Background context +------------------ + +`coredump` is a feature which dumps a process's memory space when the process terminates +unexpectedly (e.g. due to segmentation fault), which can be useful for debugging. 
By default, +`coredump` dumps the memory to the file named `core`, but this behavior can be changed by writing a +different file name to `/proc/sys/kernel/core_pattern`. Furthermore, `coredump` can be piped to a +user-space program by writing the pipe symbol (`|`) followed by the command to be executed to +`/proc/sys/kernel/core_pattern`. For the full description, see `man 5 core`. + +The piped user program may be interested in reading the stack pointers of the crashed process. The +crashed process's stack pointers can be read from `procfs`: it is the `kstkesp` field in +`/proc/$PID/stat`. See `man 5 proc` for all the details. + +The problem +----------- +While a thread is active, the stack pointer is unsafe to read and therefore the `kstkesp` field +reads zero. But when the thread is dead (e.g. during a coredump), this field should have valid +value. + +However, this was broken in the past and `kstkesp` was zero even during coredump: + +* commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") changed kstkesp to + always be zero + +* commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping") fixed it for the + coredumping thread. However, other threads in a coredumping process still had the problem. + +* commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads") fixed + for all threads in a coredumping process. + +* commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") broke it again + for the other threads in a coredumping process. + +The problem has been fixed now, but considering the history, it may appear again in the future. + +The goal of this test +--------------------- +This test detects problem with reading `kstkesp` during coredump by doing the following: + +#. Tell the kernel to execute the "stackdump" script when a coredump happens. This script + reads the stack pointers of all threads of crashed processes. + +#. Spawn a child process who creates some threads and then crashes. + +#. Read the output from the "stackdump" script, and make sure all stack pointer values are + non-zero. 
diff --git a/tools/testing/selftests/coredump/stackdump b/tools/testing/selftests/coredump/stackdump new file mode 100755 index 000000000000..96714ce42d12 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump @@ -0,0 +1,14 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +CRASH_PROGRAM_ID=$1 +STACKDUMP_FILE=$2 + +TMP=$(mktemp) + +for t in /proc/$CRASH_PROGRAM_ID/task/*; do + tid=$(basename $t) + cat /proc/$tid/stat | awk '{print $29}' >> $TMP +done + +mv $TMP $STACKDUMP_FILE diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c new file mode 100644 index 000000000000..137b2364a082 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define STACKDUMP_FILE "stack_values" +#define STACKDUMP_SCRIPT "stackdump" +#define NUM_THREAD_SPAWN 128 + +static void *do_nothing(void *) +{ + while (1) + pause(); +} + +static void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +FIXTURE(coredump) +{ + char original_core_pattern[256]; +}; + +FIXTURE_SETUP(coredump) +{ + char buf[PATH_MAX]; + FILE *file; + char *dir; + int ret; + + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret; + + unlink(STACKDUMP_FILE); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason); +} + +TEST_F(coredump, stackdump) +{ + struct sigaction action = {}; + unsigned long long stack; + char *test_dir, *line; + size_t line_length; + char buf[PATH_MAX]; + int ret, i; + FILE *file; + pid_t pid; + + /* + * Step 1: Setup core_pattern so that the stackdump script is executed when the child + * process crashes + */ + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + ASSERT_NE(-1, ret); + ASSERT_LT(ret, sizeof(buf)); + buf[ret] = '\0'; + + test_dir = dirname(buf); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + ASSERT_NE(NULL, file); + + ret = fprintf(file, "|%1$s/%2$s %%P %1$s/%3$s", test_dir, STACKDUMP_SCRIPT, STACKDUMP_FILE); + ASSERT_LT(0, ret); + + ret = fclose(file); + ASSERT_EQ(0, ret); + + /* Step 2: Create a process who spawns some threads then crashes */ + pid = fork(); + ASSERT_TRUE(pid >= 0); + if (pid == 0) + crashing_child(); + + /* + * Step 3: Wait for the stackdump script to write the stack pointers to the stackdump file + */ + for (i = 0; i < 10; ++i) { + file = fopen(STACKDUMP_FILE, "r"); + if (file) + break; + sleep(1); + } + ASSERT_NE(file, NULL); + + /* Step 4: Make sure all stack pointer values are non-zero 
*/ + for (i = 0; -1 != getline(&line, &line_length, file); ++i) { + stack = strtoull(line, NULL, 10); + ASSERT_NE(stack, 0); + } + + ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN); + + fclose(file); +} + +TEST_HARNESS_MAIN From aaec5a95d59615523db03dd53c2052f0a87beea7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Jan 2025 15:07:15 +0100 Subject: [PATCH 17/28] pipe_read: don't wake up the writer if the pipe is still full wake_up(pipe->wr_wait) makes no sense if pipe_full() is still true after the reading, the writer sleeping in wait_event(wr_wait, pipe_writable()) will check the pipe_writable() == !pipe_full() condition and sleep again. Only wake the writer if we actually released a pipe buf, and the pipe was full before we did so. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/all/20241229135737.GA3293@redhat.com/ Link: https://lore.kernel.org/r/20250102140715.GA7091@redhat.com Reported-by: WangYuli Signed-off-by: Christian Brauner --- fs/pipe.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 12b22c2723b7..82fede0f2111 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - bool was_full, wake_next_reader = false; + bool wake_writer = false, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ @@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) mutex_lock(&pipe->mutex); /* - * We only wake up writers if the pipe was full when we started - * reading in order to avoid unnecessary wakeups. + * We only wake up writers if the pipe was full when we started reading + * and it is no longer full after reading to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); @@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) + if (!buf->len) { + wake_writer |= pipe_full(head, tail, pipe->max_usage); tail = pipe_update_tail(pipe, buf, tail); + } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. 
*/ - if (unlikely(was_full)) + if (unlikely(wake_writer)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - mutex_lock(&pipe->mutex); - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); + wake_writer = false; wake_next_reader = true; + mutex_lock(&pipe->mutex); } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; mutex_unlock(&pipe->mutex); - if (was_full) + if (wake_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); From 5cf8f938bf5ca441a02a3bbf6ef772963aa387b3 Mon Sep 17 00:00:00 2001 From: Christian Kujau Date: Mon, 6 Jan 2025 16:32:05 +0100 Subject: [PATCH 18/28] vbox: Enable VBOXGUEST and VBOXSF_FS on ARM64 Now that VirtualBox is able to run as a host on arm64 (e.g. the Apple M3 processors) we can enable VBOXSF_FS (and in turn VBOXGUEST) for this architecture. Tested with various runs of bonnie++ and dbench on an Apple MacBook Pro with the latest Virtualbox 7.1.4 r165100 installed. Signed-off-by: Christian Kujau Link: https://lore.kernel.org/r/7384d96c-2a77-39b0-2306-90129bae9342@nerdbynature.de Reviewed-by: Hans de Goede Signed-off-by: Christian Brauner --- drivers/virt/vboxguest/Kconfig | 2 +- fs/vboxsf/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/virt/vboxguest/Kconfig b/drivers/virt/vboxguest/Kconfig index cc329887bfae..11b153e7454e 100644 --- a/drivers/virt/vboxguest/Kconfig +++ b/drivers/virt/vboxguest/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VBOXGUEST tristate "Virtual Box Guest integration support" - depends on X86 && PCI && INPUT + depends on (ARM64 || X86) && PCI && INPUT help This is a driver for the Virtual Box Guest PCI device used in Virtual Box virtual machines. Enabling this driver will add diff --git a/fs/vboxsf/Kconfig b/fs/vboxsf/Kconfig index b84586ae08b3..d4694026db8b 100644 --- a/fs/vboxsf/Kconfig +++ b/fs/vboxsf/Kconfig @@ -1,6 +1,6 @@ config VBOXSF_FS tristate "VirtualBox guest shared folder (vboxsf) support" - depends on X86 && VBOXGUEST + depends on (ARM64 || X86) && VBOXGUEST select NLS help VirtualBox hosts can share folders with guests, this driver From 344af27715ddbf357cf76978d674428b88f8e92d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Jan 2025 09:37:24 +0100 Subject: [PATCH 19/28] select: Fix unbalanced user_access_end() While working on implementing user access validation on powerpc I got the following warnings on a pmac32_defconfig build: CC fs/select.o fs/select.o: warning: objtool: sys_pselect6+0x1bc: redundant UACCESS disable fs/select.o: warning: objtool: sys_pselect6_time32+0x1bc: redundant UACCESS disable On powerpc/32s, user_read_access_begin/end() are no-ops, but the failure path has a user_access_end() instead of user_read_access_end() which means an access end without any prior access begin. Replace that user_access_end() by user_read_access_end(). 
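For reference, the shape of the pattern the fix restores, condensed from get_sigset_argpack() with the surrounding NULL check dropped: the access-end call must match the kind of access-begin on every exit path, including the unsafe_get_user() failure label:

    if (!user_read_access_begin(from, sizeof(*from)))
        return -EFAULT;
    unsafe_get_user(to->p, &from->p, Efault);
    unsafe_get_user(to->size, &from->size, Efault);
    user_read_access_end();
    return 0;
Efault:
    user_read_access_end();	/* plain user_access_end() is unbalanced on powerpc/32s */
    return -EFAULT;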
Fixes: 7e71609f64ec ("pselect6() and friends: take handling the combined 6th/7th args into helper") Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/a7139e28d767a13e667ee3c79599a8047222ef36.1736751221.git.christophe.leroy@csgroup.eu Signed-off-by: Christian Brauner --- fs/select.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/select.c b/fs/select.c index e223d1fe9d55..7da531b1cf6b 100644 --- a/fs/select.c +++ b/fs/select.c @@ -786,7 +786,7 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -1355,7 +1355,7 @@ static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } From 4f3b63e8a8a28e3dcdcf3ff260f57a732a20b92b Mon Sep 17 00:00:00 2001 From: Sentaro Onizuka Date: Tue, 14 Jan 2025 00:14:00 +0900 Subject: [PATCH 20/28] fs: Fix return type of do_mount() from long to int Fix the return type of the do_mount() function from long to int to match its actual behavior. The function only returns int values, and all callers, including those in fs/namespace.c and arch/alpha/kernel/osf_sys.c, already treat the return value as int. This change improves type consistency across the filesystem code and aligns the function signature with its existing implementation and usage. Signed-off-by: Sentaro Onizuka Link: https://lore.kernel.org/r/20250113151400.55512-1-sentaro@amazon.com Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/mount.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..5d808778a3ae 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3835,7 +3835,7 @@ int path_mount(const char *dev_name, struct path *path, data_page); } -long do_mount(const char *dev_name, const char __user *dir_name, +int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; diff --git a/include/linux/mount.h b/include/linux/mount.h index 33f17b6e8732..a7b472faec2c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -114,7 +114,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, +int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); From 4b193fa75efffd90c054d1a7f2b5dbe29a461c14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:37 +0100 Subject: [PATCH 21/28] lockref: remove lockref_put_not_zero lockref_put_not_zero is not used anywhere, and unless I'm missing something didn't end up being used at all. Remove it.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-2-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 1 - lib/lockref.c | 28 ---------------------------- 2 files changed, 29 deletions(-) diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c3a1f78bc884..e5aa0347f274 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,7 +37,6 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_not_zero(struct lockref *); extern int lockref_put_or_lock(struct lockref *); extern void lockref_mark_dead(struct lockref *); diff --git a/lib/lockref.c b/lib/lockref.c index 2afe4c5d8919..a68192c979b3 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -81,34 +81,6 @@ int lockref_get_not_zero(struct lockref *lockref) } EXPORT_SYMBOL(lockref_get_not_zero); -/** - * lockref_put_not_zero - Decrements count unless count <= 1 before decrement - * @lockref: pointer to lockref structure - * Return: 1 if count updated successfully or 0 if count would become zero - */ -int lockref_put_not_zero(struct lockref *lockref) -{ - int retval; - - CMPXCHG_LOOP( - new.count--; - if (old.count <= 1) - return 0; - , - return 1; - ); - - spin_lock(&lockref->lock); - retval = 0; - if (lockref->count > 1) { - lockref->count--; - retval = 1; - } - spin_unlock(&lockref->lock); - return retval; -} -EXPORT_SYMBOL(lockref_put_not_zero); - /** * lockref_put_return - Decrement reference count if possible * @lockref: pointer to lockref structure From d60f2280a1b5b9a4796f9a13f7fdff1d0b99f718 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:38 +0100 Subject: [PATCH 22/28] lockref: improve the lockref_get_not_zero description lockref_put_return returns exactly -1 and not "an error" when the lockref is dead or locked. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-3-hch@lst.de Signed-off-by: Christian Brauner --- lib/lockref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lockref.c b/lib/lockref.c index a68192c979b3..b1b042a9a6c8 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL(lockref_get_not_zero); * @lockref: pointer to lockref structure * * Decrement the reference count and return the new value. - * If the lockref was dead or locked, return an error. + * If the lockref was dead or locked, return -1. */ int lockref_put_return(struct lockref *lockref) { From 6d2868d5b6fca7534641440efe432cf268bd8e1b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:39 +0100 Subject: [PATCH 23/28] lockref: use bool for false/true returns Replace int used as bool with the actual bool type for return values that can only be true or false. 
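A hypothetical caller, only to show how the predicates now read at call sites:

    if (!lockref_get_not_dead(&dentry->d_lockref))
        return NULL;	/* lockref already marked dead */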
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-4-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 6 +++--- lib/lockref.c | 30 ++++++++++++++---------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/include/linux/lockref.h b/include/linux/lockref.h index e5aa0347f274..3d770e1bdbad 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -36,11 +36,11 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); -extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_or_lock(struct lockref *); +bool lockref_get_not_zero(struct lockref *lockref); +bool lockref_put_or_lock(struct lockref *lockref); extern void lockref_mark_dead(struct lockref *); -extern int lockref_get_not_dead(struct lockref *); +bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ static inline bool __lockref_is_dead(const struct lockref *l) diff --git a/lib/lockref.c b/lib/lockref.c index b1b042a9a6c8..5d8e3ef3860e 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -58,23 +58,22 @@ EXPORT_SYMBOL(lockref_get); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero */ -int lockref_get_not_zero(struct lockref *lockref) +bool lockref_get_not_zero(struct lockref *lockref) { - int retval; + bool retval = false; CMPXCHG_LOOP( new.count++; if (old.count <= 0) - return 0; + return false; , - return 1; + return true; ); spin_lock(&lockref->lock); - retval = 0; if (lockref->count > 0) { lockref->count++; - retval = 1; + retval = true; } spin_unlock(&lockref->lock); return retval; @@ -106,22 +105,22 @@ EXPORT_SYMBOL(lockref_put_return); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken */ -int lockref_put_or_lock(struct lockref *lockref) +bool lockref_put_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 1) break; , - return 1; + return true; ); spin_lock(&lockref->lock); if (lockref->count <= 1) - return 0; + return false; lockref->count--; spin_unlock(&lockref->lock); - return 1; + return true; } EXPORT_SYMBOL(lockref_put_or_lock); @@ -141,23 +140,22 @@ EXPORT_SYMBOL(lockref_mark_dead); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if lockref was dead */ -int lockref_get_not_dead(struct lockref *lockref) +bool lockref_get_not_dead(struct lockref *lockref) { - int retval; + bool retval = false; CMPXCHG_LOOP( new.count++; if (old.count < 0) - return 0; + return false; , - return 1; + return true; ); spin_lock(&lockref->lock); - retval = 0; if (lockref->count >= 0) { lockref->count++; - retval = 1; + retval = true; } spin_unlock(&lockref->lock); return retval; From 25d8060418b4e83e109b20f3b3931301e254b8f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:40 +0100 Subject: [PATCH 24/28] lockref: drop superfluous externs Drop the superfluous externs from the remaining prototypes in lockref.h. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-5-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 3d770e1bdbad..f821f46e9fb4 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,12 +34,12 @@ struct lockref { }; }; -extern void lockref_get(struct lockref *); -extern int lockref_put_return(struct lockref *); +void lockref_get(struct lockref *lockref); +int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); bool lockref_put_or_lock(struct lockref *lockref); -extern void lockref_mark_dead(struct lockref *); +void lockref_mark_dead(struct lockref *lockref); bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ From 63440d1c6dd1fc782db905319dbfb4db354e54b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:41 +0100 Subject: [PATCH 25/28] lockref: add a lockref_init helper Add a helper to initialize the lockdep, that is initialize the spinlock and set a value. Having to open code them isn't a big deal, but having an initializer feels right for a proper primitive. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-6-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/lockref.h b/include/linux/lockref.h index f821f46e9fb4..c39f119659ba 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,6 +34,17 @@ struct lockref { }; }; +/** + * lockref_init - Initialize a lockref + * @lockref: pointer to lockref structure + * @count: initial count + */ +static inline void lockref_init(struct lockref *lockref, unsigned int count) +{ + spin_lock_init(&lockref->lock); + lockref->count = count; +} + void lockref_get(struct lockref *lockref); int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); From 8c32b87c4f885fab3c9b2378a3f855dbf280fbca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:42 +0100 Subject: [PATCH 26/28] dcache: use lockref_init for d_lockref Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-7-hch@lst.de Signed-off-by: Christian Brauner --- fs/dcache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index b4d5e9e1e43d..1a01d7a6a7a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1681,9 +1681,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ - dentry->d_lockref.count = 1; dentry->d_flags = 0; - spin_lock_init(&dentry->d_lock); + lockref_init(&dentry->d_lockref, 1); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; From 6f86f1465b595864d4e4c58179f2ebcc3dbf5b62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:43 +0100 Subject: [PATCH 27/28] erofs: use lockref_init for pcl->lockref Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-8-hch@lst.de Reviewed-by: Gao Xiang Signed-off-by: Christian Brauner --- fs/erofs/zdata.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f147505487..59f143d9744f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -747,8 +747,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - spin_lock_init(&pcl->lockref.lock); - pcl->lockref.count = 1; /* one ref for this request */ + lockref_init(&pcl->lockref, 1); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; From 3e652eba244c222b0ba95a3f6fd79315eb020f73 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:44 +0100 Subject: [PATCH 28/28] gfs2: use lockref_init for qd_lockref Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-9-hch@lst.de Signed-off-by: Christian Brauner --- fs/gfs2/quota.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 72b48f6f5561..58bc5013ca49 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,8 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - qd->qd_lockref.count = 0; - spin_lock_init(&qd->qd_lockref.lock); + lockref_init(&qd->qd_lockref, 0); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru);