/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * vma_internal.h
 *
 * Header providing userland wrappers and shims for the functionality provided
 * by mm/vma_internal.h.
 *
 * We make the header guard the same as mm/vma_internal.h, so if this shim
 * header is included, it precludes the inclusion of the kernel one.
 */

#ifndef __MM_VMA_INTERNAL_H
#define __MM_VMA_INTERNAL_H

#define __private
#define __bitwise
#define __randomize_layout

#define CONFIG_MMU
#define CONFIG_PER_VMA_LOCK

#include <stdlib.h>

#include <linux/list.h>
#include <linux/maple_tree.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>

extern unsigned long stack_guard_gap;
#ifdef CONFIG_MMU
extern unsigned long mmap_min_addr;
extern unsigned long dac_mmap_min_addr;
#else
#define mmap_min_addr 0UL
#define dac_mmap_min_addr 0UL
#endif

#define VM_WARN_ON(_expr) (WARN_ON(_expr))
#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr))
#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr))
#define VM_BUG_ON(_expr) (BUG_ON(_expr))
#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr))

#define MMF_HAS_MDWE 28

#define VM_NONE 0x00000000
#define VM_READ 0x00000001
#define VM_WRITE 0x00000002
#define VM_EXEC 0x00000004
#define VM_SHARED 0x00000008
#define VM_MAYREAD 0x00000010
#define VM_MAYWRITE 0x00000020
#define VM_MAYEXEC 0x00000040
#define VM_GROWSDOWN 0x00000100
#define VM_PFNMAP 0x00000400
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000
#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
#define VM_DONTEXPAND 0x00040000
#define VM_LOCKONFAULT 0x00080000
#define VM_ACCOUNT 0x00100000
#define VM_NORESERVE 0x00200000
#define VM_MIXEDMAP 0x10000000
#define VM_STACK VM_GROWSDOWN
#define VM_SHADOW_STACK VM_NONE
#define VM_SOFTDIRTY 0
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
#define VM_GROWSUP VM_NONE

#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK VM_GROWSUP
#define VM_STACK_EARLY VM_GROWSDOWN
#else
#define VM_STACK VM_GROWSDOWN
#define VM_STACK_EARLY 0
#endif

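/*
 * Hard-coded address space layout used by the userland tests; the values
 * mirror x86-64 with 4-level page tables (a 47-bit user address space).
 */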
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW
#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW
#define STACK_TOP TASK_SIZE_LOW
#define STACK_TOP_MAX TASK_SIZE_MAX

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)

#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC

#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)

#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

#define RLIMIT_STACK 3 /* max stack size */
#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */

#define CAP_IPC_LOCK 14

#ifdef CONFIG_64BIT
#define VM_SEALED_BIT 42
#define VM_SEALED BIT(VM_SEALED_BIT)
#else
#define VM_SEALED VM_NONE
#endif

#define FIRST_USER_ADDRESS 0UL
#define USER_PGTABLES_CEILING 0UL

#define vma_policy(vma) NULL

#define down_write_nest_lock(sem, nest_lock)

#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )

#define for_each_vma(__vmi, __vma) \
	while (((__vma) = vma_next(&(__vmi))) != NULL)

/* The MM code likes to work with exclusive end addresses */
#define for_each_vma_range(__vmi, __vma, __end) \
	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)

#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))

#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr)
#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr)

#define TASK_SIZE ((1ul << 47)-PAGE_SIZE)

#define AS_MM_ALL_LOCKS 2

/* We hardcode this for now. */
#define sysctl_max_map_count 0x1000000UL

#define pgoff_t unsigned long
typedef unsigned long pgprotval_t;
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
typedef unsigned long vm_flags_t;
typedef __bitwise unsigned int vm_fault_t;

/*
 * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...)
 * either way :)
 */
#define pr_warn_once pr_err
#define data_race(expr) expr

#define ASSERT_EXCLUSIVE_WRITER(x)

/**
 * swap - swap values of @a and @b
 * @a: first value
 * @b: second value
 */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

struct kref {
	refcount_t refcount;
};

/*
 * Define the task command name length as enum, then it can be visible to
 * BPF programs.
 */
enum {
	TASK_COMM_LEN = 16,
};

/*
 * Flags for bug emulation.
 *
 * These occupy the top three bytes.
 */
enum {
	READ_IMPLIES_EXEC = 0x0400000,
};

struct task_struct {
	char comm[TASK_COMM_LEN];
	pid_t pid;
	struct mm_struct *mm;

	/* Used for emulating ABI behavior of previous Linux versions: */
	unsigned int personality;
};

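/*
 * The test code is expected to provide get_current(), returning a dummy
 * task_struct, so code that uses current links and runs in userland.
 */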
struct task_struct *get_current(void);
#define current get_current()

struct anon_vma {
	struct anon_vma *root;
	struct rb_root_cached rb_root;

	/* Test fields. */
	bool was_cloned;
	bool was_unlinked;
};

struct anon_vma_chain {
	struct anon_vma *anon_vma;
	struct list_head same_vma;
};

struct anon_vma_name {
	struct kref kref;
	/* The name needs to be at the end because it is dynamically sized. */
	char name[];
};

struct vma_iterator {
	struct ma_state mas;
};

#define VMA_ITERATOR(name, __mm, __addr) \
	struct vma_iterator name = { \
		.mas = { \
			.tree = &(__mm)->mm_mt, \
			.index = __addr, \
			.node = NULL, \
			.status = ma_start, \
		}, \
	}
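
/*
 * Example (illustrative only): walking every VMA in an mm with the shimmed
 * iterator, exactly as kernel code would (mm is a struct mm_struct *,
 * inspect() is a hypothetical helper):
 *
 *	struct vm_area_struct *vma;
 *	VMA_ITERATOR(vmi, mm, 0);
 *
 *	for_each_vma(vmi, vma)
 *		inspect(vma);
 *
 * for_each_vma_range() iterates the same way but stops at an exclusive end
 * address.
 */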

struct address_space {
	struct rb_root_cached i_mmap;
	unsigned long flags;
	atomic_t i_mmap_writable;
};

struct vm_userfaultfd_ctx {};
struct mempolicy {};
struct mmu_gather {};
struct mutex {};
#define DEFINE_MUTEX(mutexname) \
	struct mutex mutexname = {}

struct mm_struct {
	struct maple_tree mm_mt;
	int map_count; /* number of VMAs */
	unsigned long total_vm; /* Total pages mapped */
	unsigned long locked_vm; /* Pages that have PG_mlocked set */
	unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
	unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
	unsigned long stack_vm; /* VM_STACK */

	unsigned long def_flags;

	unsigned long flags; /* Must use atomic bitops to access */
};

struct vm_area_struct;

/*
 * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
 * manipulate mutable fields which will cause those fields to be updated in the
 * resultant VMA.
 *
 * Helper functions are not required for manipulating any field.
 */
struct vm_area_desc {
	/* Immutable state. */
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;

	/* Mutable fields. Populated with initial state. */
	pgoff_t pgoff;
	struct file *file;
	vm_flags_t vm_flags;
	pgprot_t page_prot;

	/* Write-only fields. */
	const struct vm_operations_struct *vm_ops;
	void *private_data;
};

struct file_operations {
	int (*mmap)(struct file *, struct vm_area_struct *);
	int (*mmap_prepare)(struct vm_area_desc *);
};
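
/*
 * Illustrative sketch only (not part of the stubs): with .mmap_prepare(), a
 * driver adjusts the whitelisted mutable/write-only fields of the descriptor
 * up front instead of rewriting a partially constructed VMA as the legacy
 * .mmap() hook does; the two hooks are mutually exclusive. A hypothetical
 * callback might look like:
 *
 *	static int foo_mmap_prepare(struct vm_area_desc *desc)
 *	{
 *		if (desc->end - desc->start > FOO_MAX_SIZE)
 *			return -EINVAL;
 *
 *		desc->vm_flags |= VM_DONTEXPAND;
 *		desc->vm_ops = &foo_vm_ops;
 *		desc->private_data = foo_state;
 *		return 0;
 *	}
 *
 * where FOO_MAX_SIZE, foo_vm_ops and foo_state are driver-specific.
 */
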
struct file {
	struct address_space *f_mapping;
	const struct file_operations *f_op;
};
#define VMA_LOCK_OFFSET 0x40000000
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU is not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
typedef struct { unsigned long v; } freeptr_t;

struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	union {
		struct {
			/* VMA covers [vm_start; vm_end) addresses within mm */
			unsigned long vm_start;
			unsigned long vm_end;
		};
		freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
	};

	struct mm_struct *vm_mm;	/* The address space we belong to. */
	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */

	/*
	 * Flags, see mm.h.
	 * To modify use vm_flags_{init|reset|set|clear|mod} functions.
	 */
	union {
		const vm_flags_t vm_flags;
		vm_flags_t __private __vm_flags;
	};

#ifdef CONFIG_PER_VMA_LOCK
	/*
	 * Can only be written (using WRITE_ONCE()) while holding both:
	 *  - mmap_lock (in write mode)
	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
	 * Can be read reliably while holding one of:
	 *  - mmap_lock (in read or write mode)
	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
	 * while holding nothing (except RCU to keep the VMA struct allocated).
	 *
	 * This sequence counter is explicitly allowed to overflow; sequence
	 * counter reuse can only lead to occasional unnecessary use of the
	 * slowpath.
	 */
	unsigned int vm_lock_seq;
#endif

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.  A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_lock &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
	atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
#ifdef CONFIG_NUMA_BALANCING
	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
#endif
#ifdef CONFIG_PER_VMA_LOCK
	/* Unstable RCU readers are allowed to read this. */
	refcount_t vm_refcnt;
#endif
	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 */
	struct {
		struct rb_node rb;
		unsigned long rb_subtree_last;
	} shared;
#ifdef CONFIG_ANON_VMA_NAME
	/*
	 * For private and shared anonymous mappings, a pointer to a null
	 * terminated string containing the name given to the vma, or NULL if
	 * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
	 */
	struct anon_vma_name *anon_name;
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
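
To make the vm_refcnt encoding documented in the comments above concrete, here is a purely illustrative sketch. The helper names (vma_refcnt_writer_present, vma_refcnt_readers) are hypothetical and exist nowhere in the kernel or in this shim; they simply spell out the scheme: 0 means detached, 1 means attached with no readers, each reader adds 1, and the VMA_LOCK_OFFSET bit marks a waiting writer.

/*
 * Illustrative only (hypothetical helpers, not part of this header): decode a
 * raw vm_refcnt value according to the scheme documented above.
 */
static inline bool vma_refcnt_writer_present(unsigned int refcnt)
{
	return refcnt & VMA_LOCK_OFFSET;
}

static inline unsigned int vma_refcnt_readers(unsigned int refcnt)
{
	unsigned int attached_refs = refcnt & ~VMA_LOCK_OFFSET;

	/* 0 = detached, 1 = attached with no readers, each reader adds 1. */
	return attached_refs ? attached_refs - 1 : 0;
}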

struct vm_fault {};

struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	/**
	 * @close: Called when the VMA is being removed from the MM.
	 * Context: User context.  May sleep.  Caller holds mmap_lock.
	 */
	void (*close)(struct vm_area_struct * area);
	/* Called any time before splitting to check if it's allowed */
	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
	int (*mremap)(struct vm_area_struct *area);
	/*
	 * Called by mprotect() to make driver-specific permission
	 * checks before mprotect() is finalised.   The VMA must not
	 * be modified.  Returns 0 if mprotect() can proceed.
	 */
	int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, unsigned long newflags);
	vm_fault_t (*fault)(struct vm_fault *vmf);
	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
	vm_fault_t (*map_pages)(struct vm_fault *vmf,
			pgoff_t start_pgoff, pgoff_t end_pgoff);
	unsigned long (*pagesize)(struct vm_area_struct * area);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs. See also generic_access_phys() for a generic
	 * implementation useful for any iomem mapping.
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);

	/* Called by the /proc/PID/maps code to ask the vma whether it
	 * has a special name.  Returning non-NULL will also cause this
	 * vma to be dumped unconditionally. */
	const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr, pgoff_t *ilx);
#endif
	/*
	 * Called by vm_normal_page() for special PTEs to find the
	 * page for @addr.  This is useful if the default behavior
	 * (using pte_page()) would not find the correct page.
	 */
	struct page *(*find_special_page)(struct vm_area_struct *vma,
					  unsigned long addr);
};
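
As a quick illustration of how this operations table is used, a mapping only needs to fill in the callbacks it cares about and may leave the rest NULL. The names below (example_vma_name, example_vm_ops) are made up for this sketch and do not exist in the tree.

/* Hypothetical sketch: a special mapping that only provides a name. */
static const char *example_vma_name(struct vm_area_struct *vma)
{
	return "[example]";
}

static const struct vm_operations_struct example_vm_ops = {
	.name = example_vma_name,
};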

struct vm_unmapped_area_info {
#define VM_UNMAPPED_AREA_TOPDOWN 1
	unsigned long flags;
	unsigned long length;
	unsigned long low_limit;
	unsigned long high_limit;
	unsigned long align_mask;
	unsigned long align_offset;
	unsigned long start_gap;
};
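
For illustration, a top-down gap search is described entirely by this structure. A caller might fill it in roughly as sketched below; describe_topdown_search_example() is a hypothetical helper and the limits are passed in rather than assumed, since this shim may not define TASK_SIZE-style constants.

/* Hypothetical sketch: describe a top-down search for a gap of @len bytes. */
static inline void describe_topdown_search_example(struct vm_unmapped_area_info *info,
						   unsigned long len,
						   unsigned long low,
						   unsigned long high)
{
	info->flags = VM_UNMAPPED_AREA_TOPDOWN;
	info->length = len;
	info->low_limit = low;
	info->high_limit = high;
	info->align_mask = 0;		/* no additional alignment requirement */
	info->align_offset = 0;
	info->start_gap = 0;		/* no guard gap requested */
}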
mm: establish mm/vma_exec.c for shared exec/mm VMA functionality
Patch series "move all VMA allocation, freeing and duplication logic to
mm", v3.
Currently VMA allocation, freeing and duplication exist in kernel/fork.c,
which is a violation of separation of concerns, and leaves these functions
exposed to the rest of the kernel when they are in fact internal
implementation details.
Resolve this by moving this logic to mm, and making it internal to vma.c,
vma.h.
This also allows us, in future, to provide userland testing around this
functionality.
We additionally abstract dup_mmap() to mm, being careful to ensure
kernel/fork.c accesses this via the mm internal header so it is not exposed
elsewhere in the kernel.
As part of this change, also abstract initial stack allocation performed
in __bprm_mm_init() out of fs code into mm via the
create_init_stack_vma(), as this code uses vm_area_alloc() and
vm_area_free().
In order to do so sensibly, we introduce a new mm/vma_exec.c file, which
contains the code that is shared by mm and exec. This file is added to
both memory mapping and exec sections in MAINTAINERS so both sets of
maintainers can maintain oversight.
As part of this change, we also move relocate_vma_down() to mm/vma_exec.c
so all shared mm/exec functionality is kept in one place.
We add code shared between nommu and mmu-enabled configurations in order
to share VMA allocation, freeing and duplication code correctly while also
keeping these functions available in userland VMA testing.
This is achieved by adding a mm/vma_init.c file which is also compiled by
the userland tests.
This patch (of 4):
There is functionality that overlaps the exec and memory mapping
subsystems. While it properly belongs in mm, it is important that exec
maintainers maintain oversight of this functionality correctly.
We can establish both goals by adding a new mm/vma_exec.c file which
contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add
the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
scripts/get_maintainer.pl can correctly handle files in multiple entries
and this neatly handles the cross-over.
[akpm@linux-foundation.org: fix comment typo]
Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local
Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:14 +01:00

struct pagetable_move_control {
	struct vm_area_struct *old; /* Source VMA. */
	struct vm_area_struct *new; /* Destination VMA. */

	unsigned long old_addr; /* Address from which the move begins. */
	unsigned long old_end; /* Exclusive address at which old range ends. */
	unsigned long new_addr; /* Address to move page tables to. */
	unsigned long len_in; /* Bytes to remap specified by user. */

	bool need_rmap_locks; /* Do rmap locks need to be taken? */
	bool for_stack; /* Is this an early temp stack being moved? */
};

#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)	\
	struct pagetable_move_control name = {				\
		.old = old_,						\
		.new = new_,						\
		.old_addr = old_addr_,					\
		.old_end = (old_addr_) + (len_),			\
		.new_addr = new_addr_,					\
		.len_in = len_,						\
}
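
A usage sketch of the convenience macro above (the variable and function names here are arbitrary): it simply declares and initialises a pagetable_move_control on the stack, leaving need_rmap_locks and for_stack zero-initialised.

/* Hypothetical usage: set up a move of @len bytes worth of page tables. */
static inline void pagetable_move_example(struct vm_area_struct *old_vma,
					  struct vm_area_struct *new_vma,
					  unsigned long old_addr,
					  unsigned long new_addr,
					  unsigned long len)
{
	PAGETABLE_MOVE(pmc, old_vma, new_vma, old_addr, new_addr, len);

	(void)pmc;	/* a real caller would hand &pmc to the page-table moving code */
}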

struct kmem_cache_args {
	/**
	 * @align: The required alignment for the objects.
	 *
	 * %0 means no specific alignment is requested.
	 */
	unsigned int align;
	/**
	 * @useroffset: Usercopy region offset.
	 *
	 * %0 is a valid offset, when @usersize is non-%0
	 */
	unsigned int useroffset;
	/**
	 * @usersize: Usercopy region size.
	 *
	 * %0 means no usercopy region is specified.
	 */
	unsigned int usersize;
	/**
	 * @freeptr_offset: Custom offset for the free pointer
	 * in &SLAB_TYPESAFE_BY_RCU caches
	 *
	 * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
	 * outside of the object. This might cause the object to grow in size.
	 * Cache creators that have a reason to avoid this can specify a custom
	 * free pointer offset in their struct where the free pointer will be
	 * placed.
	 *
	 * Note that placing the free pointer inside the object requires the
	 * caller to ensure that no fields are invalidated that are required to
	 * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
	 * details).
	 *
	 * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
	 * is specified, %use_freeptr_offset must be set %true.
	 *
	 * Note that @ctor currently isn't supported with custom free pointers
	 * as a @ctor requires an external free pointer.
	 */
	unsigned int freeptr_offset;
	/**
	 * @use_freeptr_offset: Whether a @freeptr_offset is used.
	 */
	bool use_freeptr_offset;
	/**
	 * @ctor: A constructor for the objects.
	 *
	 * The constructor is invoked for each object in a newly allocated slab
	 * page. It is the cache user's responsibility to free object in the
	 * same state as after calling the constructor, or deal appropriately
	 * with any differences between a freshly constructed and a reallocated
	 * object.
	 *
	 * %NULL means no constructor.
	 */
	void (*ctor)(void *);
};
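
To tie @freeptr_offset and @use_freeptr_offset back to the vm_freeptr member seen earlier, a typesafe-by-RCU cache might be set up roughly as follows. This is only a sketch: it assumes a kmem_cache_create() variant taking a struct kmem_cache_args (as the kernel's slab API provides), plus offsetof() and the SLAB_* flags being available in this environment; create_vma_cache_example is a made-up name.

/*
 * Sketch only (hypothetical): create a SLAB_TYPESAFE_BY_RCU cache whose free
 * pointer overlays vm_freeptr instead of growing the object.
 */
static inline struct kmem_cache *create_vma_cache_example(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
	};

	return kmem_cache_create("vm_area_struct_example",
				 sizeof(struct vm_area_struct), &args,
				 SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU);
}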

static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
	mas_pause(&vmi->mas);
}

static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
	return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot));
}

mm: change vm_get_page_prot() to accept vm_flags_t argument
Patch series "use vm_flags_t consistently".
The VMA flags field vma->vm_flags is of type vm_flags_t. Right now this
is exactly equivalent to unsigned long, but it should not be assumed to
be.
Much code that references vma->vm_flags already correctly uses vm_flags_t,
but a fairly large chunk of code simply uses unsigned long and assumes
that the two are equivalent.
This series corrects that and has us use vm_flags_t consistently.
This series is motivated by the desire to, in a future series, adjust
vm_flags_t to be a u64 regardless of whether the kernel is 32-bit or
64-bit in order to deal with the VMA flag exhaustion issue and avoid all
the various problems that arise from it (being unable to use certain
features in 32-bit, being unable to add new flags except for 64-bit, etc.)
This is therefore a critical first step towards that goal. At any rate,
using the correct type is of value regardless.
We additionally take the opportunity to refer to VMA flags as vm_flags
where possible to make clear what we're referring to.
Overall, this series does not introduce any functional change.
This patch (of 3):
We abstract the type of the VMA flags to vm_flags_t, however in many places
it is simply assumed this is unsigned long, which is simply incorrect.
At the moment this is simply an incongruity, however in future we plan to
change this type and therefore this change is a critical requirement for
doing so.
Overall, this patch does not introduce any functional change.
[lorenzo.stoakes@oracle.com: add missing vm_get_page_prot() instance, remove include]
Link: https://lkml.kernel.org/r/552f88e1-2df8-4e95-92b8-812f7c8db829@lucifer.local
Link: https://lkml.kernel.org/r/cover.1750274467.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/a12769720a2743f235643b158c4f4f0a9911daf0.1750274467.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com> [arm64]
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-06-18 20:42:52 +01:00

static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
	return __pgprot(vm_flags);
}

static inline bool is_shared_maywrite(vm_flags_t vm_flags)
{
	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
		(VM_SHARED | VM_MAYWRITE);
}
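
A worked example of the predicate above, purely illustrative: a writable MAP_SHARED mapping carries both VM_SHARED and VM_MAYWRITE and so qualifies, while either flag on its own (for instance a shared mapping of a file opened read-only) does not. is_shared_maywrite_example() is a hypothetical helper, not part of this header.

/* Illustrative only: exercise the predicate with literal flag combinations. */
static inline void is_shared_maywrite_example(void)
{
	WARN_ON_ONCE(!is_shared_maywrite(VM_SHARED | VM_MAYWRITE));
	WARN_ON_ONCE(is_shared_maywrite(VM_SHARED));
	WARN_ON_ONCE(is_shared_maywrite(VM_MAYWRITE));
}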

static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
{
	return is_shared_maywrite(vma->vm_flags);
}

static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
	/*
	 * Uses mas_find() to get the first VMA when the iterator starts.
	 * Calling mas_next() could skip the first entry.
	 */
	return mas_find(&vmi->mas, ULONG_MAX);
}
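
For context, vma_next() is typically called in a loop to walk every VMA in ascending address order once a vma_iterator has been set up elsewhere (this sketch assumes the iterator is already initialised; count_vmas_example is a hypothetical helper).

/* Sketch: count VMAs by walking the tree with the iterator helper above. */
static inline unsigned int count_vmas_example(struct vma_iterator *vmi)
{
	struct vm_area_struct *vma;
	unsigned int count = 0;

	while ((vma = vma_next(vmi)) != NULL)
		count++;

	return count;
}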
/*
 * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
 * assertions should be made either under mmap_write_lock or when the object
 * has been isolated under mmap_write_lock, ensuring no competing writers.
 */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}

static inline void vma_assert_detached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}

static inline void vma_assert_write_locked(struct vm_area_struct *);

static inline void vma_mark_attached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_detached(vma);
	refcount_set_release(&vma->vm_refcnt, 1);
}

static inline void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);
	/* We are the only writer, so no need to use vma_refcount_put(). */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/*
		 * Reader must have temporarily raised vm_refcnt but it will
		 * drop it without using the vma since vma is write-locked.
		 */
	}
}
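
Putting the helpers above together, the expected lifecycle is: write-lock the VMA, insert it into the VMA tree and vma_mark_attached() it; later, write-lock it again, remove it from the tree and vma_mark_detached() it before freeing. The sketch below uses hypothetical names (attach_vma_example, detach_vma_example) and elides the tree insertion/removal.

/* Sketch of the attach/detach lifecycle enforced by the assertions above. */
static inline void attach_vma_example(struct vm_area_struct *vma)
{
	/* Caller must already hold the VMA write lock (mmap_lock in write mode). */
	vma_assert_write_locked(vma);
	/* ... insert vma into the VMA tree here ... */
	vma_mark_attached(vma);
}

static inline void detach_vma_example(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	/* ... remove vma from the VMA tree here ... */
	vma_mark_detached(vma);
	/* The VMA may be freed once no RCU readers can still observe it. */
}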

extern const struct vm_operations_struct vma_dummy_vm_ops;

extern unsigned long rlimit(unsigned int limit);

static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
	memset(vma, 0, sizeof(*vma));
	vma->vm_mm = mm;
	vma->vm_ops = &vma_dummy_vm_ops;
	INIT_LIST_HEAD(&vma->anon_vma_chain);
|
2025-02-13 14:46:49 -08:00
|
|
|
vma->vm_lock_seq = UINT_MAX;
|
2024-07-29 12:50:41 +01:00
|
|
|
}
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU is not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
struct kmem_cache {
|
|
|
|
const char *name;
|
|
|
|
size_t object_size;
|
|
|
|
struct kmem_cache_args *args;
|
|
|
|
};
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
static inline struct kmem_cache *__kmem_cache_create(const char *name,
|
|
|
|
size_t object_size,
|
|
|
|
struct kmem_cache_args *args)
|
|
|
|
{
|
|
|
|
struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
ret->name = name;
|
|
|
|
ret->object_size = object_size;
|
|
|
|
ret->args = args;
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
return ret;
|
2024-07-29 12:50:41 +01:00
|
|
|
}
|
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
#define kmem_cache_create(__name, __object_size, __args, ...) \
|
|
|
|
__kmem_cache_create((__name), (__object_size), (__args))
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
|
|
|
|
{
|
|
|
|
(void)gfpflags;
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
return calloc(s->object_size, 1);
|
|
|
|
}
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-04-28 16:28:17 +01:00
|
|
|
static inline void kmem_cache_free(struct kmem_cache *s, void *x)
|
|
|
|
{
|
|
|
|
free(x);
|
2024-07-29 12:50:41 +01:00
|
|
|
}
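As a brief, hypothetical illustration of how the kmem_cache shims above fit together (the demo_* helper below is invented for this sketch and is not part of the harness), a test could create a cache for struct vm_area_struct and allocate a zeroed object from it; gfp flags are passed as 0 because the stub ignores them anyway.

static inline struct vm_area_struct *demo_alloc_test_vma(void)
{
	static struct kmem_cache *demo_vma_cachep;

	if (!demo_vma_cachep)
		demo_vma_cachep = kmem_cache_create("vm_area_struct",
						    sizeof(struct vm_area_struct),
						    /* args */ NULL);

	/* The stub zero-allocates via calloc() and ignores the gfp flags. */
	return kmem_cache_alloc(demo_vma_cachep, 0);
}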
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These are defined in vma.h, but sadly vm_stat_account() is referenced by
|
|
|
|
* kernel/fork.c, so we have to make these broadly available there, and temporarily
|
|
|
|
* define them here to resolve the dependency cycle.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define is_exec_mapping(flags) \
|
|
|
|
((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC)
|
|
|
|
|
|
|
|
#define is_stack_mapping(flags) \
|
|
|
|
(((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK))
|
|
|
|
|
|
|
|
#define is_data_mapping(flags) \
|
|
|
|
((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE)
|
|
|
|
|
|
|
|
static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags,
|
|
|
|
long npages)
|
|
|
|
{
|
|
|
|
WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
|
|
|
|
|
|
|
|
if (is_exec_mapping(flags))
|
|
|
|
mm->exec_vm += npages;
|
|
|
|
else if (is_stack_mapping(flags))
|
|
|
|
mm->stack_vm += npages;
|
|
|
|
else if (is_data_mapping(flags))
|
|
|
|
mm->data_vm += npages;
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef is_exec_mapping
|
|
|
|
#undef is_stack_mapping
|
|
|
|
#undef is_data_mapping
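For illustration only (hypothetical helper, assuming the usual VM_READ/VM_WRITE semantics): a private, writable, non-stack mapping is classified as a data mapping by the logic above, so both total_vm and data_vm grow.

static inline void demo_account_anon_data(struct mm_struct *mm)
{
	/* VM_READ | VM_WRITE, neither shared nor stack: counted as data. */
	vm_stat_account(mm, VM_READ | VM_WRITE, 4);	/* 4 pages */
}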
|
|
|
|
|
|
|
|
/* Currently stubbed but we may later wish to un-stub. */
|
|
|
|
static inline void vm_acct_memory(long pages);
|
|
|
|
static inline void vm_unacct_memory(long pages)
|
|
|
|
{
|
|
|
|
vm_acct_memory(-pages);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mapping_allow_writable(struct address_space *mapping)
|
|
|
|
{
|
|
|
|
atomic_inc(&mapping->i_mmap_writable);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_set_range(struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end,
|
|
|
|
pgoff_t pgoff)
|
|
|
|
{
|
|
|
|
vma->vm_start = start;
|
|
|
|
vma->vm_end = end;
|
|
|
|
vma->vm_pgoff = pgoff;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
|
|
|
|
{
|
|
|
|
return mas_find(&vmi->mas, max - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
|
|
|
|
unsigned long start, unsigned long end, gfp_t gfp)
|
|
|
|
{
|
|
|
|
__mas_set_range(&vmi->mas, start, end - 1);
|
|
|
|
mas_store_gfp(&vmi->mas, NULL, gfp);
|
|
|
|
if (unlikely(mas_is_err(&vmi->mas)))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mmap_assert_locked(struct mm_struct *);
|
|
|
|
static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
|
|
|
|
unsigned long start_addr,
|
|
|
|
unsigned long end_addr)
|
|
|
|
{
|
|
|
|
unsigned long index = start_addr;
|
|
|
|
|
|
|
|
mmap_assert_locked(mm);
|
|
|
|
return mt_find(&mm->mm_mt, &index, end_addr - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
|
|
|
|
{
|
|
|
|
return mtree_load(&mm->mm_mt, addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
return mas_prev(&vmi->mas, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
|
|
|
|
{
|
|
|
|
mas_set(&vmi->mas, addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return !vma->vm_ops;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Defined in vma.h, so temporarily define here to avoid circular dependency. */
|
|
|
|
#define vma_iter_load(vmi) \
|
|
|
|
mas_walk(&(vmi)->mas)
|
|
|
|
|
|
|
|
static inline struct vm_area_struct *
|
|
|
|
find_vma_prev(struct mm_struct *mm, unsigned long addr,
|
|
|
|
struct vm_area_struct **pprev)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
VMA_ITERATOR(vmi, mm, addr);
|
|
|
|
|
|
|
|
vma = vma_iter_load(&vmi);
|
|
|
|
*pprev = vma_prev(&vmi);
|
|
|
|
if (!vma)
|
|
|
|
vma = vma_next(&vmi);
|
|
|
|
return vma;
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef vma_iter_load
|
|
|
|
|
|
|
|
static inline void vma_iter_init(struct vma_iterator *vmi,
|
|
|
|
struct mm_struct *mm, unsigned long addr)
|
|
|
|
{
|
|
|
|
mas_init(&vmi->mas, &mm->mm_mt, addr);
|
|
|
|
}
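A hedged sketch of how these iterator wrappers might be driven in a test (the helper name is invented): walk every VMA starting below max and count them.

static inline unsigned long demo_count_vmas_below(struct mm_struct *mm,
						  unsigned long max)
{
	struct vm_area_struct *vma;
	unsigned long nr = 0;
	VMA_ITERATOR(vmi, mm, 0);

	while ((vma = vma_find(&vmi, max)) != NULL)
		nr++;

	return nr;
}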
|
|
|
|
|
|
|
|
/* Stubbed functions. */
|
|
|
|
|
|
|
|
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
|
|
|
|
struct vm_userfaultfd_ctx vm_ctx)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
|
|
|
|
struct anon_vma_name *anon_name2)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void might_sleep(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long vma_pages(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void fput(struct file *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mpol_put(struct mempolicy *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void lru_add_drain(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void tlb_gather_mmu(struct mmu_gather *, struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void update_hiwater_rss(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void update_hiwater_vm(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
|
|
|
|
struct vm_area_struct *vma, unsigned long start_addr,
|
|
|
|
unsigned long end_addr, unsigned long tree_end,
|
|
|
|
bool mm_wr_locked)
|
|
|
|
{
|
|
|
|
(void)tlb;
|
|
|
|
(void)mas;
|
|
|
|
(void)vma;
|
|
|
|
(void)start_addr;
|
|
|
|
(void)end_addr;
|
|
|
|
(void)tree_end;
|
|
|
|
(void)mm_wr_locked;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
|
|
|
|
struct vm_area_struct *vma, unsigned long floor,
|
|
|
|
unsigned long ceiling, bool mm_wr_locked)
|
|
|
|
{
|
|
|
|
(void)tlb;
|
|
|
|
(void)mas;
|
|
|
|
(void)vma;
|
|
|
|
(void)floor;
|
|
|
|
(void)ceiling;
|
|
|
|
(void)mm_wr_locked;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mapping_unmap_writable(struct address_space *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void flush_dcache_mmap_lock(struct address_space *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void tlb_finish_mmu(struct mmu_gather *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
tools: testing: add additional vma_internal.h stubs
Patch series "fix error handling in mmap_region() and refactor", v3.
The mmap_region() function is somewhat terrifying, with spaghetti-like
control flow and numerous means by which issues can arise and incomplete
state, memory leaks and other unpleasantness can occur.
This series goes to great lengths to simplify how mmap_region() works and
to avoid unwinding errors late on in the process of setting up the VMA for
the new mapping, and equally avoids such operations occurring while the
VMA is in an inconsistent state.
This series builds on the previously submitted hotfix patches (see link to
v2 below) which address the most critical issues around mmap_region(),
and further works to improve mmap_region() complexity, stability, and
testability.
This series moves the code to mm/vma.c to render it userland testable,
refactors and simplifies it into smaller functions that are significantly
more readable.
It additionally avoids performing an attempt at a second merge mid-way
through allocating a new VMA, a dubious proposition at best and one that
is highly subject to subtle bugs.
Rather than do this, we simply note that we ought to retry the merge and
do this as a final step.
This patch (of 3):
Add some additional vma_internal.h stubs in preparation for
__mmap_region() being moved to mm/vma.c. Without these the move would
result in the tests no longer compiling.
Link: https://lkml.kernel.org/r/cover.1729858176.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/74b27e159e261d2ac1fe66a130edad1d61fdc176.1729858176.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-10-25 13:26:23 +01:00
|
|
|
static inline struct file *get_file(struct file *f)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
2024-10-25 13:26:23 +01:00
|
|
|
return f;
|
2024-07-29 12:50:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
2024-08-30 19:10:14 +01:00
|
|
|
/* For testing purposes, we indicate that an anon_vma has been cloned. */
|
|
|
|
if (src->anon_vma != NULL) {
|
|
|
|
dst->anon_vma = src->anon_vma;
|
|
|
|
dst->anon_vma->was_cloned = true;
|
|
|
|
}
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
static inline void vma_start_write(struct vm_area_struct *vma)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
2024-08-30 19:10:14 +01:00
|
|
|
/* Used to indicate to tests that a write operation has begun. */
|
|
|
|
vma->vm_lock_seq++;
|
2024-07-29 12:50:41 +01:00
|
|
|
}
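As a hedged illustration of how a test might consume the instrumentation above (the helper name is invented; was_cloned is the test-only marker set by anon_vma_clone() and vm_lock_seq is bumped by vma_start_write()):

static inline bool demo_clone_and_write_lock_observed(struct vm_area_struct *vma,
						      unsigned int old_seq)
{
	/* True once anon_vma_clone() and vma_start_write() have both run. */
	return vma->anon_vma && vma->anon_vma->was_cloned &&
	       vma->vm_lock_seq != old_seq;
}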
|
|
|
|
|
|
|
|
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end,
|
mm: completely abstract unnecessary adj_start calculation
The adj_start calculation has been a constant source of confusion in the
VMA merge code.
There are two cases to consider, one where we adjust the start of the
vmg->middle VMA (i.e. the vmg->__adjust_middle_start merge flag is set),
in which case adj_start is calculated as:
(1) adj_start = vmg->end - vmg->middle->vm_start
And the case where we adjust the start of the vmg->next VMA (i.e. the
vmg->__adjust_next_start merge flag is set), in which case adj_start is
calculated as:
(2) adj_start = -(vmg->middle->vm_end - vmg->end)
We apply (1) thusly:
vmg->middle->vm_start =
vmg->middle->vm_start + vmg->end - vmg->middle->vm_start
Which simplifies to:
vmg->middle->vm_start = vmg->end
Similarly, we apply (2) as:
vmg->next->vm_start =
vmg->next->vm_start + -(vmg->middle->vm_end - vmg->end)
Noting that for these VMAs to be mergeable vmg->middle->vm_end ==
vmg->next->vm_start and so this simplifies to:
vmg->next->vm_start =
vmg->next->vm_start + -(vmg->next->vm_start - vmg->end)
Which simplifies to:
vmg->next->vm_start = vmg->end
Therefore in each case, we simply need to adjust the start of the VMA to
vmg->end (!) and can do away with this adj_start calculation. The only
caveat is that we must ensure we update the vm_pgoff field correctly.
We therefore abstract this entire calculation to a new function
vmg_adjust_set_range() which performs this calculation and sets the
adjusted VMA's new range using the general vma_set_range() function.
We also must update vma_adjust_trans_huge() which expects the
now-abstracted adj_start parameter. It turns out this is wholly
unnecessary.
In vma_adjust_trans_huge() the relevant code is:
if (adjust_next > 0) {
struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
split_huge_pmd_if_needed(next, nstart);
}
The only case where this is relevant is when vmg->__adjust_middle_start is
specified (in which case adj_next would have been positive), i.e. the one
in which the vma specified is vmg->prev and thus the sought 'next' VMA
would be vmg->middle.
We can therefore eliminate the find_vma() invocation altogether and simply
provide the vmg->middle VMA in this instance, or NULL otherwise.
Again we have an adj_next offset calculation:
next->vm_start + vmg->end - vmg->middle->vm_start
Where next == vmg->middle this simplifies to vmg->end as previously
demonstrated.
Therefore nstart is equal to vmg->end, which is already passed to
vma_adjust_trans_huge() via the 'end' parameter and so this code (rather
delightfully) simplifies to:
if (next)
split_huge_pmd_if_needed(next, end);
With these changes in place, it becomes silly for commit_merge() to return
vmg->target, as it is always the same and threaded through vmg, so we
finally change commit_merge() to return an error value once again.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/7bce2cd4b5afb56211822835d145471280c3dccc.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:53 +00:00
|
|
|
struct vm_area_struct *next)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
|
|
|
(void)vma;
|
|
|
|
(void)start;
|
|
|
|
(void)end;
|
2025-01-31 12:31:53 +00:00
|
|
|
(void)next;
|
2024-07-29 12:50:41 +01:00
|
|
|
}
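To make the simplification described in the adj_start annotation above concrete, here is a hypothetical sketch of the adjustment it arrives at (this is not the mm/vma.c vmg_adjust_set_range() implementation; the helper name and parameters are assumptions): the adjusted VMA simply gets the new start, with vm_pgoff shifted by the same delta in pages.

static inline void demo_adjust_vma_start(struct vm_area_struct *adjust,
					 unsigned long new_start)
{
	pgoff_t pgoff = adjust->vm_pgoff;

	/* Moving the start forward grows vm_pgoff; moving it back shrinks it. */
	if (new_start >= adjust->vm_start)
		pgoff += (new_start - adjust->vm_start) >> PAGE_SHIFT;
	else
		pgoff -= (adjust->vm_start - new_start) >> PAGE_SHIFT;

	vma_set_range(adjust, new_start, adjust->vm_end, pgoff);
}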
|
|
|
|
|
2025-05-27 23:23:53 +02:00
|
|
|
static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
static inline void vma_iter_free(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
mas_destroy(&vmi->mas);
|
|
|
|
}
|
|
|
|
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report
errors, allowing us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
return mas_next_range(&vmi->mas, ULONG_MAX);
|
|
|
|
}
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
static inline void vm_acct_memory(long pages)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_interval_tree_insert(struct vm_area_struct *,
|
|
|
|
struct rb_root_cached *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_interval_tree_remove(struct vm_area_struct *,
|
|
|
|
struct rb_root_cached *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void flush_dcache_mmap_unlock(struct address_space *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void anon_vma_interval_tree_insert(struct anon_vma_chain*,
|
|
|
|
struct rb_root_cached *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void anon_vma_interval_tree_remove(struct anon_vma_chain*,
|
|
|
|
struct rb_root_cached *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void uprobe_mmap(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void uprobe_munmap(struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end)
|
|
|
|
{
|
|
|
|
(void)vma;
|
|
|
|
(void)start;
|
|
|
|
(void)end;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void i_mmap_lock_write(struct address_space *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void anon_vma_lock_write(struct anon_vma *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_assert_write_locked(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
static inline void unlink_anon_vmas(struct vm_area_struct *vma)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
2024-08-30 19:10:14 +01:00
|
|
|
/* For testing purposes, indicate that the anon_vma was unlinked. */
|
|
|
|
vma->anon_vma->was_unlinked = true;
|
2024-07-29 12:50:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void anon_vma_unlock_write(struct anon_vma *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void i_mmap_unlock_write(struct address_space *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void anon_vma_merge(struct vm_area_struct *,
|
|
|
|
struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end,
|
|
|
|
struct list_head *unmaps)
|
|
|
|
{
|
|
|
|
(void)vma;
|
|
|
|
(void)start;
|
|
|
|
(void)end;
|
|
|
|
(void)unmaps;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mmap_write_downgrade(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mmap_read_unlock(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mmap_write_unlock(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2024-12-03 18:05:12 +00:00
|
|
|
static inline int mmap_write_lock_killable(struct mm_struct *)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
static inline bool can_modify_mm(struct mm_struct *mm,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end)
|
|
|
|
{
|
|
|
|
(void)mm;
|
|
|
|
(void)start;
|
|
|
|
(void)end;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void arch_unmap(struct mm_struct *mm,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end)
|
|
|
|
{
|
|
|
|
(void)mm;
|
|
|
|
(void)start;
|
|
|
|
(void)end;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mmap_assert_locked(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool mpol_equal(struct mempolicy *, struct mempolicy *)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
|
|
|
(void)vma;
|
|
|
|
(void)vm_flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool mapping_can_writeback(struct address_space *)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool is_vm_hugetlb_page(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool userfaultfd_wp(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mmap_assert_write_locked(struct mm_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mutex_lock(struct mutex *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mutex_unlock(struct mutex *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool mutex_is_locked(struct mutex *)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool signal_pending(void *)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2024-10-25 13:26:23 +01:00
|
|
|
static inline bool is_file_hugepages(struct file *)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int security_vm_enough_memory_mm(struct mm_struct *, long)
|
|
|
|
{
|
2024-12-13 16:24:09 +00:00
|
|
|
return 0;
|
2024-10-25 13:26:23 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vm_flags_init(struct vm_area_struct *vma,
|
|
|
|
vm_flags_t flags)
|
|
|
|
{
|
|
|
|
vma->__vm_flags = flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vm_flags_set(struct vm_area_struct *vma,
|
|
|
|
vm_flags_t flags)
|
|
|
|
{
|
|
|
|
vma_start_write(vma);
|
|
|
|
vma->__vm_flags |= flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vm_flags_clear(struct vm_area_struct *vma,
|
|
|
|
vm_flags_t flags)
|
|
|
|
{
|
|
|
|
vma_start_write(vma);
|
|
|
|
vma->__vm_flags &= ~flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int shmem_zero_setup(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_set_anonymous(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
vma->vm_ops = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void ksm_add_vma(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void perf_event_mmap(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool vma_is_dax(struct vm_area_struct *)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
|
|
|
|
|
|
|
|
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
|
|
|
|
static inline void vma_set_page_prot(struct vm_area_struct *vma)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = vma->vm_flags;
|
2024-10-25 13:26:23 +01:00
|
|
|
pgprot_t vm_page_prot;
|
|
|
|
|
|
|
|
/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
|
|
|
|
vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
|
|
|
|
|
|
|
|
if (vma_wants_writenotify(vma, vm_page_prot)) {
|
|
|
|
vm_flags &= ~VM_SHARED;
|
|
|
|
/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
|
|
|
|
vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
|
|
|
|
}
|
|
|
|
/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
|
|
|
|
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
|
|
|
|
}
|
|
|
|
|
2025-06-18 20:42:54 +01:00
|
|
|
static inline bool arch_validate_flags(vm_flags_t)
|
2024-10-25 13:26:23 +01:00
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
static inline void vma_close(struct vm_area_struct *)
{
}

static inline int mmap_file(struct file *, struct vm_area_struct *)
{
	return 0;
}

static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_GROWSDOWN)
		return stack_guard_gap;

	/* See reasoning around the VM_SHADOW_STACK definition */
	if (vma->vm_flags & VM_SHADOW_STACK)
		return PAGE_SIZE;

	return 0;
}

static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
	unsigned long gap = stack_guard_start_gap(vma);
	unsigned long vm_start = vma->vm_start;

	vm_start -= gap;
	if (vm_start > vma->vm_start)
		vm_start = 0;

	return vm_start;
}

static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
	unsigned long vm_end = vma->vm_end;

	if (vma->vm_flags & VM_GROWSUP) {
		vm_end += stack_guard_gap;
		if (vm_end < vma->vm_end)
			vm_end = -PAGE_SIZE;
	}

	return vm_end;
}

static inline int is_hugepage_only_range(struct mm_struct *mm,
					 unsigned long addr, unsigned long len)
{
	return 0;
}

static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_ACCESS_FLAGS;
}

static inline bool capable(int cap)
{
	return true;
}

static inline bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
				   unsigned long bytes)
{
	unsigned long locked_pages, limit_pages;

	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
		return true;

	locked_pages = bytes >> PAGE_SHIFT;
	locked_pages += mm->locked_vm;

	limit_pages = rlimit(RLIMIT_MEMLOCK);
	limit_pages >>= PAGE_SHIFT;

	return locked_pages <= limit_pages;
}
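
/*
 * Usage sketch (illustrative only, not part of the upstream header): a
 * hypothetical caller checks a VM_LOCKED request against RLIMIT_MEMLOCK
 * before committing to the mapping. The helper name and the 1 MiB size are
 * made up for the example.
 */
static inline bool example_can_mlock_one_mib(struct mm_struct *mm)
{
	/* True if mm->locked_vm plus 1 MiB worth of pages stays within the limit. */
	return mlock_future_ok(mm, VM_LOCKED, 1UL << 20);
}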

static inline int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));

	if (!anon_vma)
		return -ENOMEM;

	anon_vma->root = anon_vma;
	vma->anon_vma = anon_vma;

	return 0;
}

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}

static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
					      struct list_head *uf)
{
}

/*
 * Denies creating a writable executable mapping or gaining executable permissions.
 *
 * This denies the following:
 *
 *	a)	mmap(PROT_WRITE | PROT_EXEC)
 *
 *	b)	mmap(PROT_WRITE)
 *		mprotect(PROT_EXEC)
 *
 *	c)	mmap(PROT_WRITE)
 *		mprotect(PROT_READ)
 *		mprotect(PROT_EXEC)
 *
 * But allows the following:
 *
 *	d)	mmap(PROT_READ | PROT_EXEC)
 *		mmap(PROT_READ | PROT_EXEC | PROT_BTI)
 *
 * This is only applicable if the user has set the Memory-Deny-Write-Execute
 * (MDWE) protection mask for the current process.
 *
 * @old specifies the VMA flags the VMA originally possessed, and @new the ones
 * we propose to set.
 *
 * Return: false if proposed change is OK, true if not ok and should be denied.
 */
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
	/* If MDWE is disabled, we have nothing to deny. */
	if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
		return false;

	/* If the new VMA is not executable, we have nothing to deny. */
	if (!(new & VM_EXEC))
		return false;

	/* Under MDWE we do not accept newly writably executable VMAs... */
	if (new & VM_WRITE)
		return true;

	/* ...nor previously non-executable VMAs becoming executable. */
	if (!(old & VM_EXEC))
		return true;

	return false;
}
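
/*
 * Illustrative sketch (not part of the upstream header): assuming the current
 * process has MDWE enabled (MMF_HAS_MDWE set), the cases documented above map
 * to map_deny_write_exec() results as follows. The helper name is made up.
 */
static inline void example_mdwe_cases(void)
{
	bool denied;

	/* (a) mmap(PROT_WRITE | PROT_EXEC): denied, new mapping is both W and X. */
	denied = map_deny_write_exec(VM_WRITE | VM_EXEC, VM_WRITE | VM_EXEC);

	/* (b) mmap(PROT_WRITE) then mprotect(PROT_EXEC): denied, gains exec it never had. */
	denied = map_deny_write_exec(VM_WRITE, VM_EXEC);

	/* (d) mmap(PROT_READ | PROT_EXEC): allowed, never writable. */
	denied = map_deny_write_exec(VM_READ | VM_EXEC, VM_READ | VM_EXEC);

	(void)denied;
}
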
static inline int mapping_map_writable(struct address_space *mapping)
{
	int c = atomic_read(&mapping->i_mmap_writable);

	/* Derived from the raw_atomic_inc_unless_negative() implementation. */
	do {
		if (c < 0)
			return -EPERM;
	} while (!__sync_bool_compare_and_swap(&mapping->i_mmap_writable, c, c+1));

	return 0;
}

mm: establish mm/vma_exec.c for shared exec/mm VMA functionality
Patch series "move all VMA allocation, freeing and duplication logic to
mm", v3.
Currently VMA allocation, freeing and duplication exist in kernel/fork.c,
which is a violation of separation of concerns, and leaves these functions
exposed to the rest of the kernel when they are in fact internal
implementation details.
Resolve this by moving this logic to mm, and making it internal to vma.c,
vma.h.
This also allows us, in future, to provide userland testing around this
functionality.
We additionally abstract dup_mmap() to mm, being careful to ensure
kernel/fork.c accesses this via the mm internal header so it is not exposed
elsewhere in the kernel.
As part of this change, also abstract initial stack allocation performed
in __bprm_mm_init() out of fs code into mm via the
create_init_stack_vma(), as this code uses vm_area_alloc() and
vm_area_free().
In order to do so sensibly, we introduce a new mm/vma_exec.c file, which
contains the code that is shared by mm and exec. This file is added to
both memory mapping and exec sections in MAINTAINERS so both sets of
maintainers can maintain oversight.
As part of this change, we also move relocate_vma_down() to mm/vma_exec.c
so all shared mm/exec functionality is kept in one place.
We add code shared between nommu and mmu-enabled configurations in order
to share VMA allocation, freeing and duplication code correctly while also
keeping these functions available in userland VMA testing.
This is achieved by adding a mm/vma_init.c file which is also compiled by
the userland tests.
This patch (of 4):
There is functionality that overlaps the exec and memory mapping
subsystems. While it properly belongs in mm, it is important that exec
maintainers maintain oversight of this functionality correctly.
We can establish both goals by adding a new mm/vma_exec.c file which
contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add
the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
scripts/get_maintainer.pl can correctly handle files in multiple entries
and this neatly handles the cross-over.
[akpm@linux-foundation.org: fix comment typo]
Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local
Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
{
	(void)pmc;

	return 0;
}

static inline void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	(void)tlb;
	(void)addr;
	(void)end;
	(void)floor;
	(void)ceiling;
}

static inline int ksm_execve(struct mm_struct *mm)
{
	(void)mm;

	return 0;
}

static inline void ksm_exit(struct mm_struct *mm)
{
	(void)mm;
}

mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU is not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
	(void)vma;
	(void)reset_refcnt;
}

static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
	(void)vma;
}

static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
	(void)vma;
}

static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma)
{
	(void)orig_vma;
	(void)new_vma;
}

static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
	(void)vma;
}

/* Declared in vma.h. */
static inline void set_vma_from_desc(struct vm_area_struct *vma,
		struct vm_area_desc *desc);

static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
		struct vm_area_desc *desc);

static int compat_vma_mmap_prepare(struct file *file,
		struct vm_area_struct *vma)
{
	struct vm_area_desc desc;
	int err;

	err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
	if (err)
		return err;
	set_vma_from_desc(vma, &desc);

	return 0;
}

mm: introduce new .mmap_prepare() file callback
Patch series "eliminate mmap() retry merge, add .mmap_prepare hook", v2.
During the mmap() of a file-backed mapping, we invoke the underlying
driver file's mmap() callback in order to perform driver/file system
initialisation of the underlying VMA.
This has been a source of issues in the past, including a significant
security concern relating to unwinding of error state discovered by Jann
Horn, as fixed in commit 5de195060b2e ("mm: resolve faulty mmap_region()
error path behaviour") which performed the recent, significant, rework of
mmap() as a whole.
However, we have had a fly in the ointment remain - drivers have a great
deal of freedom in the .mmap() hook to manipulate VMA state (as well as
page table state).
This can be problematic, as we can no longer reason sensibly about VMA
state once the call is complete (the ability to do - anything - here does
rather interfere with that).
In addition, callers may choose to do odd or unusual things which might
interfere with subsequent steps in the mmap() process, and it may do so
and then raise an error, requiring very careful unwinding of state about
which we can make no assumptions.
Rather than providing such an open-ended interface, this series provides
an alternative, far more restrictive one - we expose a whitelist of fields
which can be adjusted by the driver, along with immutable state upon which
the driver can make such decisions:
struct vm_area_desc {
/* Immutable state. */
struct mm_struct *mm;
unsigned long start;
unsigned long end;
/* Mutable fields. Populated with initial state. */
pgoff_t pgoff;
struct file *file;
vm_flags_t vm_flags;
pgprot_t page_prot;
/* Write-only fields. */
const struct vm_operations_struct *vm_ops;
void *private_data;
};
The mmap logic then updates the state used to either merge with a VMA or
establish a new VMA based upon this logic.
This is achieved via new file hook .mmap_prepare(), which is, importantly,
invoked very early on in the mmap() process.
If an error arises, we can very simply abort the operation with very
little unwinding of state required.
The existing logic contains another, related, peccadillo - since the
.mmap() callback might do anything, it may also cause a previously
unmergeable VMA to become mergeable with adjacent VMAs.
Right now the logic will retry a merge like this only if the driver
changes VMA flags, and changes them in such a way that a merge might
succeed (that is, the flags are not 'special', that is do not contain any
of the flags specified in VM_SPECIAL).
This has also been the source of a great deal of pain - it's hard to
reason about an .mmap() callback that might do - anything - but it's also
hard to reason about setting up a VMA and writing to the maple tree, only
to do it again utilising a great deal of shared state.
Since .mmap_prepare() sets fields before the first merge is even
attempted, the use of this callback obviates the need for this retry merge
logic.
A driver may only specify .mmap_prepare() or the deprecated .mmap()
callback. In future we may add further callbacks beyond .mmap_prepare() to
facilitate all use cases as we convert drivers.
In researching this change, I examined every .mmap() callback, and
discovered only a very few that set VMA state in such a way that a. the
VMA flags changed and b. this would be mergeable.
In the majority of cases, it turns out that drivers are mapping kernel
memory and thus ultimately set VM_PFNMAP, VM_MIXEDMAP, or other
unmergeable VM_SPECIAL flags.
Of those that remain I identified a number of cases which are only
applicable in DAX, setting the VM_HUGEPAGE flag:
* dax_mmap()
* erofs_file_mmap()
* ext4_file_mmap()
* xfs_file_mmap()
For this remerge to not occur and to impact users, each of these cases
would require a user to mmap() files using DAX, in parts, immediately
adjacent to one another.
This is a very unlikely use case and so it does not appear to be worthwhile
to adjust this functionality accordingly.
We can, however, very quickly do so if needed by simply adding an
.mmap_prepare() callback to these as required.
There are two further non-DAX cases I identified:
* orangefs_file_mmap() - Clears VM_RAND_READ if set, replacing with
VM_SEQ_READ.
* usb_stream_hwdep_mmap() - Sets VM_DONTDUMP.
Both of these cases again seem very unlikely to be mmap()'d immediately
adjacent to one another in a fashion that would result in a merge.
Finally, we are left with a viable case:
* secretmem_mmap() - Set VM_LOCKED, VM_DONTDUMP.
This is viable enough that the mm selftests trigger the logic as a matter
of course. Therefore, this series replaces the .secretmem_mmap() hook with
.secret_mmap_prepare().
This patch (of 3):
Provide a means by which drivers can specify which fields of those
permitted to be changed should be altered prior to mmap()'ing a range
(which may either result from a merge or from mapping an entirely new
VMA).
Doing so is substantially safer than the existing .mmap() callback which
provides unrestricted access to the part-constructed VMA and permits
drivers and file systems to do 'creative' things which makes it hard to
reason about the state of the VMA after the function returns.
The existing .mmap() callback's freedom has caused a great deal of issues,
especially in error handling, as unwinding the mmap() state has proven to
be non-trivial and caused significant issues in the past, for instance
those addressed in commit 5de195060b2e ("mm: resolve faulty mmap_region()
error path behaviour").
It also necessitates a second attempt at merge once the .mmap() callback
has completed, which has caused issues in the past, is awkward, adds
overhead and is difficult to reason about.
The .mmap_prepare() callback eliminates this requirement, as we can update
fields prior to even attempting the first merge. It is safer, as we
heavily restrict what can actually be modified, and being invoked very
early in the mmap() process, error handling can be performed safely with
very little unwinding of state required.
The .mmap_prepare() and deprecated .mmap() callbacks are mutually
exclusive, so we permit only one to be invoked at a time.
Update vma userland test stubs to account for changes.
Link: https://lkml.kernel.org/r/cover.1746792520.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/adb36a7c4affd7393b2fc4b54cc5cfe211e41f71.1746792520.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
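
/*
 * Illustrative sketch (hypothetical driver hook, not part of this header):
 * an .mmap_prepare() implementation adjusts only the whitelisted
 * vm_area_desc fields up front, rather than poking at a part-constructed
 * VMA the way a legacy .mmap() callback would.
 */
static inline int example_driver_mmap_prepare(struct vm_area_desc *desc)
{
	/* Only the mutable/write-only fields of the descriptor may be set. */
	desc->vm_flags |= VM_LOCKED;		/* e.g. a secretmem-style mapping */
	desc->page_prot = vm_get_page_prot(desc->vm_flags);
	desc->private_data = NULL;		/* hypothetical: no per-mapping state */

	return 0;
}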

/* Did the driver provide valid mmap hook configuration? */
static inline bool can_mmap_file(struct file *file)
{
	bool has_mmap = file->f_op->mmap;
	bool has_mmap_prepare = file->f_op->mmap_prepare;

	/* Hooks are mutually exclusive. */
	if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
		return false;
	if (!has_mmap && !has_mmap_prepare)
		return false;

	return true;
}

static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (file->f_op->mmap_prepare)
		return compat_vma_mmap_prepare(file, vma);

	return file->f_op->mmap(file, vma);
}

static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
{
	return file->f_op->mmap_prepare(desc);
}

static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
	(void)vma;
}

static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	swap(vma->vm_file, file);
	fput(file);
}

mm: prevent KSM from breaking VMA merging for new VMAs
If a user wishes to enable KSM mergeability for an entire process and all
fork/exec'd processes that come after it, they use the prctl()
PR_SET_MEMORY_MERGE operation.
This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
(in order to indicate they are KSM mergeable), as well as setting this
flag for all existing VMAs and propagating this across fork/exec.
However it also breaks VMA merging for new VMAs, both in the process and
all forked (and fork/exec'd) child processes.
This is because when a new mapping is proposed, the flags specified will
never have VM_MERGEABLE set. However all adjacent VMAs will already have
VM_MERGEABLE set, rendering VMAs unmergeable by default.
To work around this, we try to set the VM_MERGEABLE flag prior to
attempting a merge. In the case of brk() this can always be done.
However on mmap() things are more complicated - while KSM is not supported
for MAP_SHARED file-backed mappings, it is supported for MAP_PRIVATE
file-backed mappings.
These mappings may have deprecated .mmap() callbacks specified which
could, in theory, adjust flags and thus KSM eligibility.
So we check to determine whether this is possible. If not, we set
VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
previous behaviour.
This fixes VMA merging for all new anonymous mappings, which covers the
majority of real-world cases, so we should see a significant improvement
in VMA mergeability.
For MAP_PRIVATE file-backed mappings, those which implement the
.mmap_prepare() hook and shmem are both known to be safe, so we allow
these, disallowing all other cases.
Also add stubs for newly introduced function invocations to VMA userland
testing.
[lorenzo.stoakes@oracle.com: correctly invoke late KSM check after mmap hook]
Link: https://lkml.kernel.org/r/5861f8f6-cf5a-4d82-a062-139fb3f9cddb@lucifer.local
Link: https://lkml.kernel.org/r/3ba660af716d87a18ca5b4e635f2101edeb56340.1748537921.git.lorenzo.stoakes@oracle.com
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") # please no backport!
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Xu Xin <xu.xin16@zte.com.cn>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Stefan Roesch <shr@devkernel.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
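
/*
 * Userland sketch (illustrative only, kept as a comment so the header itself
 * stays unaffected; assumes kernel headers that define PR_SET_MEMORY_MERGE):
 * this is the prctl() described above, which opts the whole process (and its
 * future children) into KSM merging and is what causes VM_MERGEABLE to be set
 * on newly mapped VMAs.
 *
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	static int enable_process_ksm(void)
 *	{
 *		return prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
 *	}
 */
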
static inline bool shmem_file(struct file *)
{
	return false;
}

static inline vm_flags_t ksm_vma_flags(const struct mm_struct *, const struct file *,
				       vm_flags_t vm_flags)
{
	return vm_flags;
}

#endif /* __MM_VMA_INTERNAL_H */