/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vma.h
 *
 * Core VMA manipulation API implemented in vma.c.
 */
#ifndef __MM_VMA_H
#define __MM_VMA_H

/*
 * VMA lock generalization
 */
struct vma_prepare {
	struct vm_area_struct *vma;
	struct vm_area_struct *adj_next;
	struct file *file;
	struct address_space *mapping;
	struct anon_vma *anon_vma;
	struct vm_area_struct *insert;
	struct vm_area_struct *remove;
	struct vm_area_struct *remove2;

	bool skip_vma_uprobe :1;
};
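
/*
 * Illustrative sketch only (not part of the API contract): a caller in vma.c
 * typically fills a vma_prepare, takes the relevant locks, mutates the VMA and
 * the VMA tree, then releases everything. The helper names below mirror the
 * vma.c helpers and are shown here purely as an outline:
 *
 *	struct vma_prepare vp;
 *
 *	init_vma_prep(&vp, vma);		// populate vma/file/anon_vma fields
 *	vma_prepare(&vp);			// lock rmap/mapping state
 *	// ... adjust vma->vm_start/vm_end and the maple tree ...
 *	vma_complete(&vp, vmi, vma->vm_mm);	// unlock, fput(), uprobe_mmap(), ...
 */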

struct unlink_vma_file_batch {
	int count;
	struct vm_area_struct *vmas[8];
};
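
/*
 * Illustrative sketch only: the batch amortises i_mmap lock traffic when
 * unlinking many file-backed VMAs. Assuming the unlink_file_vma_batch_*()
 * helpers, usage looks roughly like:
 *
 *	struct unlink_vma_file_batch vb;
 *
 *	unlink_file_vma_batch_init(&vb);
 *	for_each_vma(vmi, vma)
 *		unlink_file_vma_batch_add(&vb, vma);
 *	unlink_file_vma_batch_final(&vb);	// flush any remaining entries
 */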

/*
 * vma munmap operation
 */
struct vma_munmap_struct {
	struct vma_iterator *vmi;
	struct vm_area_struct *vma;	/* The first vma to munmap */
	struct vm_area_struct *prev;	/* vma before the munmap area */
	struct vm_area_struct *next;	/* vma after the munmap area */
	struct list_head *uf;		/* Userfaultfd list_head */
	unsigned long start;		/* Aligned start addr (inclusive) */
	unsigned long end;		/* Aligned end addr (exclusive) */
	unsigned long unmap_start;	/* Unmap PTE start */
	unsigned long unmap_end;	/* Unmap PTE end */
	int vma_count;			/* Number of vmas that will be removed */
	bool unlock;			/* Unlock after the munmap */
	bool clear_ptes;		/* If there are outstanding PTE to be cleared */
	/* 2 byte hole */
	unsigned long nr_pages;		/* Number of pages being removed */
	unsigned long locked_vm;	/* Number of locked pages */
	unsigned long nr_accounted;	/* Number of VM_ACCOUNT pages */
	unsigned long exec_vm;
	unsigned long stack_vm;
	unsigned long data_vm;
};
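
/*
 * Worked example (illustrative only): munmap()ing [0x5000, 0x7000) out of a
 * single VMA spanning [0x4000, 0x8000) would be described roughly as:
 *
 *	vms.start = 0x5000, vms.end = 0x7000	// aligned munmap range
 *	vms.vma					// first VMA inside the range
 *	vms.prev / vms.next			// VMAs (or splits) either side
 *	vms.vma_count = 1			// one VMA removed
 *	vms.unmap_start / vms.unmap_end		// PTE range actually cleared
 *
 * The accounting fields (nr_pages, locked_vm, exec_vm, ...) are accumulated
 * while gathering the VMAs and applied to the mm once the unmap commits.
 */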
enum vma_merge_state {
	VMA_MERGE_START,
	VMA_MERGE_ERROR_NOMEM,
	VMA_MERGE_NOMERGE,
	VMA_MERGE_SUCCESS,
};

/*
 * Describes a VMA merge operation and is threaded throughout it.
 *
 * Any of the fields may be mutated by the merge operation, so no guarantees
 * are made as to the contents of this structure after a merge operation has
 * completed.
 */
struct vma_merge_struct {
	struct mm_struct *mm;
	struct vma_iterator *vmi;
	/*
	 * Adjacent VMAs, any of which may be NULL if not present:
	 *
	 * |------|--------|------|
	 * | prev | middle | next |
	 * |------|--------|------|
	 *
	 * middle may not yet exist in the case of a proposed new VMA being
	 * merged, or it may be an existing VMA.
	 *
	 * next may be assigned by the caller.
	 */
	struct vm_area_struct *prev;
	struct vm_area_struct *middle;
	struct vm_area_struct *next;

	/* This is the VMA we ultimately target to become the merged VMA. */
	struct vm_area_struct *target;
	/*
	 * Initially, the start, end, pgoff fields are provided by the caller
	 * and describe the proposed new VMA range, whether modifying an
	 * existing VMA (which will be 'middle'), or adding a new one.
	 *
	 * During the merge process these fields are updated to describe the
	 * new range _including those VMAs which will be merged_.
	 */
	unsigned long start;
	unsigned long end;
	pgoff_t pgoff;

	vm_flags_t vm_flags;
	struct file *file;
	struct anon_vma *anon_vma;
	struct mempolicy *policy;
	struct vm_userfaultfd_ctx uffd_ctx;
	struct anon_vma_name *anon_name;
	enum vma_merge_state state;

	/* Flags which callers can use to modify merge behaviour: */

	/*
	 * If we can expand, simply do so. We know there is nothing to merge to
	 * the right. Does not reset state upon failure to merge. The VMA
	 * iterator is assumed to be positioned at the previous VMA, rather
	 * than at the gap.
	 */
	bool just_expand :1;

	/*
	 * If a merge is possible, but an OOM error occurs, give up and don't
	 * execute the merge, returning NULL.
	 */
	bool give_up_on_oom :1;

	/*
	 * If set, skip calling uprobe_mmap() on the merged VMA.
	 */
	bool skip_vma_uprobe :1;

	/* Internal flags set during merge process: */

	/*
	 * Internal flag indicating the merge increases vmg->middle->vm_start
	 * (and thereby, vmg->prev->vm_end).
	 */
	bool __adjust_middle_start :1;
	/*
	 * Internal flag indicating the merge decreases vmg->next->vm_start
	 * (and thereby, vmg->middle->vm_end).
	 */
	bool __adjust_next_start :1;
	/*
	 * Internal flag used during the merge operation to indicate we will
	 * remove vmg->middle.
	 */
	bool __remove_middle :1;
	/*
	 * Internal flag used during the merge operation to indicate we will
	 * remove vmg->next.
	 */
	bool __remove_next :1;
};
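
/*
 * Illustrative sketch (not normative): when an existing 'middle' VMA merges
 * with compatible neighbours on both sides, the fields evolve roughly as:
 *
 *	before:	| prev | middle | next |	vmg->middle == middle
 *	after:	|        prev          |	vmg->target == prev,
 *						__remove_middle and __remove_next
 *						set, state == VMA_MERGE_SUCCESS
 *
 * A new-VMA merge has the same shape, except vmg->middle == NULL and the gap
 * is described by vmg->start/end/pgoff instead.
 */
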
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
{
	return vmg->state == VMA_MERGE_ERROR_NOMEM;
}
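
/*
 * Example (illustrative only): callers use vmg_nomem() to tell an allocation
 * failure apart from an ordinary "no merge possible" result, e.g.:
 *
 *	vma = vma_merge_new_range(&vmg);
 *	if (!vma) {
 *		if (vmg_nomem(&vmg))
 *			return -ENOMEM;	// allocation failure
 *		// otherwise: simply nothing to merge with
 *	}
 */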

/* Assumes addr >= vma->vm_start. */
static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
				       unsigned long addr)
{
	return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
}
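
/*
 * Worked example: with 4KiB pages, a VMA with vm_start == 0x1000 and
 * vm_pgoff == 10 gives vma_pgoff_offset(vma, 0x3000) ==
 * 10 + PHYS_PFN(0x2000) == 12, i.e. the file page backing address 0x3000.
 */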

#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_)	\
	struct vma_merge_struct name = {				\
		.mm = mm_,						\
		.vmi = vmi_,						\
		.start = start_,					\
		.end = end_,						\
		.vm_flags = vm_flags_,					\
		.pgoff = pgoff_,					\
.state = VMA_MERGE_START, \
|
2024-08-30 19:10:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
|
|
|
|
struct vma_merge_struct name = { \
|
|
|
|
.mm = vma_->vm_mm, \
|
|
|
|
.vmi = vmi_, \
|
|
|
|
.prev = prev_, \
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug-prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been handled by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
.middle = vma_, \
|
2024-08-30 19:10:15 +01:00
|
|
|
.next = NULL, \
|
|
|
|
.start = start_, \
|
|
|
|
.end = end_, \
|
2025-06-18 20:42:53 +01:00
|
|
|
.vm_flags = vma_->vm_flags, \
|
2024-08-30 19:10:15 +01:00
|
|
|
.pgoff = vma_pgoff_offset(vma_, start_), \
|
|
|
|
.file = vma_->vm_file, \
|
|
|
|
.anon_vma = vma_->anon_vma, \
|
|
|
|
.policy = vma_policy(vma_), \
|
|
|
|
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
|
|
|
|
.anon_name = anon_vma_name(vma_), \
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic, eliminate all of the open-coding, and use the same
logic for all cases where we add new VMAs - ultimately using vma_expand()
rather than vma_merge().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report errors,
allowing us to differentiate a failed merge from an inability to allocate
memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev's and
next's anon_vmas may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
.state = VMA_MERGE_START, \
|
2024-08-30 19:10:15 +01:00
|
|
|
}
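/*
 * A caller-side sketch (hypothetical, not in-tree code) showing how the
 * VMG_STATE() initialiser, vma_merge_new_range() and vmg_nomem() are
 * intended to fit together; the example_* name and the 0/1/-ENOMEM return
 * convention are illustrative only.
 */
static inline int example_try_new_merge(struct mm_struct *mm,
		struct vma_iterator *vmi, unsigned long start,
		unsigned long end, vm_flags_t vm_flags, pgoff_t pgoff)
{
	VMG_STATE(vmg, mm, vmi, start, end, vm_flags, pgoff);

	/* Locate the VMAs bracketing the gap, rewinding to before it. */
	vma_iter_set(vmi, start);
	vmg.next = vma_iter_next_rewind(vmi, &vmg.prev);

	if (vma_merge_new_range(&vmg))
		return 0;	/* An adjacent VMA was expanded over the range. */
	if (vmg_nomem(&vmg))
		return -ENOMEM;	/* Allocation failed mid-merge. */

	return 1;		/* No merge possible; caller allocates a new VMA. */
}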
|
|
|
|
|
2024-07-29 12:50:38 +01:00
|
|
|
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
|
|
|
|
void validate_mm(struct mm_struct *mm);
|
|
|
|
#else
|
|
|
|
#define validate_mm(mm) do { } while (0)
|
|
|
|
#endif
|
|
|
|
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check int vma_expand(struct vma_merge_struct *vmg);
|
|
|
|
__must_check int vma_shrink(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end, pgoff_t pgoff);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
2024-08-30 00:00:55 -04:00
|
|
|
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *vma, gfp_t gfp)
|
|
|
|
|
|
|
|
{
|
|
|
|
if (vmi->mas.status != ma_start &&
|
|
|
|
((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
|
|
|
|
vma_iter_invalidate(vmi);
|
|
|
|
|
|
|
|
__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
|
|
|
|
mas_store_gfp(&vmi->mas, vma, gfp);
|
|
|
|
if (unlikely(mas_is_err(&vmi->mas)))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2025-02-13 14:46:40 -08:00
|
|
|
vma_mark_attached(vma);
|
2024-08-30 00:00:55 -04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2025-06-09 17:57:49 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Temporary helper functions for file systems which wrap an invocation of
|
|
|
|
* f_op->mmap() but which might have an underlying file system which implements
|
|
|
|
* f_op->mmap_prepare().
|
|
|
|
*/
|
|
|
|
|
|
|
|
static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
|
|
|
|
struct vm_area_desc *desc)
|
|
|
|
{
|
|
|
|
desc->mm = vma->vm_mm;
|
|
|
|
desc->start = vma->vm_start;
|
|
|
|
desc->end = vma->vm_end;
|
|
|
|
|
|
|
|
desc->pgoff = vma->vm_pgoff;
|
|
|
|
desc->file = vma->vm_file;
|
|
|
|
desc->vm_flags = vma->vm_flags;
|
|
|
|
desc->page_prot = vma->vm_page_prot;
|
|
|
|
|
|
|
|
desc->vm_ops = NULL;
|
|
|
|
desc->private_data = NULL;
|
|
|
|
|
|
|
|
return desc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_vma_from_desc(struct vm_area_struct *vma,
|
|
|
|
struct vm_area_desc *desc)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Since we're invoking .mmap_prepare() despite having a partially
|
|
|
|
* established VMA, we must take care to handle setting fields
|
|
|
|
* correctly.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Mutable fields. Populated with initial state. */
|
|
|
|
vma->vm_pgoff = desc->pgoff;
|
|
|
|
if (vma->vm_file != desc->file)
|
|
|
|
vma_set_file(vma, desc->file);
|
|
|
|
if (vma->vm_flags != desc->vm_flags)
|
|
|
|
vm_flags_set(vma, desc->vm_flags);
|
|
|
|
vma->vm_page_prot = desc->page_prot;
|
|
|
|
|
|
|
|
/* User-defined fields. */
|
|
|
|
vma->vm_ops = desc->vm_ops;
|
|
|
|
vma->vm_private_data = desc->private_data;
|
|
|
|
}
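/*
 * A minimal sketch (hypothetical wrapper, not the in-tree helper) of how the
 * two functions above are meant to be used: translate the partially set up
 * VMA into a descriptor, call the underlying filesystem's ->mmap_prepare()
 * hook, then copy the results back into the VMA.
 */
static inline int example_wrap_mmap_prepare(struct file *file,
					    struct vm_area_struct *vma)
{
	struct vm_area_desc desc;
	int err;

	err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
	if (err)
		return err;

	set_vma_from_desc(vma, &desc);
	return 0;
}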
|
|
|
|
|
2024-07-29 12:50:38 +01:00
|
|
|
int
|
|
|
|
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
|
|
|
struct mm_struct *mm, unsigned long start,
|
|
|
|
unsigned long end, struct list_head *uf, bool unlock);
|
|
|
|
|
|
|
|
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
|
|
|
|
unsigned long start, size_t len, struct list_head *uf,
|
|
|
|
bool unlock);
|
|
|
|
|
2025-02-13 14:46:54 -08:00
|
|
|
void remove_vma(struct vm_area_struct *vma);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
2024-08-30 00:00:53 -04:00
|
|
|
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
|
|
|
|
struct vm_area_struct *prev, struct vm_area_struct *next);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
|
|
|
/* We are about to modify the VMA's flags. */
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check struct vm_area_struct
|
|
|
|
*vma_modify_flags(struct vma_iterator *vmi,
|
2024-08-30 19:10:15 +01:00
|
|
|
struct vm_area_struct *prev, struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end,
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
2025-07-14 14:58:39 +01:00
|
|
|
/* We are about to modify the VMA's anon_name. */
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check struct vm_area_struct
|
2025-07-14 14:58:39 +01:00
|
|
|
*vma_modify_name(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *prev,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end,
|
|
|
|
struct anon_vma_name *new_name);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
|
|
|
/* We are about to modify the VMA's memory policy. */
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check struct vm_area_struct
|
2024-07-29 12:50:38 +01:00
|
|
|
*vma_modify_policy(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *prev,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end,
|
2024-08-30 19:10:15 +01:00
|
|
|
struct mempolicy *new_pol);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
|
|
|
/* We are about to modify the VMA's flags and/or uffd context. */
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check struct vm_area_struct
|
2024-07-29 12:50:38 +01:00
|
|
|
*vma_modify_flags_uffd(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *prev,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end,
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags,
|
mm/vma: add give_up_on_oom option on modify/merge, use in uffd release
Currently, if a VMA merge fails due to an OOM condition arising on commit
merge or a failure to duplicate anon_vma's, we report this so the caller
can handle it.
However there are cases where the caller is only ostensibly trying a
merge, and doesn't mind if it fails due to this condition.
Since we do not want to introduce an implicit assumption that we only
actually modify VMAs after OOM conditions might arise, add a 'give up on
oom' option and make an explicit contract that, should this flag be set, we
absolutely will not modify any VMAs should OOM arise and just bail out.
Since it'd be very unusual for a user to try to vma_modify() with this flag
set but be specifying a range within a VMA which ends up being split (which
can fail due to rlimit issues, not only OOM), we add a debug warning for
this condition.
The motivating reason for this is uffd release - syzkaller (and Pedro
Falcato's VERY astute analysis) found a way in which an injected fault on
allocation, triggering an OOM condition on commit merge, would result in
uffd code becoming confused and treating an error value as if it were a VMA
pointer.
To avoid this, we make use of this new VMG flag to ensure that this never
occurs, utilising the fact that, should we be clearing entire VMAs, we do
not wish an OOM event to be reported to us.
Many thanks to Pedro Falcato for his excellent analysis and Jann Horn for
his insightful and intelligent analysis of the situation, both of whom were
instrumental in this fix.
Link: https://lkml.kernel.org/r/20250321100937.46634-1-lorenzo.stoakes@oracle.com
Reported-by: syzbot+20ed41006cf9d842c2b5@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67dc67f0.050a0220.25ae54.001e.GAE@google.com/
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Pedro Falcato <pfalcato@suse.de>
Suggested-by: Jann Horn <jannh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-03-21 10:09:37 +00:00
|
|
|
struct vm_userfaultfd_ctx new_ctx,
|
|
|
|
bool give_up_on_oom);
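/*
 * Illustrative call (hypothetical caller mirroring the uffd-release pattern
 * described in the commit text above; NULL_VM_UFFD_CTX comes from
 * userfaultfd_k.h): pass give_up_on_oom as true when the merge is merely
 * opportunistic and VMAs must not be touched if an allocation fails.
 */
static inline struct vm_area_struct *example_clear_uffd_ctx(
		struct vma_iterator *vmi, struct vm_area_struct *prev,
		struct vm_area_struct *vma, vm_flags_t new_flags)
{
	/*
	 * With give_up_on_oom set, an OOM during the merge attempt leaves
	 * every VMA untouched and is not reported back as an error.
	 */
	return vma_modify_flags_uffd(vmi, prev, vma, vma->vm_start,
				     vma->vm_end, new_flags, NULL_VM_UFFD_CTX,
				     /* give_up_on_oom = */ true);
}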
|
2024-07-29 12:50:38 +01:00
|
|
|
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check struct vm_area_struct
|
|
|
|
*vma_merge_new_range(struct vma_merge_struct *vmg);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
2024-12-06 22:50:36 +00:00
|
|
|
__must_check struct vm_area_struct
|
|
|
|
*vma_merge_extend(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long delta);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
|
|
|
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
|
|
|
|
|
|
|
|
void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);
|
|
|
|
|
|
|
|
void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
|
|
|
|
struct vm_area_struct *vma);
|
|
|
|
|
|
|
|
void unlink_file_vma(struct vm_area_struct *vma);
|
|
|
|
|
|
|
|
void vma_link_file(struct vm_area_struct *vma);
|
|
|
|
|
|
|
|
int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);
|
|
|
|
|
|
|
|
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
|
|
|
|
unsigned long addr, unsigned long len, pgoff_t pgoff,
|
|
|
|
bool *need_rmap_locks);
|
|
|
|
|
|
|
|
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);
|
|
|
|
|
|
|
|
bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
|
|
|
|
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
|
|
|
|
|
|
|
|
int mm_take_all_locks(struct mm_struct *mm);
|
|
|
|
void mm_drop_all_locks(struct mm_struct *mm);
|
|
|
|
|
2025-01-02 12:10:52 +00:00
|
|
|
unsigned long mmap_region(struct file *file, unsigned long addr,
|
2024-10-25 13:26:24 +01:00
|
|
|
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
|
|
|
|
struct list_head *uf);
|
|
|
|
|
mm/vma: move brk() internals to mm/vma.c
Patch series "mm/vma: make more mmap logic userland testable".
This series carries on the work started in previous series and
continued in commit 52956b0d7fb9 ("mm: isolate mmap internal logic to
mm/vma.c"), moving the remainder of memory mapping implementation
details logic into mm/vma.c allowing the bulk of the mapping logic to
be unit tested.
It is highly useful to do so, as this means we can both fundamentally test
this core logic, and introduce regression tests to ensure any issues
previously resolved do not recur.
Vitally, this includes the do_brk_flags() function, meaning we have both
core means of userland mapping memory now testable.
Performance testing was performed after this change given the brk() system
call's sensitivity to change, and no performance regression was observed.
The stack expansion logic is also moved into mm/vma.c, which necessitates
a change in the API exposed to the exec code, removing the invocation of
the expand_downwards() function used in get_arg_page() and instead adding
mmap_read_lock_maybe_expand() to wrap this.
This patch (of 5):
Now we have moved mmap_region() internals to mm/vma.c, making it available
to userland testing, it makes sense to do the same with brk().
This continues the pattern of VMA heavy lifting being done in mm/vma.c in
an environment where it can be subject to straightforward unit and
regression testing, with other VMA-adjacent files becoming wrappers around
this functionality.
[lorenzo.stoakes@oracle.com: add missing personality header import]
Link: https://lkml.kernel.org/r/2a717265-985f-45eb-9257-8b2857088ed4@lucifer.local
Link: https://lkml.kernel.org/r/cover.1733248985.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/3d24b9e67bb0261539ca921d1188a10a1b4d4357.1733248985.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-12-03 18:05:08 +00:00
|
|
|
int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
|
|
|
|
unsigned long addr, unsigned long request, unsigned long flags);
|
|
|
|
|
2024-12-03 18:05:09 +00:00
|
|
|
unsigned long unmapped_area(struct vm_unmapped_area_info *info);
|
|
|
|
unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
|
|
|
|
|
2024-07-29 12:50:38 +01:00
|
|
|
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We want to check manually if we can change individual PTEs writable
|
|
|
|
* if we can't do that automatically for all PTEs in a mapping. For
|
|
|
|
* private mappings, that's always the case when we have write
|
|
|
|
* permissions as we properly have to handle COW.
|
|
|
|
*/
|
|
|
|
if (vma->vm_flags & VM_SHARED)
|
|
|
|
return vma_wants_writenotify(vma, vma->vm_page_prot);
|
|
|
|
return !!(vma->vm_flags & VM_WRITE);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_MMU
|
2025-06-18 20:42:53 +01:00
|
|
|
static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags)
|
2024-07-29 12:50:38 +01:00
|
|
|
{
|
|
|
|
return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
|
|
|
|
unsigned long min)
|
|
|
|
{
|
|
|
|
return mas_prev(&vmi->mas, min);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These three helpers classify VMAs for virtual memory accounting.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Executable code area - executable, not writable, not stack
|
|
|
|
*/
|
|
|
|
static inline bool is_exec_mapping(vm_flags_t flags)
|
|
|
|
{
|
|
|
|
return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stack area (including shadow stacks)
|
|
|
|
*
|
|
|
|
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
|
|
|
|
* do_mmap() forbids all other combinations.
|
|
|
|
*/
|
|
|
|
static inline bool is_stack_mapping(vm_flags_t flags)
|
|
|
|
{
|
|
|
|
return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Data area - private, writable, not stack
|
|
|
|
*/
|
|
|
|
static inline bool is_data_mapping(vm_flags_t flags)
|
|
|
|
{
|
|
|
|
return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
|
|
|
|
}
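/*
 * Sketch of how these predicates feed the per-mm counters (illustrative
 * only; the real accounting helper lives in the mapping code, not here).
 */
static inline void example_vm_stat_account(struct mm_struct *mm,
		vm_flags_t flags, long npages)
{
	WRITE_ONCE(mm->total_vm, mm->total_vm + npages);

	if (is_exec_mapping(flags))
		mm->exec_vm += npages;
	else if (is_stack_mapping(flags))
		mm->stack_vm += npages;
	else if (is_data_mapping(flags))
		mm->data_vm += npages;
}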
|
|
|
|
|
|
|
|
|
|
|
|
static inline void vma_iter_config(struct vma_iterator *vmi,
|
|
|
|
unsigned long index, unsigned long last)
|
|
|
|
{
|
|
|
|
__mas_set_range(&vmi->mas, index, last - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_iter_reset(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
mas_reset(&vmi->mas);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
|
|
|
|
{
|
|
|
|
return mas_prev_range(&vmi->mas, min);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
|
|
|
|
{
|
|
|
|
return mas_next_range(&vmi->mas, max);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
|
|
|
|
unsigned long max, unsigned long size)
|
|
|
|
{
|
|
|
|
return mas_empty_area(&vmi->mas, min, max - 1, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
|
|
|
|
unsigned long max, unsigned long size)
|
|
|
|
{
|
|
|
|
return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* VMA Iterator functions shared between nommu and mmap
|
|
|
|
*/
|
|
|
|
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_iter_clear(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
mas_store_prealloc(&vmi->mas, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
return mas_walk(&vmi->mas);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Store a VMA with preallocated memory */
|
2025-02-13 14:46:41 -08:00
|
|
|
static inline void vma_iter_store_overwrite(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *vma)
|
2024-07-29 12:50:38 +01:00
|
|
|
{
|
2025-02-13 14:46:41 -08:00
|
|
|
vma_assert_attached(vma);
|
2024-07-29 12:50:38 +01:00
|
|
|
|
|
|
|
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
|
|
|
|
if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
|
|
|
|
vmi->mas.index > vma->vm_start)) {
|
|
|
|
pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
|
|
|
|
vmi->mas.index, vma->vm_start, vma->vm_start,
|
|
|
|
vma->vm_end, vmi->mas.index, vmi->mas.last);
|
|
|
|
}
|
|
|
|
if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
|
|
|
|
vmi->mas.last < vma->vm_start)) {
|
|
|
|
pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
|
|
|
|
vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
|
|
|
|
vmi->mas.index, vmi->mas.last);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (vmi->mas.status != ma_start &&
|
|
|
|
((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
|
|
|
|
vma_iter_invalidate(vmi);
|
|
|
|
|
|
|
|
__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
|
|
|
|
mas_store_prealloc(&vmi->mas, vma);
|
2025-02-13 14:46:41 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void vma_iter_store_new(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
2025-02-13 14:46:40 -08:00
|
|
|
vma_mark_attached(vma);
|
2025-02-13 14:46:41 -08:00
|
|
|
vma_iter_store_overwrite(vmi, vma);
|
2024-07-29 12:50:38 +01:00
|
|
|
}
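/*
 * Sketch of the usual preallocate-then-store pattern the helpers above are
 * built around (hypothetical caller; the example_* name is illustrative).
 */
static inline int example_insert_vma(struct vma_iterator *vmi,
				     struct vm_area_struct *vma)
{
	vma_iter_config(vmi, vma->vm_start, vma->vm_end);

	/* Reserve maple tree nodes up front so the store itself cannot fail. */
	if (vma_iter_prealloc(vmi, vma))
		return -ENOMEM;

	vma_iter_store_new(vmi, vma);
	return 0;
}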
|
|
|
|
|
|
|
|
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
return vmi->mas.index;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
return vmi->mas.last + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
|
|
|
|
unsigned long count)
|
|
|
|
{
|
|
|
|
return mas_expected_entries(&vmi->mas, count);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
return mas_prev_range(&vmi->mas, 0);
|
|
|
|
}
|
|
|
|
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic, eliminate all of the open-coding, and use the same
logic for all cases where we add new VMAs - ultimately using vma_expand()
rather than vma_merge().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report errors,
allowing us to differentiate a failed merge from an inability to allocate
memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev's and
next's anon_vmas may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
/*
|
|
|
|
* Retrieve the next VMA and rewind the iterator to end of the previous VMA, or
|
|
|
|
* if no previous VMA, to index 0.
|
|
|
|
*/
|
|
|
|
static inline
|
|
|
|
struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
|
|
|
|
struct vm_area_struct **pprev)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *next = vma_next(vmi);
|
|
|
|
struct vm_area_struct *prev = vma_prev(vmi);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Consider the case where no previous VMA exists. We advance to the
|
|
|
|
* next VMA, skipping any gap, then rewind to the start of the range.
|
|
|
|
*
|
|
|
|
* If we were to unconditionally advance to the next range we'd wind up
|
|
|
|
* at the next VMA again, so we check to ensure there is a previous VMA
|
|
|
|
* to skip over.
|
|
|
|
*/
|
|
|
|
if (prev)
|
|
|
|
vma_iter_next_range(vmi);
|
|
|
|
|
|
|
|
if (pprev)
|
|
|
|
*pprev = prev;
|
|
|
|
|
|
|
|
return next;
|
|
|
|
}
|
|
|
|
|
2024-08-17 01:18:28 +01:00
|
|
|
#ifdef CONFIG_64BIT
|
|
|
|
static inline bool vma_is_sealed(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return (vma->vm_flags & VM_SEALED);
|
|
|
|
}
|
|
|
|
#else
|
2025-07-25 09:29:43 +01:00
|
|
|
static inline bool vma_is_sealed(struct vm_area_struct *vma)
|
2024-08-17 01:18:28 +01:00
|
|
|
{
|
2025-07-25 09:29:43 +01:00
|
|
|
return false;
|
2024-08-17 01:18:28 +01:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2024-12-03 18:05:11 +00:00
|
|
|
#if defined(CONFIG_STACK_GROWSUP)
|
|
|
|
int expand_upwards(struct vm_area_struct *vma, unsigned long address);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
int expand_downwards(struct vm_area_struct *vma, unsigned long address);
|
|
|
|
|
2024-12-03 18:05:12 +00:00
|
|
|
int __vm_munmap(unsigned long start, size_t len, bool unlock);
|
|
|
|
|
2025-04-28 16:28:15 +01:00
|
|
|
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU is not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
/* vma_init.h, shared between CONFIG_MMU and nommu. */
|
|
|
|
void __init vma_state_init(void);
|
|
|
|
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
|
|
|
|
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
|
|
|
|
void vm_area_free(struct vm_area_struct *vma);
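/*
 * Minimal sketch of the allocation API above (hypothetical helper; real
 * callers also link the VMA into the mm, the maple tree and rmap, and
 * handle accounting). vma_set_range() and vm_flags_init() are the usual
 * mm.h helpers for filling in the range and initial flags.
 */
static inline struct vm_area_struct *example_alloc_anon_vma(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma = vm_area_alloc(mm);

	if (!vma)
		return NULL;

	vma_set_range(vma, start, end, 0);
	vm_flags_init(vma, VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE);
	return vma;
}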
|
|
|
|
|
mm: establish mm/vma_exec.c for shared exec/mm VMA functionality
Patch series "move all VMA allocation, freeing and duplication logic to
mm", v3.
Currently VMA allocation, freeing and duplication exist in kernel/fork.c,
which is a violation of separation of concerns, and leaves these functions
exposed to the rest of the kernel when they are in fact internal
implementation details.
Resolve this by moving this logic to mm, and making it internal to vma.c,
vma.h.
This also allows us, in future, to provide userland testing around this
functionality.
We additionally abstract dup_mmap() to mm, being careful to ensure
kernel/fork.c accesses this via the mm internal header so it is not exposed
elsewhere in the kernel.
As part of this change, also abstract initial stack allocation performed
in __bprm_mm_init() out of fs code into mm via the
create_init_stack_vma(), as this code uses vm_area_alloc() and
vm_area_free().
In order to do so sensibly, we introduce a new mm/vma_exec.c file, which
contains the code that is shared by mm and exec. This file is added to
both memory mapping and exec sections in MAINTAINERS so both sets of
maintainers can maintain oversight.
As part of this change, we also move relocate_vma_down() to mm/vma_exec.c
so all shared mm/exec functionality is kept in one place.
We add code shared between nommu and mmu-enabled configurations in order
to share VMA allocation, freeing and duplication code correctly while also
keeping these functions available in userland VMA testing.
This is achieved by adding a mm/vma_init.c file which is also compiled by
the userland tests.
This patch (of 4):
There is functionality that overlaps the exec and memory mapping
subsystems. While it properly belongs in mm, it is important that exec
maintainers maintain oversight of this functionality correctly.
We can establish both goals by adding a new mm/vma_exec.c file which
contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add
the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
scripts/get_maintainer.pl can correctly handle files in multiple entries
and this neatly handles the cross-over.
[akpm@linux-foundation.org: fix comment typo]
Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local
Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:14 +01:00
|
|
|
/* vma_exec.c */
|
|
|
|
#ifdef CONFIG_MMU
|
2025-04-28 16:28:15 +01:00
|
|
|
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
|
|
|
|
unsigned long *top_mem_p);
|
mm: establish mm/vma_exec.c for shared exec/mm VMA functionality
Patch series "move all VMA allocation, freeing and duplication logic to
mm", v3.
Currently VMA allocation, freeing and duplication exist in kernel/fork.c,
which is a violation of separation of concerns, and leaves these functions
exposed to the rest of the kernel when they are in fact internal
implementation details.
Resolve this by moving this logic to mm, and making it internal to vma.c,
vma.h.
This also allows us, in future, to provide userland testing around this
functionality.
We additionally abstract dup_mmap() to mm, being careful to ensure
kernel/fork.c accesses this via the mm internal header so it is not exposed
elsewhere in the kernel.
As part of this change, also abstract initial stack allocation performed
in __bprm_mm_init() out of fs code into mm via the
create_init_stack_vma(), as this code uses vm_area_alloc() and
vm_area_free().
In order to do so sensibly, we introduce a new mm/vma_exec.c file, which
contains the code that is shared by mm and exec. This file is added to
both memory mapping and exec sections in MAINTAINERS so both sets of
maintainers can maintain oversight.
As part of this change, we also move relocate_vma_down() to mm/vma_exec.c
so all shared mm/exec functionality is kept in one place.
We add code shared between nommu and mmu-enabled configurations in order
to share VMA allocation, freeing and duplication code correctly while also
keeping these functions available in userland VMA testing.
This is achieved by adding a mm/vma_init.c file which is also compiled by
the userland tests.
This patch (of 4):
There is functionality that overlaps the exec and memory mapping
subsystems. While it properly belongs in mm, it is important that exec
maintainers maintain oversight of this functionality correctly.
We can establish both goals by adding a new mm/vma_exec.c file which
contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add
the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
scripts/get_maintainer.pl can correctly handle files in multiple entries
and this neatly handles the cross-over.
[akpm@linux-foundation.org: fix comment typo]
Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local
Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:14 +01:00
|
|
|
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
|
|
|
|
#endif
|
|
|
|
|
2024-07-29 12:50:38 +01:00
|
|
|
#endif /* __MM_VMA_H */
|