2024-07-29 12:50:41 +01:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
2024-10-17 17:56:38 +01:00
|
|
|
#include "generated/bit-length.h"
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
#include "maple-shared.h"
|
|
|
|
#include "vma_internal.h"
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
/* Include so that the header guard is set. */
|
|
|
|
#include "../../../mm/vma.h"
|
|
|
|
|
|
|
|
/*
 * Failure-injection switch: while true, vma_iter_prealloc() below reports
 * -ENOMEM without ever calling mas_preallocate().
 */
static bool fail_prealloc;

/*
 * Then override vma_iter_prealloc() so we can choose to fail it. When
 * fail_prealloc is clear this simply forwards to mas_preallocate() on the
 * iterator's maple state with GFP_KERNEL.
 */
#define vma_iter_prealloc(vmi, vma)					\
	(!fail_prealloc ?						\
	 mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL) : -ENOMEM)
|
|
|
|
|
2024-12-03 18:05:09 +00:00
|
|
|
#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536
|
|
|
|
|
|
|
|
unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
|
|
|
|
unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
|
|
|
|
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
/*
|
|
|
|
* Directly import the VMA implementation here. Our vma_internal.h wrapper
|
|
|
|
* provides userland-equivalent functionality for everything vma.c uses.
|
|
|
|
*/
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
#include "../../../mm/vma_init.c"
|
mm: establish mm/vma_exec.c for shared exec/mm VMA functionality
Patch series "move all VMA allocation, freeing and duplication logic to
mm", v3.
Currently VMA allocation, freeing and duplication exist in kernel/fork.c,
which is a violation of separation of concerns, and leaves these functions
exposed to the rest of the kernel when they are in fact internal
implementation details.
Resolve this by moving this logic to mm, and making it internal to vma.c,
vma.h.
This also allows us, in future, to provide userland testing around this
functionality.
We additionally abstract dup_mmap() to mm, being careful to ensure
kernel/fork.c accesses this via the mm internal header so it is not exposed
elsewhere in the kernel.
As part of this change, also abstract initial stack allocation performed
in __bprm_mm_init() out of fs code into mm via the
create_init_stack_vma(), as this code uses vm_area_alloc() and
vm_area_free().
In order to do so sensibly, we introduce a new mm/vma_exec.c file, which
contains the code that is shared by mm and exec. This file is added to
both memory mapping and exec sections in MAINTAINERS so both sets of
maintainers can maintain oversight.
As part of this change, we also move relocate_vma_down() to mm/vma_exec.c
so all shared mm/exec functionality is kept in one place.
We add code shared between nommu and mmu-enabled configurations in order
to share VMA allocation, freeing and duplication code correctly while also
keeping these functions available in userland VMA testing.
This is achieved by adding a mm/vma_init.c file which is also compiled by
the userland tests.
This patch (of 4):
There is functionality that overlaps the exec and memory mapping
subsystems. While it properly belongs in mm, it is important that exec
maintainers maintain oversight of this functionality correctly.
We can establish both goals by adding a new mm/vma_exec.c file which
contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add
the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
scripts/get_maintainer.pl can correctly handle files in multiple entries
and this neatly handles the cross-over.
[akpm@linux-foundation.org: fix comment typo]
Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local
Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:14 +01:00
|
|
|
#include "../../../mm/vma_exec.c"
|
2024-07-29 12:50:41 +01:00
|
|
|
#include "../../../mm/vma.c"
|
|
|
|
|
|
|
|
const struct vm_operations_struct vma_dummy_vm_ops;
|
2024-08-30 19:10:14 +01:00
|
|
|
static struct anon_vma dummy_anon_vma;
|
2024-07-29 12:50:41 +01:00
|
|
|
|
|
|
|
/*
 * Assertion helpers for test functions that return bool.
 *
 * ASSERT_TRUE() evaluates _expr exactly once. If it is false, the file, line,
 * enclosing function name and the stringified expression are written to
 * stderr and the *enclosing function* returns false - so these macros may
 * only be used inside functions whose return type accepts 'false'.
 *
 * The standard C99 identifier __func__ is used rather than the GCC-specific
 * __FUNCTION__ spelling; the printed text is unchanged.
 *
 * ASSERT_FALSE(), ASSERT_EQ() and ASSERT_NE() are thin wrappers which build
 * the expression handed to ASSERT_TRUE(), so the reported expression is the
 * wrapped form (e.g. "(a) == (b)").
 */
#define ASSERT_TRUE(_expr)						\
	do {								\
		if (!(_expr)) {						\
			fprintf(stderr,					\
				"Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \
				__FILE__, __LINE__, __func__, #_expr);	\
			return false;					\
		}							\
	} while (0)
#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr))
#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2))
#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2))
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
/* The one task_struct instance handed out by get_current(). */
static struct task_struct __current;

/*
 * Return the address of the file-local __current object; every call yields
 * the same pointer.
 */
struct task_struct *get_current(void)
{
	struct task_struct *task = &__current;

	return task;
}
|
|
|
|
|
2024-12-03 18:05:11 +00:00
|
|
|
/*
 * Stub: ignores @limit and always returns the largest unsigned long value.
 * NOTE(review): presumably so that no limit check in the code under test can
 * ever trip - confirm against the callers.
 */
unsigned long rlimit(unsigned int limit)
{
	(void)limit;	/* intentionally unused */

	return ~0UL;
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
/* Helper function to simply allocate a VMA. */
|
2024-07-29 12:50:41 +01:00
|
|
|
static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end,
|
|
|
|
pgoff_t pgoff,
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags)
|
2024-07-29 12:50:41 +01:00
|
|
|
{
|
|
|
|
struct vm_area_struct *ret = vm_area_alloc(mm);
|
|
|
|
|
|
|
|
if (ret == NULL)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
ret->vm_start = start;
|
|
|
|
ret->vm_end = end;
|
|
|
|
ret->vm_pgoff = pgoff;
|
2025-06-18 20:42:53 +01:00
|
|
|
ret->__vm_flags = vm_flags;
|
2025-02-13 14:46:41 -08:00
|
|
|
vma_assert_detached(ret);
|
2024-07-29 12:50:41 +01:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
/*
 * Helper function to link an already-allocated VMA into @mm.
 *
 * No allocation happens here; @vma is passed to vma_link() as-is. Returns
 * whatever vma_link() returns, and when that is zero additionally asserts
 * that the VMA is now attached.
 */
static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	int err = vma_link(mm, vma);

	if (err)
		return err;

	vma_assert_attached(vma);
	return 0;
}
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
/*
 * Mark @vma as detached and then free it. The order matters: test code must
 * detach a VMA before handing it to vm_area_free(), otherwise an assert is
 * triggered.
 */
static void detach_free_vma(struct vm_area_struct *vma)
{
	vma_mark_detached(vma);
	vm_area_free(vma);
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
/* Helper function to allocate a VMA and link it to the tree. */
|
|
|
|
static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end,
|
|
|
|
pgoff_t pgoff,
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags)
|
2024-08-30 19:10:14 +01:00
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
if (vma == NULL)
|
|
|
|
return NULL;
|
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
if (attach_vma(mm, vma)) {
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reset this counter which we use to track whether writes have
|
|
|
|
* begun. Linking to the tree will have caused this to be incremented,
|
|
|
|
* which means we will get a false positive otherwise.
|
|
|
|
*/
|
2024-11-22 09:44:15 -08:00
|
|
|
vma->vm_lock_seq = UINT_MAX;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
return vma;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Helper function which provides a wrapper around a merge new VMA operation. */
|
|
|
|
static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg)
|
|
|
|
{
|
2025-02-13 14:46:41 -08:00
|
|
|
struct vm_area_struct *vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
/*
|
|
|
|
* For convenience, get prev and next VMAs. Which the new VMA operation
|
|
|
|
* requires.
|
|
|
|
*/
|
|
|
|
vmg->next = vma_next(vmg->vmi);
|
|
|
|
vmg->prev = vma_prev(vmg->vmi);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
vma_iter_next_range(vmg->vmi);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
vma = vma_merge_new_range(vmg);
|
|
|
|
if (vma)
|
|
|
|
vma_assert_attached(vma);
|
|
|
|
|
|
|
|
return vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper function which provides a wrapper around a merge existing VMA
 * operation.
 *
 * Returns whatever vma_merge_existing_range() returns; a non-NULL result is
 * asserted to be attached first.
 */
static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
{
	struct vm_area_struct *merged = vma_merge_existing_range(vmg);

	if (merged != NULL)
		vma_assert_attached(merged);

	return merged;
}
|
|
|
|
|
|
|
|
/*
 * Helper function which provides a wrapper around the expansion of an existing
 * VMA. The result of vma_expand() is passed straight back to the caller.
 */
static int expand_existing(struct vma_merge_struct *vmg)
{
	int ret = vma_expand(vmg);

	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to reset merge state the associated VMA iterator to a
|
|
|
|
* specified new range.
|
|
|
|
*/
|
|
|
|
static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
|
2025-06-18 20:42:53 +01:00
|
|
|
unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags)
|
2024-08-30 19:10:14 +01:00
|
|
|
{
|
|
|
|
vma_iter_set(vmg->vmi, start);
|
|
|
|
|
|
|
|
vmg->prev = NULL;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remains remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
propose to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
This cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
Int his patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg->middle = NULL;
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg->next = NULL;
|
2025-01-31 12:31:50 +00:00
|
|
|
vmg->target = NULL;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
vmg->start = start;
|
|
|
|
vmg->end = end;
|
|
|
|
vmg->pgoff = pgoff;
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg->vm_flags = vm_flags;
|
2025-01-31 12:31:50 +00:00
|
|
|
|
|
|
|
vmg->just_expand = false;
|
|
|
|
vmg->__remove_middle = false;
|
|
|
|
vmg->__remove_next = false;
|
mm: eliminate adj_start parameter from commit_merge()
Introduce internal vmg->__adjust_middle_start and vmg->__adjust_next_start
merge flags, enabling us to indicate to commit_merge() that we are
performing a merge which either spans only part of vmg->middle, or part of
vmg->next respectively.
In the former instance, we change the start of vmg->middle to match the
attributes of vmg->prev, without spanning all of vmg->middle.
This implies that vmg->prev->vm_end and vmg->middle->vm_start are both
increased to form the new merged VMA (vmg->prev) and the new subsequent
VMA (vmg->middle).
In the latter case, we change the end of vmg->middle to match the
attributes of vmg->next, without spanning all of vmg->next.
This implies that vmg->middle->vm_end and vmg->next->vm_start are both
decreased to form the new merged VMA (vmg->next) and the new prior VMA
(vmg->middle).
Since we now have a stable set of prev, middle, next VMAs threaded through
vmg and with these flags set know what is happening, we can perform the
calculation in commit_merge() instead.
This allows us to drop the confusing adj_start parameter and instead pass
semantic information to commit_merge().
In the latter case the -(middle->vm_end - start) calculation becomes
-(middle->vm-end - vmg->end), however this is correct as vmg->end is set
to the start parameter.
This is because in this case (rather confusingly), we manipulate
vmg->middle, but ultimately return vmg->next, whose range will be
correctly specified. At this point vmg->start, end is the new range for
the prior VMA rather than the merged one.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/bcec0cd980b373a5eb02236cb033034ce1effe42.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:51 +00:00
|
|
|
vmg->__adjust_middle_start = false;
|
|
|
|
vmg->__adjust_next_start = false;
|
2024-08-30 19:10:14 +01:00
|
|
|
}
|
|
|
|
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymously mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chains were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was paid to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
/* Helper function to set both the VMG range and its anon_vma. */
|
|
|
|
static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start,
|
2025-06-18 20:42:53 +01:00
|
|
|
unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped momemory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik von Riel took careful care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik von Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma lock's are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
struct anon_vma *anon_vma)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(vmg, start, end, pgoff, vm_flags);
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped momemory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik von Riel took careful care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik von Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma lock's are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vmg->anon_vma = anon_vma;
|
|
|
|
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
/*
|
|
|
|
* Helper function to try to merge a new VMA.
|
|
|
|
*
|
|
|
|
* Update vmg and the iterator for it and try to merge, otherwise allocate a new
|
|
|
|
* VMA, link it to the maple tree and return it.
|
|
|
|
*/
|
|
|
|
static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
|
|
|
|
struct vma_merge_struct *vmg,
|
|
|
|
unsigned long start, unsigned long end,
|
2025-06-18 20:42:53 +01:00
|
|
|
pgoff_t pgoff, vm_flags_t vm_flags,
|
2024-08-30 19:10:14 +01:00
|
|
|
bool *was_merged)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *merged;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(vmg, start, end, pgoff, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
merged = merge_new(vmg);
|
|
|
|
if (merged) {
|
|
|
|
*was_merged = true;
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg->state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
return merged;
|
|
|
|
}
|
|
|
|
|
|
|
|
*was_merged = false;
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE);
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
return alloc_and_link_vma(mm, start, end, pgoff, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to reset the dummy anon_vma to indicate it has not been
|
|
|
|
* duplicated.
|
|
|
|
*/
|
|
|
|
static void reset_dummy_anon_vma(void)
|
|
|
|
{
|
|
|
|
dummy_anon_vma.was_cloned = false;
|
|
|
|
dummy_anon_vma.was_unlinked = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function to remove all VMAs and destroy the maple tree associated with
|
|
|
|
* a virtual address space. Returns a count of VMAs in the tree.
|
|
|
|
*/
|
|
|
|
static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
int count = 0;
|
|
|
|
|
|
|
|
fail_prealloc = false;
|
|
|
|
reset_dummy_anon_vma();
|
|
|
|
|
|
|
|
vma_iter_set(vmi, 0);
|
|
|
|
for_each_vma(*vmi, vma) {
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
mtree_destroy(&mm->mm_mt);
|
|
|
|
mm->map_count = 0;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Helper function to determine if VMA has had vma_start_write() performed. */
|
|
|
|
static bool vma_write_started(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
int seq = vma->vm_lock_seq;
|
|
|
|
|
|
|
|
/* We reset after each check. */
|
2024-11-22 09:44:15 -08:00
|
|
|
vma->vm_lock_seq = UINT_MAX;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/* The vma_start_write() stub simply increments this value. */
|
|
|
|
return seq > -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Helper function providing a dummy vm_ops->close() method.
 *
 * Intentionally does nothing. The parameter is named (and explicitly
 * discarded) because omitting a parameter name in a function definition is
 * only valid from C23 onwards.
 */
static void dummy_close(struct vm_area_struct *vma)
{
	(void)vma;
}
|
|
|
|
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, when merging
unfaulted VMAs with faulted ones, we never do so with VMAs that have
multiple AVCs, because anon_vma locks are held across both parent and
child anon_vmas (actually, the 'root' parent anon_vma's lock is the one
used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
|
|
|
|
struct anon_vma_chain *avc,
|
|
|
|
struct anon_vma *anon_vma)
|
|
|
|
{
|
|
|
|
vma->anon_vma = anon_vma;
|
|
|
|
INIT_LIST_HEAD(&vma->anon_vma_chain);
|
|
|
|
list_add(&avc->same_vma, &vma->anon_vma_chain);
|
|
|
|
avc->anon_vma = vma->anon_vma;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
|
|
|
|
struct anon_vma_chain *avc)
|
|
|
|
{
|
|
|
|
__vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma);
|
|
|
|
}
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
static bool test_simple_merge(void)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-07-29 12:50:41 +01:00
|
|
|
struct mm_struct mm = {};
|
2025-06-18 20:42:53 +01:00
|
|
|
struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
|
|
|
|
struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vm_flags);
|
2024-07-29 12:50:41 +01:00
|
|
|
VMA_ITERATOR(vmi, &mm, 0x1000);
|
2024-08-30 19:10:14 +01:00
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
.start = 0x1000,
|
|
|
|
.end = 0x2000,
|
2025-06-18 20:42:53 +01:00
|
|
|
.vm_flags = vm_flags,
|
2024-08-30 19:10:14 +01:00
|
|
|
.pgoff = 1,
|
|
|
|
};
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
ASSERT_FALSE(attach_vma(&mm, vma_left));
|
|
|
|
ASSERT_FALSE(attach_vma(&mm, vma_right));
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
vma = merge_new(&vmg);
|
2024-07-29 12:50:41 +01:00
|
|
|
ASSERT_NE(vma, NULL);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x3000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
2025-06-18 20:42:53 +01:00
|
|
|
ASSERT_EQ(vma->vm_flags, vm_flags);
|
2024-07-29 12:50:41 +01:00
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-07-29 12:50:41 +01:00
|
|
|
mtree_destroy(&mm.mm_mt);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_simple_modify(void)
|
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-07-29 12:50:41 +01:00
|
|
|
struct mm_struct mm = {};
|
2025-06-18 20:42:53 +01:00
|
|
|
struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
|
2024-07-29 12:50:41 +01:00
|
|
|
VMA_ITERATOR(vmi, &mm, 0x1000);
|
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
ASSERT_FALSE(attach_vma(&mm, init_vma));
|
2024-07-29 12:50:41 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The flags will not be changed, the vma_modify_flags() function
|
|
|
|
* performs the merge/split only.
|
|
|
|
*/
|
|
|
|
vma = vma_modify_flags(&vmi, init_vma, init_vma,
|
|
|
|
0x1000, 0x2000, VM_READ | VM_MAYREAD);
|
|
|
|
ASSERT_NE(vma, NULL);
|
|
|
|
/* We modify the provided VMA, and on split allocate new VMAs. */
|
|
|
|
ASSERT_EQ(vma, init_vma);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x1000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x2000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now walk through the three split VMAs and make sure they are as
|
|
|
|
* expected.
|
|
|
|
*/
|
|
|
|
|
|
|
|
vma_iter_set(&vmi, 0);
|
|
|
|
vma = vma_iter_load(&vmi);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x1000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-07-29 12:50:41 +01:00
|
|
|
vma_iter_clear(&vmi);
|
|
|
|
|
|
|
|
vma = vma_next(&vmi);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x1000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x2000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 1);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-07-29 12:50:41 +01:00
|
|
|
vma_iter_clear(&vmi);
|
|
|
|
|
|
|
|
vma = vma_next(&vmi);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x2000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x3000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 2);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-07-29 12:50:41 +01:00
|
|
|
mtree_destroy(&mm.mm_mt);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_simple_expand(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-07-29 12:50:41 +01:00
|
|
|
struct mm_struct mm = {};
|
2025-06-18 20:42:53 +01:00
|
|
|
struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
|
2024-07-29 12:50:41 +01:00
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
2024-08-30 19:10:14 +01:00
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.vmi = &vmi,
|
2025-06-13 19:48:07 +01:00
|
|
|
.target = vma,
|
2024-08-30 19:10:14 +01:00
|
|
|
.start = 0,
|
|
|
|
.end = 0x3000,
|
|
|
|
.pgoff = 0,
|
|
|
|
};
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
ASSERT_FALSE(attach_vma(&mm, vma));
|
2024-07-29 12:50:41 +01:00
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_FALSE(expand_existing(&vmg));
|
2024-07-29 12:50:41 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x3000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-07-29 12:50:41 +01:00
|
|
|
mtree_destroy(&mm.mm_mt);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_simple_shrink(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-07-29 12:50:41 +01:00
|
|
|
struct mm_struct mm = {};
|
2025-06-18 20:42:53 +01:00
|
|
|
struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
|
2024-07-29 12:50:41 +01:00
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
|
2025-02-13 14:46:41 -08:00
|
|
|
ASSERT_FALSE(attach_vma(&mm, vma));
|
2024-07-29 12:50:41 +01:00
|
|
|
|
|
|
|
ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0));
|
|
|
|
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x1000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-07-29 12:50:41 +01:00
|
|
|
mtree_destroy(&mm.mm_mt);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2024-08-30 19:10:14 +01:00
|
|
|
static bool test_merge_new(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
|
|
|
struct anon_vma_chain dummy_anon_vma_chain_a = {
|
|
|
|
.anon_vma = &dummy_anon_vma,
|
|
|
|
};
|
|
|
|
struct anon_vma_chain dummy_anon_vma_chain_b = {
|
|
|
|
.anon_vma = &dummy_anon_vma,
|
|
|
|
};
|
|
|
|
struct anon_vma_chain dummy_anon_vma_chain_c = {
|
|
|
|
.anon_vma = &dummy_anon_vma,
|
|
|
|
};
|
|
|
|
struct anon_vma_chain dummy_anon_vma_chain_d = {
|
|
|
|
.anon_vma = &dummy_anon_vma,
|
|
|
|
};
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
const struct vm_operations_struct vm_ops = {
|
|
|
|
.close = dummy_close,
|
|
|
|
};
|
2024-08-30 19:10:14 +01:00
|
|
|
int count;
|
|
|
|
struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
|
|
|
|
bool merged;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 0123456789abc
|
|
|
|
* AA B CC
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_NE(vma_a, NULL);
|
|
|
|
/* We give each VMA a single avc so we can test anon_vma duplication. */
|
|
|
|
INIT_LIST_HEAD(&vma_a->anon_vma_chain);
|
|
|
|
list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain);
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_NE(vma_b, NULL);
|
|
|
|
INIT_LIST_HEAD(&vma_b->anon_vma_chain);
|
|
|
|
list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain);
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_NE(vma_c, NULL);
|
|
|
|
INIT_LIST_HEAD(&vma_c->anon_vma_chain);
|
|
|
|
list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NO merge.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AA B ** CC
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_NE(vma_d, NULL);
|
|
|
|
INIT_LIST_HEAD(&vma_d->anon_vma_chain);
|
|
|
|
list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain);
|
|
|
|
ASSERT_FALSE(merged);
|
|
|
|
ASSERT_EQ(mm.map_count, 4);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge BOTH sides.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AA*B DD CC
|
|
|
|
*/
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_a->vm_ops = &vm_ops; /* This should have no impact. */
|
2024-08-30 19:10:14 +01:00
|
|
|
vma_b->anon_vma = &dummy_anon_vma;
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma, vma_a);
|
|
|
|
/* Merge with A, delete B. */
|
|
|
|
ASSERT_TRUE(merged);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x4000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 3);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge to PREVIOUS VMA.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AAAA* DD CC
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma, vma_a);
|
|
|
|
/* Extend A. */
|
|
|
|
ASSERT_TRUE(merged);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x5000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 3);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge to NEXT VMA.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AAAAA *DD CC
|
|
|
|
*/
|
|
|
|
vma_d->anon_vma = &dummy_anon_vma;
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_d->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma, vma_d);
|
|
|
|
/* Prepend. */
|
|
|
|
ASSERT_TRUE(merged);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x6000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x9000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 6);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 3);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge BOTH sides.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AAAAA*DDD CC
|
|
|
|
*/
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma, vma_a);
|
|
|
|
/* Merge with A, delete D. */
|
|
|
|
ASSERT_TRUE(merged);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x9000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge to NEXT VMA.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AAAAAAAAA *CC
|
|
|
|
*/
|
|
|
|
vma_c->anon_vma = &dummy_anon_vma;
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma, vma_c);
|
|
|
|
/* Prepend C. */
|
|
|
|
ASSERT_TRUE(merged);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0xa000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0xc000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0xa);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge BOTH sides.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AAAAAAAAA*CCC
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vm_flags, &merged);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma, vma_a);
|
|
|
|
/* Extend A and delete C. */
|
|
|
|
ASSERT_TRUE(merged);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0xc000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Final state.
|
|
|
|
*
|
|
|
|
* 0123456789abc
|
|
|
|
* AAAAAAAAAAAAA
|
|
|
|
*/
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
vma_iter_set(&vmi, 0);
|
|
|
|
for_each_vma(vmi, vma) {
|
|
|
|
ASSERT_NE(vma, NULL);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0xc000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
|
|
|
|
|
mm: perform VMA allocation, freeing, duplication in mm
Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.
To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.
This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.
Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.
[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-28 16:28:17 +01:00
|
|
|
detach_free_vma(vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Should only have one VMA left (though freed) after all is done. */
|
|
|
|
ASSERT_EQ(count, 1);
|
|
|
|
|
|
|
|
mtree_destroy(&mm.mm_mt);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_vma_merge_special_flags(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
|
|
|
vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP };
|
|
|
|
vm_flags_t all_special_flags = 0;
|
|
|
|
int i;
|
|
|
|
struct vm_area_struct *vma_left, *vma;
|
|
|
|
|
|
|
|
/* Make sure there aren't new VM_SPECIAL flags. */
|
|
|
|
for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
|
|
|
|
all_special_flags |= special_flags[i];
|
|
|
|
}
|
|
|
|
ASSERT_EQ(all_special_flags, VM_SPECIAL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 01234
|
|
|
|
* AAA
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_NE(vma_left, NULL);
|
|
|
|
|
|
|
|
/* 1. Set up new VMA with special flag that would otherwise merge. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 01234
|
|
|
|
* AAA*
|
|
|
|
*
|
|
|
|
* This should merge if not for the VM_SPECIAL flag.
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x4000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
|
|
|
|
vm_flags_t special_flag = special_flags[i];
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_left->__vm_flags = vm_flags | special_flag;
|
|
|
|
vmg.vm_flags = vm_flags | special_flag;
|
2024-08-30 19:10:14 +01:00
|
|
|
vma = merge_new(&vmg);
|
|
|
|
ASSERT_EQ(vma, NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* 2. Modify VMA with special flag that would otherwise merge. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 01234
|
|
|
|
* AAAB
|
|
|
|
*
|
|
|
|
* Create a VMA to modify.
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_NE(vma, NULL);
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remains remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
propose to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
This cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
Int his patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
|
|
|
|
vm_flags_t special_flag = special_flags[i];
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_left->__vm_flags = vm_flags | special_flag;
|
|
|
|
vmg.vm_flags = vm_flags | special_flag;
|
2024-08-30 19:10:14 +01:00
|
|
|
vma = merge_existing(&vmg);
|
|
|
|
ASSERT_EQ(vma, NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_vma_merge_with_close(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
|
|
|
const struct vm_operations_struct vm_ops = {
|
|
|
|
.close = dummy_close,
|
|
|
|
};
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
struct vm_area_struct *vma_prev, *vma_next, *vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/*
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
* When merging VMAs we are not permitted to remove any VMA that has a
|
|
|
|
* vm_ops->close() hook.
|
2024-08-30 19:10:14 +01:00
|
|
|
*
|
|
|
|
* Considering the two possible adjacent VMAs to which a VMA can be
|
|
|
|
* merged:
|
|
|
|
*
|
|
|
|
* [ prev ][ vma ][ next ]
|
|
|
|
*
|
|
|
|
* In no case will we need to delete prev. If the operation is
|
|
|
|
* mergeable, then prev will be extended with one or both of vma and
|
|
|
|
* next deleted.
|
|
|
|
*
|
|
|
|
* As a result, during initial mergeability checks, only
|
|
|
|
* can_vma_merge_before() (which implies the VMA being merged with is
|
|
|
|
* 'next' as shown above) bothers to check to see whether the next VMA
|
|
|
|
* has a vm_ops->close() callback that will need to be called when
|
|
|
|
* removed.
|
|
|
|
*
|
|
|
|
* If it does, then we cannot merge as the resources that the close()
|
|
|
|
* operation potentially clears down are tied only to the existing VMA
|
|
|
|
* range and we have no way of extending those to the newly merged one.
|
|
|
|
*
|
|
|
|
* We must consider two scenarios:
|
|
|
|
*
|
|
|
|
* A.
|
|
|
|
*
|
|
|
|
* vm_ops->close: - - !NULL
|
|
|
|
* [ prev ][ vma ][ next ]
|
|
|
|
*
|
|
|
|
* Where prev may or may not be present/mergeable.
|
|
|
|
*
|
|
|
|
* This is picked up by a specific check in can_vma_merge_before().
|
|
|
|
*
|
|
|
|
* B.
|
|
|
|
*
|
|
|
|
* vm_ops->close: - !NULL
|
|
|
|
* [ prev ][ vma ]
|
|
|
|
*
|
|
|
|
* Where prev and vma are present and mergeable.
|
|
|
|
*
|
|
|
|
* This is picked up by a specific check in the modified VMA merge.
|
|
|
|
*
|
|
|
|
* IMPORTANT NOTE: We make the assumption that the following case:
|
|
|
|
*
|
|
|
|
* - !NULL NULL
|
|
|
|
* [ prev ][ vma ][ next ]
|
|
|
|
*
|
|
|
|
* Cannot occur, because vma->vm_ops being the same implies the same
|
|
|
|
* vma->vm_file, and therefore this would mean that next->vm_ops->close
|
|
|
|
* would be set too, and thus scenario A would pick this up.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
* The only case of a new VMA merge that results in a VMA being deleted
|
|
|
|
* is one where both the previous and next VMAs are merged - in this
|
|
|
|
* instance the next VMA is deleted, and the previous VMA is extended.
|
2024-08-30 19:10:14 +01:00
|
|
|
*
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
* If we are unable to do so, we reduce the operation to simply
|
|
|
|
* extending the prev VMA and not merging next.
|
|
|
|
*
|
|
|
|
* 0123456789
|
|
|
|
* PPP**NNNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPPNNN
|
2024-08-30 19:10:14 +01:00
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma_next->vm_ops = &vm_ops;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
ASSERT_EQ(merge_new(&vmg), vma_prev);
|
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x5000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When modifying an existing VMA there are further cases where we
|
|
|
|
* delete VMAs.
|
|
|
|
*
|
|
|
|
* <>
|
|
|
|
* 0123456789
|
|
|
|
* PPPVV
|
|
|
|
*
|
|
|
|
* In this instance, if vma has a close hook, the merge simply cannot
|
|
|
|
* proceed.
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma->vm_ops = &vm_ops;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The VMA being modified in a way that would otherwise merge should
|
|
|
|
* also fail.
|
|
|
|
*/
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs so that, rather than
ultimately using vma_merge(), we use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report errors,
allowing us to differentiate a failed merge from an inability to allocate
memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vmas may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This case is mirrored if merging with next.
|
2024-08-30 19:10:14 +01:00
|
|
|
*
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
* <>
|
|
|
|
* 0123456789
|
|
|
|
* VVNNNN
|
2024-08-30 19:10:14 +01:00
|
|
|
*
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
* In this instance, if vma has a close hook, the merge simply cannot
|
|
|
|
* proceed.
|
2024-08-30 19:10:14 +01:00
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma->vm_ops = &vm_ops;
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been handled by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
2024-08-30 19:10:21 +01:00
|
|
|
/*
|
|
|
|
* Initially this is misapprehended as an out of memory report, as the
|
|
|
|
* close() check is handled in the same way as anon_vma duplication
|
|
|
|
* failures, however a subsequent patch resolves this.
|
|
|
|
*/
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
|
|
|
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Finally, we consider two variants of the case where we modify a VMA
|
|
|
|
* to merge with both the previous and next VMAs.
|
|
|
|
*
|
|
|
|
* The first variant is where vma has a close hook. In this instance, no
|
|
|
|
* merge can proceed.
|
|
|
|
*
|
|
|
|
* <>
|
|
|
|
* 0123456789
|
|
|
|
* PPPVVNNNN
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma->vm_ops = &vm_ops;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
|
|
|
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The second variant is where next has a close hook. In this instance,
|
|
|
|
* we reduce the operation to a merge between prev and vma.
|
|
|
|
*
|
|
|
|
* <>
|
|
|
|
* 0123456789
|
|
|
|
* PPPVVNNNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPNNNN
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_next->vm_ops = &vm_ops;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x5000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
|
|
|
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_vma_merge_new_with_close(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
2025-06-18 20:42:53 +01:00
|
|
|
struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
|
|
|
|
struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
const struct vm_operations_struct vm_ops = {
|
|
|
|
.close = dummy_close,
|
|
|
|
};
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We should allow the partial merge of a proposed new VMA if the
|
|
|
|
* surrounding VMAs have vm_ops->close() hooks (but are otherwise
|
|
|
|
* compatible), e.g.:
|
|
|
|
*
|
|
|
|
* New VMA
|
|
|
|
* A v-------v B
|
|
|
|
* |-----| |-----|
|
|
|
|
* close close
|
|
|
|
*
|
|
|
|
* Since the rule is to not DELETE a VMA with a close operation, this
|
|
|
|
* should be permitted, only rather than expanding A and deleting B, we
|
|
|
|
* should simply expand A and leave B intact, e.g.:
|
|
|
|
*
|
|
|
|
* New VMA
|
|
|
|
* A B
|
|
|
|
* |------------||-----|
|
|
|
|
* close close
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Have prev and next have a vm_ops->close() hook. */
|
|
|
|
vma_prev->vm_ops = &vm_ops;
|
|
|
|
vma_next->vm_ops = &vm_ops;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x2000, 0x5000, 2, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma = merge_new(&vmg);
|
|
|
|
ASSERT_NE(vma, NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_extend() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged maybe compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x5000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma->vm_ops, &vm_ops);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 2);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_merge_existing(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vm_area_struct *vma, *vma_prev, *vma_next;
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
const struct vm_operations_struct vm_ops = {
|
|
|
|
.close = dummy_close,
|
|
|
|
};
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, but it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was paid to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
struct anon_vma_chain avc = {};
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge right case - partial span.
|
|
|
|
*
|
|
|
|
* <->
|
|
|
|
* 0123456789
|
|
|
|
* VVVVNNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* VNNNNNN
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_next->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remains remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion is the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, but it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was paid to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &avc);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_next);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs so that, rather than ultimately
using vma_merge(), we use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_next->vm_start, 0x3000);
|
|
|
|
ASSERT_EQ(vma_next->vm_end, 0x9000);
|
|
|
|
ASSERT_EQ(vma_next->vm_pgoff, 3);
|
|
|
|
ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x2000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x3000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 2);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_next));
|
|
|
|
ASSERT_EQ(mm.map_count, 2);
|
|
|
|
|
|
|
|
/* Clear down and reset. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge right case - full span.
|
|
|
|
*
|
|
|
|
* <-->
|
|
|
|
* 0123456789
|
|
|
|
* VVVVNNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* NNNNNNN
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_next->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma);
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &avc);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_next);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_next->vm_start, 0x2000);
|
|
|
|
ASSERT_EQ(vma_next->vm_end, 0x9000);
|
|
|
|
ASSERT_EQ(vma_next->vm_pgoff, 2);
|
|
|
|
ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_next));
|
|
|
|
ASSERT_EQ(mm.map_count, 1);
|
|
|
|
|
|
|
|
/* Clear down and reset. We should have deleted vma. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge left case - partial span.
|
|
|
|
*
|
|
|
|
* <->
|
|
|
|
* 0123456789
|
|
|
|
* PPPVVVV
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPPV
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &avc);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report
errors, allowing us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x6000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x6000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x7000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 6);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_prev));
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 2);
|
|
|
|
|
|
|
|
/* Clear down and reset. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge left case - full span.
|
|
|
|
*
|
|
|
|
* <-->
|
|
|
|
* 0123456789
|
|
|
|
* PPPVVVV
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPPP
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
|
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion is the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &avc);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x7000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_prev));
|
|
|
|
ASSERT_EQ(mm.map_count, 1);
|
|
|
|
|
|
|
|
/* Clear down and reset. We should have deleted vma. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge both case.
|
|
|
|
*
|
|
|
|
* <-->
|
|
|
|
* 0123456789
|
|
|
|
* PPPVVVVNNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPPPPPP
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
mm: rework vm_ops->close() handling on VMA merge
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix
vma_merge() case 7 with vma_ops->close") to account for a subtle case that
the previous commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine
whether we might be dealing with a VMA that might be removed, taking
advantage of the fact that a 'previous' VMA will never be deleted, only
VMAs that follow it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
And in addition both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning
we could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with
tests for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we never do so.
Link: https://lkml.kernel.org/r/9f96b8cfeef3d14afabddac3d6144afdfbef2e22.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:22 +01:00
|
|
|
vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
|
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remains remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &avc);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x9000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_prev));
|
|
|
|
ASSERT_EQ(mm.map_count, 1);
|
|
|
|
|
|
|
|
/* Clear down and reset. We should have deleted vma and next. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Non-merge ranges. The modified VMA merge operation assumes that the
|
|
|
|
* caller always specifies ranges within the input VMA so we need only
|
|
|
|
* examine these cases.
|
|
|
|
*
|
|
|
|
* -
|
|
|
|
* -
|
|
|
|
* -
|
|
|
|
* <->
|
|
|
|
* <>
|
|
|
|
* <>
|
|
|
|
* 0123456789a
|
|
|
|
* PPPVVVVVNNN
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remains remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x6000, 0x7000, 6, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x4000, 0x7000, 4, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x4000, 0x6000, 4, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report
errors, allowing us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report
errors, allowing us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_anon_vma_non_mergeable(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vm_area_struct *vma, *vma_prev, *vma_next;
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymously mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma lock's are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
struct anon_vma_chain dummy_anon_vma_chain_1 = {};
|
|
|
|
struct anon_vma_chain dummy_anon_vma_chain_2 = {};
|
|
|
|
struct anon_vma dummy_anon_vma_2;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* In the case of modified VMA merge, merging both left and right VMAs
|
|
|
|
* but where prev and next have incompatible anon_vma objects, we revert
|
|
|
|
* to a merge of prev and VMA:
|
|
|
|
*
|
|
|
|
* <-->
|
|
|
|
* 0123456789
|
|
|
|
* PPPVVVVNNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPPPNNN
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Give both prev and next single anon_vma_chain fields, so they will
|
|
|
|
* merge with the NULL vmg->anon_vma.
|
|
|
|
*
|
|
|
|
* However, when prev is compared to next, the merge should fail.
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymously mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct the userland VMA tests, which previously did not
assert these cases correctly, so that they now do so, and ensure
vmg->anon_vma state is always consistent to account for the newly
introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
|
|
|
|
__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs, so that these use
vma_expand() rather than ultimately using vma_merge().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x7000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_prev));
|
|
|
|
ASSERT_FALSE(vma_write_started(vma_next));
|
|
|
|
|
|
|
|
/* Clear down and reset. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now consider the new VMA case. This is equivalent, only adding a new
|
|
|
|
* VMA in a gap between prev and next.
|
|
|
|
*
|
|
|
|
* <-->
|
|
|
|
* 0123456789
|
|
|
|
* PPP****NNN
|
|
|
|
* ->
|
|
|
|
* 0123456789
|
|
|
|
* PPPPPPPNNN
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct the userland VMA tests, which previously did not
assert these cases correctly, so that they now do so, and ensure
vmg->anon_vma state is always consistent to account for the newly
introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
|
|
|
|
__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct the userland VMA tests, which previously did not
assert these cases correctly, so that they now do so, and ensure
vmg->anon_vma state is always consistent to account for the newly
introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vmg.anon_vma = NULL;
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(merge_new(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs, so that these use
vma_expand() rather than ultimately using vma_merge().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x7000);
|
|
|
|
ASSERT_EQ(vma_prev->vm_pgoff, 0);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma_prev));
|
|
|
|
ASSERT_FALSE(vma_write_started(vma_next));
|
|
|
|
|
|
|
|
/* Final cleanup. */
|
|
|
|
ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_dup_anon_vma(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
|
|
|
struct anon_vma_chain dummy_anon_vma_chain = {
|
|
|
|
.anon_vma = &dummy_anon_vma,
|
|
|
|
};
|
|
|
|
struct vm_area_struct *vma_prev, *vma_next, *vma;
|
|
|
|
|
|
|
|
reset_dummy_anon_vma();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Expanding a VMA to delete the next one duplicates next's anon_vma and
|
|
|
|
* assigns it to the expanded VMA.
|
|
|
|
*
|
|
|
|
* This covers new VMA merging, as these operations amount to a VMA
|
|
|
|
* expand.
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma_next->anon_vma = &dummy_anon_vma;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0, 0x5000, 0, vm_flags);
|
2025-06-13 19:48:07 +01:00
|
|
|
vmg.target = vma_prev;
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.next = vma_next;
|
|
|
|
|
|
|
|
ASSERT_EQ(expand_existing(&vmg), 0);
|
|
|
|
|
|
|
|
/* Will have been cloned. */
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
|
|
|
|
|
|
|
|
/* Cleanup ready for next run. */
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* next has anon_vma, we assign to prev.
|
|
|
|
*
|
|
|
|
* |<----->|
|
|
|
|
* |-------*********-------|
|
|
|
|
* prev vma next
|
|
|
|
* extend delete delete
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/* Initialise avc so mergeability check passes. */
|
|
|
|
INIT_LIST_HEAD(&vma_next->anon_vma_chain);
|
|
|
|
list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain);
|
|
|
|
|
|
|
|
vma_next->anon_vma = &dummy_anon_vma;
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion is the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report
errors, allowing us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x8000);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vma has anon_vma, we assign to prev.
|
|
|
|
*
|
|
|
|
* |<----->|
|
|
|
|
* |-------*********-------|
|
|
|
|
* prev vma next
|
|
|
|
* extend delete delete
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns led to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vmg.anon_vma = &dummy_anon_vma;
|
|
|
|
vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state and to report
errors, allowing us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x8000);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vma has anon_vma, we assign to prev.
|
|
|
|
*
|
|
|
|
* |<----->|
|
|
|
|
* |-------*************
|
|
|
|
* prev vma
|
|
|
|
* extend shrink/delete
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns led to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_prev);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_prev->vm_end, 0x5000);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vma has anon_vma, we assign to next.
|
|
|
|
*
|
|
|
|
* |<----->|
|
|
|
|
* *************-------|
|
|
|
|
* vma next
|
|
|
|
* shrink/delete extend
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter along side a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), vma_next);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma_next->vm_start, 0x3000);
|
|
|
|
ASSERT_EQ(vma_next->vm_end, 0x8000);
|
|
|
|
|
|
|
|
ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(vma_next->anon_vma->was_cloned);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_vmi_prealloc_fail(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vma_merge_struct vmg = {
|
|
|
|
.mm = &mm,
|
|
|
|
.vmi = &vmi,
|
|
|
|
};
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was made to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
struct anon_vma_chain avc = {};
|
2024-08-30 19:10:14 +01:00
|
|
|
struct vm_area_struct *vma_prev, *vma;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are merging vma into prev, with vma possessing an anon_vma, which
|
|
|
|
* will be duplicated. We cause the vmi preallocation to fail and assert
|
|
|
|
* the duplicated anon_vma is unlinked.
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma->anon_vma = &dummy_anon_vma;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vm_flags, &dummy_anon_vma);
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.prev = vma_prev;
|
mm: simplify vma merge structure and expand comments
Patch series "mm: further simplify VMA merge operation", v3.
While significant efforts have been made to improve the VMA merge
operation, there remain remnants of the bad (or rather confusing) old
days, which make the code difficult to understand, more bug prone and thus
harder to modify.
This series attempts to significantly improve matters in a number of
respects - with a focus on simplifying the commit_merge() function which
actually actions the merge operation - and importantly, adjusting the two
most confusing merge cases - those in which we 'adjust' the VMA
immediately adjacent to the one being merged.
One source of confusion are the VMAs being threaded through the operation
themselves - vmg->prev, vmg->vma and vmg->next.
At the start of the operation, vmg->vma is either NULL if a new VMA is
proposed to be added, or if not then a pointer to an existing VMA being
modified, and prev/next are (perhaps not present) VMAs sat immediately
before and after the range specified in vmg->start, end, respectively.
However, during the VMA merge operation, we change vmg->start, end and
pgoff to span the newly merged range and vmg->vma to either be:
a. The ultimately returned VMA (in most cases) or b. A VMA which we will
manipulate, but ultimately instead return vmg->next.
Case b. especially here is confusing for somebody reading this code, but
the fact we update this state, along with vmg->start, end, pgoff only
makes matters worse.
We simplify things by replacing vmg->vma with vmg->middle and never
changing it - this is always either NULL (for a new VMA) or the VMA being
modified between vmg->prev and vmg->next.
We further simplify by placing the merged VMA in a new vmg->target field -
whether case b. above is the case or not. The reader of the code can now
simply rely on vmg->middle being the middle VMA and vmg->target being the
ultimately merged VMA.
We additionally tackle the confusing cases where we 'adjust' VMAs other
than the one we ultimately return as the merged VMA (this includes case b.
above). These are:
(1)
merge
<----------->
|------||--------| |------------|---|
| prev || middle | -> | target | m |
|------||--------| |------------|---|
In which case middle must be adjusted so middle->vm_start is increased as
well as performing the merge.
(2) (equivalent to case b. above)
<------------->
|---------||------| |---|-------------|
| middle || next | -> | m | target |
|---------||------| |---|-------------|
In which case next must be adjusted so next->vm_start is decreased as well
as performing the merge.
These cases have previously been performed by calculating and passing
around a dubious and confusing 'adj_start' parameter alongside a pointer
to an 'adjust' VMA indicating which VMA requires additional adjustment
(middle in case 1 and next in case 2).
With the VMG structure in place we are able to avoid this by simply
setting a merge flag to describe each case:
(1) Sets the vmg->__adjust_middle_start flag
(2) Sets the vmg->__adjust_next_start flag
By doing so it turns out we can vastly simplify the logic and calculate
what is required to perform the operation.
Taken together the refactorings make it far easier to understand what is
being done even in these more confusing cases, make the code far more
maintainable, debuggable, and testable, providing more internal state
indicating what is happening in the merge operation.
The changes have no functional net impact on the merge operation and
everything should still behave as it did before.
This patch (of 5):
The merge code, while much improved, still has a number of points of
confusion. As part of a broader series cleaning this up to make this more
maintainable, we start by addressing some confusion around
vma_merge_struct fields.
So far, the caller either provides no vmg->vma (a new VMA) or supplies the
existing VMA which is being altered, setting vmg->start,end,pgoff to the
proposed VMA dimensions.
vmg->vma is then updated, as are vmg->start,end,pgoff as the merge process
proceeds and the appropriate merge strategy is determined.
This is rather confusing, as vmg->vma starts off as the 'middle' VMA
between vmg->prev,next, but becomes the 'target' VMA, except in one
specific edge case (merge next, shrink middle).
In this patch we introduce vmg->middle to describe the VMA that is between
vmg->prev and vmg->next, and does NOT change during the merge operation.
We replace vmg->vma with vmg->target, and use this only during the merge
operation itself.
Aside from the merge right, shrink middle case, this becomes the VMA that
forms the basis of the VMA that is returned. This edge case can be
addressed in a future commit.
We also add a number of comments to explain what is going on.
Finally, we adjust the ASCII diagrams showing each merge case in
vma_merge_existing_range() to be clearer - the arrow range previously
showed the vmg->start, end spanned area, but it is clearer to change this
to show the final merged VMA.
This patch has no change in functional behaviour.
Link: https://lkml.kernel.org/r/cover.1738326519.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/4dfe60f1419d55e5d0516f56349695d73a57184c.1738326519.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-31 12:31:49 +00:00
|
|
|
vmg.middle = vma;
|
mm/vma: fix incorrectly disallowed anonymous VMA merges
Patch series "fix incorrectly disallowed anonymous VMA merges", v2.
It appears that we have been incorrectly rejecting merge cases for 15
years, apparently by mistake.
Imagine a range of anonymous mapped memory divided into two VMAs like
this, with incompatible protection bits:
RW RWX
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
mprotect(RW)
Now imagine mprotect()'ing vma so it is RW. This appears as if it should
merge, it does not.
Neither does this case, again mprotect()'ing vma RW:
RWX RW
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
mprotect(RW)
Nor:
RW RWX RW
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
mprotect(RW)
What's going on here?
In commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process
server scalability issue"), from 2010, Rik van Riel took great care to
account for these cases - commenting that '[this is] easily overlooked:
when mprotect shifts the boundary, make sure the expanding vma has
anon_vma set if the shrinking vma had, to cover any anon pages imported.'
However, commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
introduced a little over a year later, appears to have accidentally
disallowed this.
By adjusting the is_mergeable_anon_vma() function to avoid lock contention
across large trees of forked anon_vma's, this commit wrongly assumed the
VMA being checked (the ostensible merge 'target') should be faulted, that
is, have an anon_vma, and thus an anon_vma_chain list established, but
only of length 1.
This appears to have been unintentional, as disallowing empty target VMAs
like this across the board makes no sense.
We already have logic that accounts for this case, the same logic Rik
introduced in 2010, now via dup_anon_vma() (and ultimately
anon_vma_clone()), so there is no problem permitting this.
This series fixes this mistake and also ensures that scalability concerns
remain addressed by explicitly checking that whatever VMA is being merged
has not been forked.
A full set of self tests which reproduce the issue are provided, as well
as updating userland VMA tests to assert this behaviour.
The self tests additionally assert scalability concerns are addressed.
This patch (of 3):
anon_vma_chain's were introduced by Rik van Riel in commit 5beb49305251
("mm: change anon_vma linking to fix multi-process server scalability
issue").
This patch was introduced in March 2010. As part of this change, careful
attention was paid to the instance of mprotect() causing a VMA merge, with
one faulted (i.e. having anon_vma set) and another not:
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
In the modern VMA code, this is handled in dup_anon_vma() (and ultimately
anon_vma_clone()).
This case is one of the three configurations of adjacent VMA anon_vma
state that we might encounter on merge (where dst is the VMA which will be
merged into and src the one being merged into dst):
1. dst->anon_vma, src->anon_vma - These must be equal, no-op.
2. dst->anon_vma, !src->anon_vma - We simply use dst->anon_vma, no-op.
3. !dst->anon_vma, src->anon_vma - The case in question here.
In case 3, the instance addressed here - we duplicate the AVC connections
from src and place into dst.
However, in practice, we very often do NOT do this.
This appears to be due to an inadvertent consequence of the change
introduced by commit 965f55dea0e3 ("mmap: avoid merging cloned VMAs"),
introduced in May 2011.
This implies that this merge case was functional only for a little over a
year, and has since been broken for ~15 years.
Here, lock scalability concerns lead to us restricting anonymous merges
only to those VMAs with 1 entry in their vma->anon_vma_chain, that is, a
VMA that is not connected to any parent process's anon_vma.
The mergeability test looks like this:
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
if ((!anon_vma1 || !anon_vma2) && (!vma ||
!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)))
return true;
return anon_vma1 == anon_vma2;
}
However, we have a problem here - typically the vma passed here is the
destination VMA.
For instance in vma_merge_existing_range() we invoke:
can_vma_merge_left()
-> [ check that there is an immediately adjacent prior VMA ]
-> can_vma_merge_after()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], prev->anon_vma, prev)
So if we were considering a target unfaulted 'prev':
unfaulted faulted
|-----------|-----------|
| prev | vma |
|-----------|-----------|
This would call is_mergeable_anon_vma(NULL, vma->anon_vma, prev).
The list_is_singular() check for vma->anon_vma_chain, an empty list on
fault, would cause this merge to _fail_ even though all else indicates a
merge.
Equally a simple merge into a next VMA would hit the same problem:
faulted unfaulted
|-----------|-----------|
| vma | next |
|-----------|-----------|
can_vma_merge_right()
-> [ check that there is an immediately adjacent succeeding VMA ]
-> can_vma_merge_before()
-> is_mergeable_vma() for general attribute check
-> is_mergeable_anon_vma([ proposed anon_vma ], next->anon_vma, next)
For a 3-way merge, we'd also hit the same problem if it was configured like
this for instance:
unfaulted faulted unfaulted
|-----------|-----------|-----------|
| prev | vma | next |
|-----------|-----------|-----------|
As we'd call can_vma_merge_left() for prev, and can_vma_merge_right() for
next, both of which would fail.
vma_merge_new_range() (and relatedly, vma_expand()) are not impacted, as
the new VMA would never already be faulted (it is a proposed new range).
Because we already handle each of the aforementioned merge cases, and can
absolutely therefore deal with an existing VMA merge with !dst->anon_vma,
src->anon_vma, there is absolutely no reason to disallow this kind of
merge.
It seems that the intention of this patch is to ensure that, in the
instance of merging unfaulted VMAs with faulted ones, we never wish to do
so with those with multiple AVCs due to the fact that anon_vma locks are
held across both parent and child anon_vma's (actually, the 'root' parent
anon_vma's lock is used).
In fact, the original commit alludes to this - "find_mergeable_anon_vma()
already considers this case".
In find_mergeable_anon_vma() however, we check the anon_vma which will be
merged from, if it is set, then we check
list_is_singular(vma->anon_vma_chain).
So to match this logic, update is_mergeable_anon_vma() to perform this
scalability check on the VMA whose anon_vma we ultimately merge into.
This matches existing behaviour with forked VMAs, only we no longer
wrongly disallow ALL empty target merges.
So we both allow merge cases and ensure the scalability check is correctly
applied.
We may wish to revisit these lock scalability concerns at a later date and
ensure they are still valid.
Additionally, correct userland VMA tests which were mistakenly not
asserting these cases correctly previously to now correctly assert this,
and to ensure vmg->anon_vma state is always consistent to account for
newly introduced asserts.
Link: https://lkml.kernel.org/r/cover.1744104124.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/18c756fc9eaf7ad082a710c91133b8346f8cd9a8.1744104124.git.lorenzo.stoakes@oracle.com
Fixes: 965f55dea0e3 ("mmap: avoid merging cloned VMAs")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-04-08 10:29:31 +01:00
|
|
|
vma_set_dummy_anon_vma(vma, &avc);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
fail_prealloc = true;
|
|
|
|
|
|
|
|
/* This will cause the merge to fail. */
|
|
|
|
ASSERT_EQ(merge_existing(&vmg), NULL);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and be able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
|
2024-08-30 19:10:14 +01:00
|
|
|
/* We will already have assigned the anon_vma. */
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
/* And it was both cloned and unlinked. */
|
|
|
|
ASSERT_TRUE(dummy_anon_vma.was_cloned);
|
|
|
|
ASSERT_TRUE(dummy_anon_vma.was_unlinked);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi); /* Resets fail_prealloc too. */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We repeat the same operation for expanding a VMA, which is what new
|
|
|
|
* VMA merging ultimately uses too. This asserts that unlinking is
|
|
|
|
* performed in this case too.
|
|
|
|
*/
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
|
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma->anon_vma = &dummy_anon_vma;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vmg_set_range(&vmg, 0, 0x5000, 3, vm_flags);
|
2025-06-13 19:48:07 +01:00
|
|
|
vmg.target = vma_prev;
|
2024-08-30 19:10:14 +01:00
|
|
|
vmg.next = vma;
|
|
|
|
|
|
|
|
fail_prealloc = true;
|
|
|
|
ASSERT_EQ(expand_existing(&vmg), -ENOMEM);
|
mm: avoid using vma_merge() for new VMAs
Abstract vma_merge_new_vma() to use vma_merge_struct and rename the
resultant function vma_merge_new_range() to be clear what the purpose of
this function is - a new VMA is desired in the specified range, and we
wish to see if it is possible to 'merge' surrounding VMAs into this range
rather than having to allocate a new VMA.
Note that this function uses vma_expand() exclusively, so adopts its
requirement that the iterator point at or before the gap. We add an
assert to this effect.
This is as opposed to vma_merge_existing_range(), which will be introduced
in a subsequent commit, and provide the same functionality for cases in
which we are modifying an existing VMA.
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs to, rather than ultimately
use vma_merge(), rather use vma_expand().
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
Also add the ability for the vmg to track state, and be able to report
errors, allowing for us to differentiate a failed merge from an inability
to allocate memory in callers.
This makes it far easier to understand what is happening in these cases
avoiding confusion, bugs and allowing for future optimisation.
Also introduce vma_iter_next_rewind() to allow for retrieval of the next,
and (optionally) the prev VMA, rewinding to the start of the previous gap.
Introduce are_anon_vmas_compatible() to abstract individual VMA anon_vma
comparison for the case of merging on both sides where the anon_vma of the
VMA being merged may be compatible with prev and next, but prev and next's
anon_vma's may not be compatible with each other.
Finally also introduce can_vma_merge_left() / can_vma_merge_right() to
check adjacent VMA compatibility and that they are indeed adjacent.
Link: https://lkml.kernel.org/r/49d37c0769b6b9dc03b27fe4d059173832556392.1725040657.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Mark Brown <broonie@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-08-30 19:10:18 +01:00
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
|
|
|
|
ASSERT_TRUE(dummy_anon_vma.was_cloned);
|
|
|
|
ASSERT_TRUE(dummy_anon_vma.was_unlinked);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_merge_extend(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0x1000);
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vm_flags);
|
|
|
|
alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Extend a VMA into the gap between itself and the following VMA.
|
|
|
|
* This should result in a merge.
|
|
|
|
*
|
|
|
|
* <->
|
|
|
|
* * *
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
ASSERT_EQ(vma_merge_extend(&vmi, vma, 0x2000), vma);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x4000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(mm.map_count, 1);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool test_copy_vma(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-08-30 19:10:14 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
bool need_locks = false;
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vm_area_struct *vma, *vma_new, *vma_next;
|
|
|
|
|
|
|
|
/* Move backwards and do not merge. */
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks);
|
|
|
|
ASSERT_NE(vma_new, vma);
|
|
|
|
ASSERT_EQ(vma_new->vm_start, 0);
|
|
|
|
ASSERT_EQ(vma_new->vm_end, 0x2000);
|
|
|
|
ASSERT_EQ(vma_new->vm_pgoff, 0);
|
2025-02-13 14:46:41 -08:00
|
|
|
vma_assert_attached(vma_new);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
|
|
|
|
/* Move a VMA into position next to another and merge the two. */
|
|
|
|
|
2025-06-18 20:42:53 +01:00
|
|
|
vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
|
|
|
|
vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags);
|
2024-08-30 19:10:14 +01:00
|
|
|
vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks);
|
2025-02-13 14:46:41 -08:00
|
|
|
vma_assert_attached(vma_new);
|
2024-08-30 19:10:14 +01:00
|
|
|
|
|
|
|
ASSERT_EQ(vma_new, vma_next);
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2024-10-17 15:31:46 +01:00
|
|
|
static bool test_expand_only_mode(void)
|
|
|
|
{
|
2025-06-18 20:42:53 +01:00
|
|
|
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
|
2024-10-17 15:31:46 +01:00
|
|
|
struct mm_struct mm = {};
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
struct vm_area_struct *vma_prev, *vma;
|
2025-06-18 20:42:53 +01:00
|
|
|
VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vm_flags, 5);
|
2024-10-17 15:31:46 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Place a VMA prior to the one we're expanding so we assert that we do
|
|
|
|
* not erroneously try to traverse to the previous VMA even though we
|
2025-01-31 12:31:50 +00:00
|
|
|
* have, through the use of the just_expand flag, indicated we do not
|
2024-10-17 15:31:46 +01:00
|
|
|
* need to do so.
|
|
|
|
*/
|
2025-06-18 20:42:53 +01:00
|
|
|
alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
|
2024-10-17 15:31:46 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We will be positioned at the prev VMA, but looking to expand to
|
|
|
|
* 0x9000.
|
|
|
|
*/
|
|
|
|
vma_iter_set(&vmi, 0x3000);
|
2025-06-18 20:42:53 +01:00
|
|
|
vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
|
2024-10-17 15:31:46 +01:00
|
|
|
vmg.prev = vma_prev;
|
2025-01-31 12:31:50 +00:00
|
|
|
vmg.just_expand = true;
|
2024-10-17 15:31:46 +01:00
|
|
|
|
|
|
|
vma = vma_merge_new_range(&vmg);
|
|
|
|
ASSERT_NE(vma, NULL);
|
|
|
|
ASSERT_EQ(vma, vma_prev);
|
|
|
|
ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
|
|
|
|
ASSERT_EQ(vma->vm_start, 0x3000);
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x9000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 3);
|
|
|
|
ASSERT_TRUE(vma_write_started(vma));
|
|
|
|
ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
|
2025-02-13 14:46:41 -08:00
|
|
|
vma_assert_attached(vma);
|
2024-10-17 15:31:46 +01:00
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2024-12-13 16:24:09 +00:00
|
|
|
static bool test_mmap_region_basic(void)
|
|
|
|
{
|
|
|
|
struct mm_struct mm = {};
|
|
|
|
unsigned long addr;
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
VMA_ITERATOR(vmi, &mm, 0);
|
|
|
|
|
|
|
|
current->mm = &mm;
|
|
|
|
|
|
|
|
/* Map at 0x300000, length 0x3000. */
|
|
|
|
addr = __mmap_region(NULL, 0x300000, 0x3000,
|
|
|
|
VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
|
|
|
|
0x300, NULL);
|
|
|
|
ASSERT_EQ(addr, 0x300000);
|
|
|
|
|
|
|
|
/* Map at 0x250000, length 0x3000. */
|
|
|
|
addr = __mmap_region(NULL, 0x250000, 0x3000,
|
|
|
|
VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
|
|
|
|
0x250, NULL);
|
|
|
|
ASSERT_EQ(addr, 0x250000);
|
|
|
|
|
|
|
|
/* Map at 0x303000, merging to 0x300000 of length 0x6000. */
|
|
|
|
addr = __mmap_region(NULL, 0x303000, 0x3000,
|
|
|
|
VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
|
|
|
|
0x303, NULL);
|
|
|
|
ASSERT_EQ(addr, 0x303000);
|
|
|
|
|
|
|
|
/* Map at 0x24d000, merging to 0x250000 of length 0x6000. */
|
|
|
|
addr = __mmap_region(NULL, 0x24d000, 0x3000,
|
|
|
|
VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
|
|
|
|
0x24d, NULL);
|
|
|
|
ASSERT_EQ(addr, 0x24d000);
|
|
|
|
|
|
|
|
ASSERT_EQ(mm.map_count, 2);
|
|
|
|
|
|
|
|
for_each_vma(vmi, vma) {
|
|
|
|
if (vma->vm_start == 0x300000) {
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x306000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0x300);
|
|
|
|
} else if (vma->vm_start == 0x24d000) {
|
|
|
|
ASSERT_EQ(vma->vm_end, 0x253000);
|
|
|
|
ASSERT_EQ(vma->vm_pgoff, 0x24d);
|
|
|
|
} else {
|
|
|
|
ASSERT_FALSE(true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cleanup_mm(&mm, &vmi);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2024-07-29 12:50:41 +01:00
|
|
|
/*
 * Test runner entry point.
 *
 * Initialises the maple tree and VMA state, runs every test in sequence,
 * reports each failing test on stderr, prints a summary on stdout, and exits
 * with EXIT_SUCCESS only if no test failed.
 */
int main(void)
{
	int total = 0;
	int failed = 0;

	maple_tree_init();
	vma_state_init();

/*
 * Run test_<name>(), counting it and, if it returns false, recording and
 * reporting the failure.
 */
#define TEST(name)							\
	do {								\
		++total;						\
		if (test_##name())					\
			break;						\
		++failed;						\
		fprintf(stderr, "Test " #name " FAILED\n");		\
	} while (0)

	/* Simple sanity checks first. */
	TEST(simple_merge);
	TEST(simple_modify);
	TEST(simple_expand);
	TEST(simple_shrink);

	TEST(merge_new);
	TEST(vma_merge_special_flags);
	TEST(vma_merge_with_close);
	TEST(vma_merge_new_with_close);
	TEST(merge_existing);
	TEST(anon_vma_non_mergeable);
	TEST(dup_anon_vma);
	TEST(vmi_prealloc_fail);
	TEST(merge_extend);
	TEST(copy_vma);
	TEST(expand_only_mode);

	TEST(mmap_region_basic);

#undef TEST

	printf("%d tests run, %d passed, %d failed.\n",
	       total, total - failed, failed);

	if (failed != 0)
		return EXIT_FAILURE;

	return EXIT_SUCCESS;
}
|