linux/mm/vma_exec.c

// SPDX-License-Identifier: GPL-2.0-only

/*
 * Functions explicitly implemented for exec functionality which however are
 * explicitly VMA-only logic.
 */

#include "vma_internal.h"
#include "vma.h"

/*
 * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
 * this VMA and its relocated range, which will now reside at [vma->vm_start -
 * shift, vma->vm_end - shift).
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable temporary stack relocation.
 *
 * @vma:   the VMA to move; the mm operated on is vma->vm_mm.
 * @shift: distance, in bytes, to move the VMA towards lower addresses.
 *
 * Returns -EFAULT if the VMA returned by vma_next() on an iterator starting at
 * the new start address is not @vma, -ENOMEM if vma_expand() fails or
 * move_page_tables() returns anything other than the full VMA length, and
 * otherwise whatever vma_shrink() returns.
 */
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
{
	/*
	 * The process proceeds as follows:
	 *
	 * 1) Use shift to calculate the new vma endpoints.
	 * 2) Extend vma to cover both the old and new ranges.  This ensures the
	 *    arguments passed to subsequent functions are consistent.
	 * 3) Move vma's page tables to the new range.
	 * 4) Free up any cleared pgd range.
	 * 5) Shrink the vma to cover only the new range.
	 */

	struct mm_struct *mm = vma->vm_mm;
	unsigned long old_start = vma->vm_start;
	unsigned long old_end = vma->vm_end;
	unsigned long length = old_end - old_start;
	unsigned long new_start = old_start - shift;
	unsigned long new_end = old_end - shift;
	VMA_ITERATOR(vmi, mm, new_start);
	VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
	struct vm_area_struct *next;
	struct mmu_gather tlb;
	PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);

	/*
	 * Both subtractions above are unsigned. Given old_start <= old_end,
	 * new_start can only exceed new_end when shift is larger than
	 * old_start but not larger than old_end, i.e. new_start wrapped
	 * around while new_end did not.
	 */
	BUG_ON(new_start > new_end);

	/*
	 * Ensure there are no VMAs between where we want to go
	 * and where we are.
	 */
	if (vma != vma_next(&vmi))
		return -EFAULT;

	/*
	 * vma_next() above moved the iterator; step it back before expanding.
	 * NOTE(review): presumably this leaves vmi positioned for
	 * vma_expand() - confirm against the iterator helpers.
	 */
	vma_iter_prev_range(&vmi);
	/*
	 * Cover the whole range: [new_start, old_end). vmg was initialised
	 * with exactly these bounds; vma is the VMA being grown.
	 */
	vmg.target = vma;
	if (vma_expand(&vmg))
		return -ENOMEM;

	/*
	 * Move the page tables downwards, on failure we rely on
	 * process cleanup to remove whatever mess we made.
	 *
	 * pmc describes a move of length bytes from old_start to new_start
	 * within this same vma; anything other than length coming back from
	 * move_page_tables() is treated as failure.
	 */
	pmc.for_stack = true;
	if (length != move_page_tables(&pmc))
		return -ENOMEM;

	tlb_gather_mmu(&tlb, mm);
	/*
	 * The VMA after this one, if there is one, supplies the final
	 * argument to free_pgd_range(); with no following VMA,
	 * USER_PGTABLES_CEILING is used instead.
	 */
	next = vma_next(&vmi);
	if (new_end > old_start) {
		/*
		 * When the old and new regions overlap clear from new_end.
		 */
		free_pgd_range(&tlb, new_end, old_end, new_end,
			next ? next->vm_start : USER_PGTABLES_CEILING);
	} else {
		/*
		 * Otherwise, clean from old_start; this is done to not touch
		 * the address space in [new_end, old_start). Some architectures
		 * have constraints on va-space that make this illegal (IA64) -
		 * for the others it's just a little faster.
		 */
		free_pgd_range(&tlb, old_start, old_end, new_end,
			next ? next->vm_start : USER_PGTABLES_CEILING);
	}
	tlb_finish_mmu(&tlb);

	/*
	 * Step the iterator back again after the vma_next() call above.
	 * NOTE(review): presumably required so vmi refers to vma when
	 * vma_shrink() runs - confirm.
	 */
	vma_prev(&vmi);
	/* Shrink the vma to just the new range */
	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}

/*
 * Set up the initial stack VMA for a process being execve'd. The VMA is one
 * page long and is parked, for now, directly below STACK_TOP_MAX; it is moved
 * down to its final position later by relocate_vma_down().
 *
 * Outside of early exec initialisation this is almost certainly NOT the
 * function you are looking for.
 *
 * @mm:        the mm that receives the stack VMA.
 * @vmap:      output; set to the new VMA on success. Set to NULL on every
 *             failure path except the initial allocation failure, which
 *             returns without writing to it.
 * @top_mem_p: output; on success set to the highest address in the stack that
 *             can still hold a pointer-sized word (vm_end - sizeof(void *)).
 *
 * Returns 0 on success, -ENOMEM if the VMA cannot be allocated, -EINTR if
 * taking the mmap write lock is interrupted, or the error reported by
 * ksm_execve() or insert_vm_struct().
 */
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
			  unsigned long *top_mem_p)
{
	struct vm_area_struct *stack;
	int ret;

	stack = vm_area_alloc(mm);
	if (!stack)
		return -ENOMEM;

	vma_set_anonymous(stack);

	if (mmap_write_lock_killable(mm)) {
		ret = -EINTR;
		goto out_free;
	}

	/*
	 * ksm_execve() has to run under the mmap write lock so that it
	 * cannot race with ksmd.
	 */
	ret = ksm_execve(mm);
	if (ret)
		goto out_unlock;

	/*
	 * Use the highest stack address the architecture allows; the VMA is
	 * shifted to its proper location afterwards. STACK_TOP is avoided
	 * here since it may depend on attributes that are not set up yet.
	 */
	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
	stack->vm_end = STACK_TOP_MAX;
	stack->vm_start = stack->vm_end - PAGE_SIZE;
	vm_flags_init(stack,
		      VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
	stack->vm_page_prot = vm_get_page_prot(stack->vm_flags);

	ret = insert_vm_struct(mm, stack);
	if (ret) {
		/* Undo ksm_execve() before dropping the lock. */
		ksm_exit(mm);
		goto out_unlock;
	}

	/* The mm now accounts for exactly one page, and it is stack. */
	mm->total_vm = 1;
	mm->stack_vm = 1;
	mmap_write_unlock(mm);

	*vmap = stack;
	*top_mem_p = stack->vm_end - sizeof(void *);
	return 0;

out_unlock:
	mmap_write_unlock(mm);
out_free:
	*vmap = NULL;
	vm_area_free(stack);
	return ret;
}
mm: establish mm/vma_exec.c for shared exec/mm VMA functionality Patch series "move all VMA allocation, freeing and duplication logic to mm", v3. Currently VMA allocation, freeing and duplication exist in kernel/fork.c, which is a violation of separation of concerns, and leaves these functions exposed to the rest of the kernel when they are in fact internal implementation details. Resolve this by moving this logic to mm, and making it internal to vma.c, vma.h. This also allows us, in future, to provide userland testing around this functionality. We additionally abstract dup_mmap() to mm, being careful to ensure kernel/fork.c accesses this via the mm internal header so it is not exposed elsewhere in the kernel. As part of this change, also abstract initial stack allocation performed in __bprm_mm_init() out of fs code into mm via the create_init_stack_vma(), as this code uses vm_area_alloc() and vm_area_free(). In order to do so sensibly, we introduce a new mm/vma_exec.c file, which contains the code that is shared by mm and exec. This file is added to both memory mapping and exec sections in MAINTAINERS so both sets of maintainers can maintain oversight. As part of this change, we also move relocate_vma_down() to mm/vma_exec.c so all shared mm/exec functionality is kept in one place. We add code shared between nommu and mmu-enabled configurations in order to share VMA allocation, freeing and duplication code correctly while also keeping these functions available in userland VMA testing. This is achieved by adding a mm/vma_init.c file which is also compiled by the userland tests. This patch (of 4): There is functionality that overlaps the exec and memory mapping subsystems. While it properly belongs in mm, it is important that exec maintainers maintain oversight of this functionality correctly. We can establish both goals by adding a new mm/vma_exec.c file which contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections. scripts/get_maintainer.pl can correctly handle files in multiple entries and this neatly handles the cross-over. [akpm@linux-foundation.org: fix comment typo] Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reviewed-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Pedro Falcato <pfalcato@suse.de> Reviewed-by: David Hildenbrand <david@redhat.com> Reviewed-by: Kees Cook <kees@kernel.org> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christian Brauner <brauner@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jann Horn <jannh@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2025-04-28 16:28:14 +01:00			`// SPDX-License-Identifier: GPL-2.0-only`

			`/*`
			`* Functions explicitly implemented for exec functionality which however are`
			`* explicitly VMA-only logic.`
			`*/`

			`#include "vma_internal.h"`
			`#include "vma.h"`

			`/*`
			`* Relocate a VMA downwards by shift bytes. There cannot be any VMAs between`
			`* this VMA and its relocated range, which will now reside at [vma->vm_start -`
			`* shift, vma->vm_end - shift).`
			`*`
			`* This function is almost certainly NOT what you want for anything other than`
			`* early executable temporary stack relocation.`
			`*/`
			`int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)`
			`{`
			`/*`
			`* The process proceeds as follows:`
			`*`
			`* 1) Use shift to calculate the new vma endpoints.`
			`* 2) Extend vma to cover both the old and new ranges. This ensures the`
			`* arguments passed to subsequent functions are consistent.`
			`* 3) Move vma's page tables to the new range.`
			`* 4) Free up any cleared pgd range.`
			`* 5) Shrink the vma to cover only the new range.`
			`*/`

			`struct mm_struct *mm = vma->vm_mm;`
			`unsigned long old_start = vma->vm_start;`
			`unsigned long old_end = vma->vm_end;`
			`unsigned long length = old_end - old_start;`
			`unsigned long new_start = old_start - shift;`
			`unsigned long new_end = old_end - shift;`
			`VMA_ITERATOR(vmi, mm, new_start);`
			`VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);`
			`struct vm_area_struct *next;`
			`struct mmu_gather tlb;`
			`PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);`

			`BUG_ON(new_start > new_end);`

			`/*`
			`* ensure there are no vmas between where we want to go`
			`* and where we are`
			`*/`
			`if (vma != vma_next(&vmi))`
			`return -EFAULT;`

			`vma_iter_prev_range(&vmi);`
			`/*`
			`* cover the whole range: [new_start, old_end)`
			`*/`
mm/vma: use vmg->target to specify target VMA for new VMA merge In commit 3a75ccba047b ("mm: simplify vma merge structure and expand comments") we introduced the vmg->target field to make the merging of existing VMAs simpler - clarifying precisely which VMA would eventually become the merged VMA once the merge operation was complete. New VMA merging did not get quite the same treatment, retaining the rather confusing convention of storing the target VMA in vmg->middle. This patch corrects this state of affairs, utilising vmg->target for this purpose for both vma_merge_new_range() and also for vma_expand(). We retain the WARN_ON for vmg->middle being specified in vma_merge_new_range() as doing so would make no sense, but add an additional debug assert for setting vmg->target. This patch additionally updates VMA userland testing to account for this change. [lorenzo.stoakes@oracle.com: make comment consistent in vma_expand()] Link: https://lkml.kernel.org/r/c54f45e3-a6ac-4749-93c0-cc9e3080ee37@lucifer.local Link: https://lkml.kernel.org/r/20250613184807.108089-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Jann Horn <jannh@google.com> Cc: Kees Cook <kees@kernel.org> Cc: Liam Howlett <liam.howlett@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2025-06-13 19:48:07 +01:00			`vmg.target = vma;`
mm: establish mm/vma_exec.c for shared exec/mm VMA functionality Patch series "move all VMA allocation, freeing and duplication logic to mm", v3. Currently VMA allocation, freeing and duplication exist in kernel/fork.c, which is a violation of separation of concerns, and leaves these functions exposed to the rest of the kernel when they are in fact internal implementation details. Resolve this by moving this logic to mm, and making it internal to vma.c, vma.h. This also allows us, in future, to provide userland testing around this functionality. We additionally abstract dup_mmap() to mm, being careful to ensure kernel/fork.c accesses this via the mm internal header so it is not exposed elsewhere in the kernel. As part of this change, also abstract initial stack allocation performed in __bprm_mm_init() out of fs code into mm via the create_init_stack_vma(), as this code uses vm_area_alloc() and vm_area_free(). In order to do so sensibly, we introduce a new mm/vma_exec.c file, which contains the code that is shared by mm and exec. This file is added to both memory mapping and exec sections in MAINTAINERS so both sets of maintainers can maintain oversight. As part of this change, we also move relocate_vma_down() to mm/vma_exec.c so all shared mm/exec functionality is kept in one place. We add code shared between nommu and mmu-enabled configurations in order to share VMA allocation, freeing and duplication code correctly while also keeping these functions available in userland VMA testing. This is achieved by adding a mm/vma_init.c file which is also compiled by the userland tests. This patch (of 4): There is functionality that overlaps the exec and memory mapping subsystems. While it properly belongs in mm, it is important that exec maintainers maintain oversight of this functionality correctly. We can establish both goals by adding a new mm/vma_exec.c file which contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections. scripts/get_maintainer.pl can correctly handle files in multiple entries and this neatly handles the cross-over. [akpm@linux-foundation.org: fix comment typo] Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reviewed-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Pedro Falcato <pfalcato@suse.de> Reviewed-by: David Hildenbrand <david@redhat.com> Reviewed-by: Kees Cook <kees@kernel.org> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christian Brauner <brauner@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jann Horn <jannh@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2025-04-28 16:28:14 +01:00			`if (vma_expand(&vmg))`
			`return -ENOMEM;`

			`/*`
			`* move the page tables downwards, on failure we rely on`
			`* process cleanup to remove whatever mess we made.`
			`*/`
			`pmc.for_stack = true;`
			`if (length != move_page_tables(&pmc))`
			`return -ENOMEM;`

			`tlb_gather_mmu(&tlb, mm);`
			`next = vma_next(&vmi);`
			`if (new_end > old_start) {`
			`/*`
			`* when the old and new regions overlap clear from new_end.`
			`*/`
			`free_pgd_range(&tlb, new_end, old_end, new_end,`
			`next ? next->vm_start : USER_PGTABLES_CEILING);`
			`} else {`
			`/*`
			`* otherwise, clean from old_start; this is done to not touch`
			`* the address space in [new_end, old_start) some architectures`
			`* have constraints on va-space that make this illegal (IA64) -`
			`* for the others its just a little faster.`
			`*/`
			`free_pgd_range(&tlb, old_start, old_end, new_end,`
			`next ? next->vm_start : USER_PGTABLES_CEILING);`
			`}`
			`tlb_finish_mmu(&tlb);`

			`vma_prev(&vmi);`
			`/* Shrink the vma to just the new range */`
			`return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);`
			`}`
mm: abstract initial stack setup to mm subsystem There are peculiarities within the kernel where what is very clearly mm code is performed elsewhere arbitrarily. This violates separation of concerns and makes it harder to refactor code to make changes to how fundamental initialisation and operation of mm logic is performed. One such case is the creation of the VMA containing the initial stack upon execve()'ing a new process. This is currently performed in __bprm_mm_init() in fs/exec.c. Abstract this operation to create_init_stack_vma(). This allows us to limit use of vma allocation and free code to fork and mm only. We previously did the same for the step at which we relocate the initial stack VMA downwards via relocate_vma_down(), now we move the initial VMA establishment too. Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's no longer needed anywhere outside of mm. Link: https://lkml.kernel.org/r/118c950ef7a8dd19ab20a23a68c3603751acd30e.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reviewed-by: Pedro Falcato <pfalcato@suse.de> Reviewed-by: Kees Cook <kees@kernel.org> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christian Brauner <brauner@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jann Horn <jannh@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> 2025-04-28 16:28:15 +01:00
			`/*`
			`* Establish the stack VMA in an execve'd process, located temporarily at the`
			`* maximum stack address provided by the architecture.`
			`*`
			`* We later relocate this downwards in relocate_vma_down().`
			`*`
			`* This function is almost certainly NOT what you want for anything other than`
			`* early executable initialisation.`
			`*`
			`* On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the`
			`* maximum addressable location in the stack (that is capable of storing a`
			`* system word of data).`
			`*/`
			`int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,`
			`unsigned long *top_mem_p)`
			`{`
			`int err;`
			`struct vm_area_struct *vma = vm_area_alloc(mm);`

			`if (!vma)`
			`return -ENOMEM;`

			`vma_set_anonymous(vma);`

			`if (mmap_write_lock_killable(mm)) {`
			`err = -EINTR;`
			`goto err_free;`
			`}`

			`/*`
			`* Need to be called with mmap write lock`
			`* held, to avoid race with ksmd.`
			`*/`
			`err = ksm_execve(mm);`
			`if (err)`
			`goto err_ksm;`

			`/*`
			`* Place the stack at the largest stack address the architecture`
			`* supports. Later, we'll move this to an appropriate place. We don't`
			`* use STACK_TOP because that can depend on attributes which aren't`
			`* configured yet.`
			`*/`
			`BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);`
			`vma->vm_end = STACK_TOP_MAX;`
			`vma->vm_start = vma->vm_end - PAGE_SIZE;`
			`vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);`
			`vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);`

			`err = insert_vm_struct(mm, vma);`
			`if (err)`
			`goto err;`

			`mm->stack_vm = mm->total_vm = 1;`
			`mmap_write_unlock(mm);`
			`*vmap = vma;`
			`*top_mem_p = vma->vm_end - sizeof(void *);`
			`return 0;`

			`err:`
			`ksm_exit(mm);`
			`err_ksm:`
			`mmap_write_unlock(mm);`
			`err_free:`
			`*vmap = NULL;`
			`vm_area_free(vma);`
			`return err;`
			`}`