mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Instead of always sharing the FDs with the userspace process, only hand
over the FDs needed for mmap when required. The idea is that userspace
might be able to force the stub into executing an mmap syscall; however,
it will not be able to manipulate the control flow sufficiently to gain
access to an FD that would allow mapping arbitrary memory.

Security-wise, we need to be sure that only the expected syscalls are
executed after the kernel sends FDs through the socket. This is currently
not the case, as userspace can trivially jump to the rt_sigreturn syscall
instruction to execute any syscall that the stub is permitted to do. With
this, it can trick the kernel into sending the FD, which in turn allows
userspace to freely map any physical memory.

As such, this is currently *not* secure. However, in principle the
approach should be fine with a stricter SECCOMP filter and a careful
review of the stub control flow (as userspace can prepare a stack). With
some care, it is likely possible to extend the security model to SMP if
desired.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-8-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
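For context, "the socket" above is the per-MM stub socket (mmu->id.sock in
the file below), and handing an FD across it uses the standard SCM_RIGHTS
ancillary-data mechanism. The following is a minimal sketch of the sending
side of that mechanism only; send_fd() and its error handling are
illustrative and are not the actual UML helpers.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Illustrative only: pass a single file descriptor over a unix socket */
static int send_fd(int sock, int fd)
{
	char dummy = 0;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u = { 0 };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	/* Send one byte of normal payload alongside the ancillary data */
	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}

The receiving side uses recvmsg() with a matching control buffer and reads
the new descriptor out of CMSG_DATA(). The commit above restricts when the
kernel is willing to perform this hand-over at all.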
145 lines | 3.3 KiB | C
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
 * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#include <shared/irq_kern.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/mmu_context.h>
#include <as-layout.h>
#include <os.h>
#include <skas.h>
#include <stub-data.h>

/* Ensure the stub_data struct covers the allocated area */
static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE);

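/*
 * All MMs with a running stub process; used to find the affected MM
 * when a stub child dies (see mm_sigchld_irq() below).
 */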
spinlock_t mm_list_lock;
struct list_head mm_list;

int init_new_context(struct task_struct *task, struct mm_struct *mm)
{
	struct mm_id *new_id = &mm->context.id;
	unsigned long stack = 0;
	int ret = -ENOMEM;

	stack = __get_free_pages(GFP_KERNEL | __GFP_ZERO, ilog2(STUB_DATA_PAGES));
	if (stack == 0)
		goto out;

	new_id->stack = stack;

	scoped_guard(spinlock_irqsave, &mm_list_lock) {
		/* Insert into list, used for lookups when the child dies */
		list_add(&mm->context.list, &mm_list);
	}

	ret = start_userspace(new_id);
	if (ret < 0)
		goto out_free;

	/* Ensure the new MM is clean and nothing unwanted is mapped */
	unmap(new_id, 0, STUB_START);

	return 0;

out_free:
	free_pages(new_id->stack, ilog2(STUB_DATA_PAGES));
out:
	return ret;
}

void destroy_context(struct mm_struct *mm)
{
	struct mm_context *mmu = &mm->context;

	/*
	 * If init_new_context wasn't called, this will be
	 * zero, resulting in a kill(0), which will result in the
	 * whole UML suddenly dying.  Also, cover negative and
	 * 1 cases, since they shouldn't happen either.
	 *
	 * Negative cases happen if the child died unexpectedly.
	 */
	if (mmu->id.pid >= 0 && mmu->id.pid < 2) {
		printk(KERN_ERR "corrupt mm_context - pid = %d\n",
		       mmu->id.pid);
		return;
	}

	if (mmu->id.pid > 0) {
		os_kill_ptraced_process(mmu->id.pid, 1);
		mmu->id.pid = -1;
	}

	if (using_seccomp && mmu->id.sock)
		os_close_file(mmu->id.sock);

	free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));

	guard(spinlock_irqsave)(&mm_list_lock);

	list_del(&mm->context.list);
}
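
/*
 * SIGCHLD handler: reap dead stub children and mark the matching MM as
 * dead; affected tasks will subsequently segfault. Only relevant in
 * SECCOMP mode.
 */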
static irqreturn_t mm_sigchld_irq(int irq, void* dev)
{
	struct mm_context *mm_context;
	pid_t pid;

	guard(spinlock)(&mm_list_lock);

	while ((pid = os_reap_child()) > 0) {
		/*
		 * A child died, check if we have an MM with the PID. This is
		 * only relevant in SECCOMP mode (as ptrace will fail anyway).
		 *
		 * See wait_stub_done_seccomp for more details.
		 */
		list_for_each_entry(mm_context, &mm_list, list) {
			if (mm_context->id.pid == pid) {
				struct stub_data *stub_data;
				printk("Unexpectedly lost MM child! Affected tasks will segfault.");

				/* Marks the MM as dead */
				mm_context->id.pid = -1;

				/*
				 * NOTE: If SMP is implemented, a futex_wake
				 * needs to be added here.
				 */
				stub_data = (void *)mm_context->id.stack;
				stub_data->futex = FUTEX_IN_KERN;

				/*
				 * NOTE: Currently executing syscalls by
				 * affected tasks may finish normally.
				 */
				break;
			}
		}
	}

	return IRQ_HANDLED;
}

static int __init init_child_tracking(void)
{
	int err;

	spin_lock_init(&mm_list_lock);
	INIT_LIST_HEAD(&mm_list);

	err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL);
	if (err < 0)
		panic("Failed to register SIGCHLD IRQ: %d", err);

	return 0;
}
early_initcall(init_child_tracking)