* Further fixups for ITS mitigation

* Avoid using large pages for kernel mappings when PSE is not enumerated
* Avoid ever making indirect calls to TDX assembly helpers
* Fix a FRED single step issue when not using an external debugger
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEV76QKkVc4xCGURexaDWVMHDJkrAFAmhQWhsACgkQaDWVMHDJ
 krBOLg/8CQ4VVzSqaE5llP/nYkehQblmWM03UFHaOr8gzzPF4Pi+Aov9SpPu/X5E
 k0va6s4SuV2XVuWAyKkzJlPxjKBzp8gmOI0XgerEqKMTEihElmSkb89d5O5EhqqM
 OYNHe5dIhfDbESrbUep2HFvSWR9Q5Df1Gt8gMDhBzjxT7PlJF/U/sle2/G33Ydhj
 9IJvAyh349sJTF9+N8nVUB9YYNE2L6ozf/o14lDF+SoBytLJWugYUVgAukwrQeII
 kjcfLYuUGPLeWZcczFA/mqKYWQ+MJ+2Qx8Rh2P00HQhIAdGGKyoIla2b4Lw9MfAk
 xYPuWWjvOyI0IiumdKfMfnKRgHpqC8IuCZSPlooNiB8Mk2Nd+0L3C0z96Ov08cwg
 nKgKQVS1E0FGq35S2VzS01pitasgxFsBBQft9fTIfL+EBtYNfm+TK3bAr6Ep9b4M
 RqcnE997AkFx2D4AWnBLY3lKMi2XcP6b0GPGSXSJAHQlzPZNquYXPNEzI8jOQ4IH
 E0F8f5P26XDvFCX5P7EfV9TjACSPYRxZtHtwN9OQWjFngbK8cMmP94XmORSFyFec
 AFBZ5ZcgLVfQmNzjKljTUvfZvpQCNIiC8oADAVlcfUsm2B45cnYwpvsuz8vswmdQ
 uDrDa87Mh20d8JRMLIMZ2rh7HrXaB2skdxQ9niS/uv0L8jKNq9o=
 =BEsb
 -----END PGP SIGNATURE-----

Merge tag 'x86_urgent_for_6.16-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Dave Hansen:
 "This is a pretty scattered set of fixes. The majority of them are
  further fixups around the recent ITS mitigations.

  The rest don't really have a coherent story:

   - Some flavors of Xen PV guests don't support large pages, but the
     set_memory.c code assumes all CPUs support them.

     Avoid problems with a quick CPU feature check.

   - The TDX code has some wrappers to help retry calls to the TDX
     module. They use function pointers to assembly functions, and the
     compiler usually generates direct CALLs. But some newer compilers,
     combined with -Os, turned them into indirect CALLs, and the
     assembly code was not annotated for indirect calls.

     Force inlining of the helper to fix it up.

   - Last, a FRED issue showed up when single-stepping. It's fine when
     using an external debugger, but was getting stuck returning from a
     SIGTRAP handler otherwise.

     Clear the FRED 'swevent' bit to ensure that forward progress is
     made"

* tag 'x86_urgent_for_6.16-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  Revert "mm/execmem: Unify early execmem_cache behaviour"
  x86/its: explicitly manage permissions for ITS pages
  x86/its: move its_pages array to struct mod_arch_specific
  x86/Kconfig: only enable ROX cache in execmem when STRICT_MODULE_RWX is set
  x86/mm/pat: don't collapse pages without PSE set
  x86/virt/tdx: Avoid indirect calls to TDX assembly functions
  selftests/x86: Add a test to detect infinite SIGTRAP handler loop
  x86/fred/signal: Prevent immediate repeat of single step trap on return from SIGTRAP handler
Commit 9afe652958 by Linus Torvalds, 2025-06-16 11:36:21 -07:00
16 changed files with 207 additions and 84 deletions

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -89,7 +89,7 @@ config X86
 	select ARCH_HAS_DMA_OPS			if GART_IOMMU || XEN
 	select ARCH_HAS_EARLY_DEBUG		if KGDB
 	select ARCH_HAS_ELF_RANDOMIZE
-	select ARCH_HAS_EXECMEM_ROX		if X86_64
+	select ARCH_HAS_EXECMEM_ROX		if X86_64 && STRICT_MODULE_RWX
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL

--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,12 +5,20 @@
 #include <asm-generic/module.h>
 #include <asm/orc_types.h>
 
+struct its_array {
+#ifdef CONFIG_MITIGATION_ITS
+	void **pages;
+	int num;
+#endif
+};
+
 struct mod_arch_specific {
 #ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;
 #endif
+	struct its_array its_pages;
 };
 
 #endif /* _ASM_X86_MODULE_H */

--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
 int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
 int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
 
+/*
+ * To prevent immediate repeat of single step trap on return from SIGTRAP
+ * handler if the trap flag (TF) is set without an external debugger attached,
+ * clear the software event flag in the augmented SS, ensuring no single-step
+ * trap is pending upon ERETU completion.
+ *
+ * Note, this function should be called in sigreturn() before the original
+ * state is restored to make sure the TF is read from the entry frame.
+ */
+static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs)
+{
+	/*
+	 * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction
+	 * is being single-stepped, do not clear the software event flag in the
+	 * augmented SS, thus a debugger won't skip over the following instruction.
+	 */
+#ifdef CONFIG_X86_FRED
+	if (!(regs->flags & X86_EFLAGS_TF))
+		regs->fred_ss.swevent = 0;
+#endif
+}
+
 #endif /* _ASM_X86_SIGHANDLING_H */

--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -106,7 +106,7 @@ void tdx_init(void);
 
 typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
 
-static inline u64 sc_retry(sc_func_t func, u64 fn,
-			   struct tdx_module_args *args)
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
+				    struct tdx_module_args *args)
 {
 	int retry = RDRAND_RETRY_LOOPS;

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -116,6 +116,24 @@ static struct module *its_mod;
 #endif
 static void *its_page;
 static unsigned int its_offset;
+struct its_array its_pages;
+
+static void *__its_alloc(struct its_array *pages)
+{
+	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+	if (!page)
+		return NULL;
+
+	void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+			     GFP_KERNEL);
+	if (!tmp)
+		return NULL;
+
+	pages->pages = tmp;
+	pages->pages[pages->num++] = page;
+
+	return no_free_ptr(page);
+}
 
 /* Initialize a thunk with the "jmp *reg; int3" instructions. */
 static void *its_init_thunk(void *thunk, int reg)
@@ -151,6 +169,21 @@ static void *its_init_thunk(void *thunk, int reg)
 	return thunk + offset;
 }
 
+static void its_pages_protect(struct its_array *pages)
+{
+	for (int i = 0; i < pages->num; i++) {
+		void *page = pages->pages[i];
+		execmem_restore_rox(page, PAGE_SIZE);
+	}
+}
+
+static void its_fini_core(void)
+{
+	if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+		its_pages_protect(&its_pages);
+	kfree(its_pages.pages);
+}
+
 #ifdef CONFIG_MODULES
 void its_init_mod(struct module *mod)
 {
@@ -173,10 +206,8 @@ void its_fini_mod(struct module *mod)
 	its_page = NULL;
 	mutex_unlock(&text_mutex);
 
-	for (int i = 0; i < mod->its_num_pages; i++) {
-		void *page = mod->its_page_array[i];
-		execmem_restore_rox(page, PAGE_SIZE);
-	}
+	if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		its_pages_protect(&mod->arch.its_pages);
 }
 
 void its_free_mod(struct module *mod)
@@ -184,37 +215,33 @@ void its_free_mod(struct module *mod)
 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 		return;
 
-	for (int i = 0; i < mod->its_num_pages; i++) {
-		void *page = mod->its_page_array[i];
+	for (int i = 0; i < mod->arch.its_pages.num; i++) {
+		void *page = mod->arch.its_pages.pages[i];
 		execmem_free(page);
 	}
 
-	kfree(mod->its_page_array);
+	kfree(mod->arch.its_pages.pages);
 }
 #endif /* CONFIG_MODULES */
 
 static void *its_alloc(void)
 {
-	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
-
+	struct its_array *pages = &its_pages;
+	void *page;
+
+#ifdef CONFIG_MODULE
+	if (its_mod)
+		pages = &its_mod->arch.its_pages;
+#endif
+
+	page = __its_alloc(pages);
 	if (!page)
 		return NULL;
 
-#ifdef CONFIG_MODULES
-	if (its_mod) {
-		void *tmp = krealloc(its_mod->its_page_array,
-				     (its_mod->its_num_pages+1) * sizeof(void *),
-				     GFP_KERNEL);
-
-		if (!tmp)
-			return NULL;
+	execmem_make_temp_rw(page, PAGE_SIZE);
+	if (pages == &its_pages)
+		set_memory_x((unsigned long)page, 1);
 
-		its_mod->its_page_array = tmp;
-		its_mod->its_page_array[its_mod->its_num_pages++] = page;
-		execmem_make_temp_rw(page, PAGE_SIZE);
-	}
-#endif /* CONFIG_MODULES */
-
-	return no_free_ptr(page);
+	return page;
 }
 
 static void *its_allocate_thunk(int reg)
@@ -268,7 +295,9 @@ u8 *its_static_thunk(int reg)
 	return thunk;
 }
 
-#endif
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
 
 /*
  * Nomenclature for variable names to simplify and clarify this code and ease
@@ -2338,6 +2367,8 @@ void __init alternative_instructions(void)
 	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
 	apply_returns(__return_sites, __return_sites_end);
 
+	its_fini_core();
+
 	/*
	 * Adjust all CALL instructions to point to func()-10, including
	 * those in .altinstr_replacement.

--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn)
 	struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
 	sigset_t set;
 
+	prevent_single_step_upon_eretu(regs);
+
 	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.oldmask)
@@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn)
 	struct rt_sigframe_ia32 __user *frame;
 	sigset_t set;
 
+	prevent_single_step_upon_eretu(regs);
+
 	frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
 
 	if (!access_ok(frame, sizeof(*frame)))

--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	sigset_t set;
 	unsigned long uc_flags;
 
+	prevent_single_step_upon_eretu(regs);
+
 	frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
 	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
@@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
 	sigset_t set;
 	unsigned long uc_flags;
 
+	prevent_single_step_upon_eretu(regs);
+
 	frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
 
 	if (!access_ok(frame, sizeof(*frame)))

--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -30,7 +30,6 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 #include <linux/gfp.h>
-#include <linux/execmem.h>
 
 #include <asm/asm.h>
 #include <asm/bios_ebda.h>
@@ -749,8 +748,6 @@ void mark_rodata_ro(void)
 	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
-	execmem_cache_make_ro();
-
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG

--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
-#include <linux/execmem.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1392,8 +1391,6 @@ void mark_rodata_ro(void)
 			(end - start) >> 10);
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
-	execmem_cache_make_ro();
-
 	kernel_set_to_readonly = 1;
 
 	/*

--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
 	pgprot_t pgprot;
 	int i = 0;
 
+	if (!cpu_feature_enabled(X86_FEATURE_PSE))
+		return 0;
+
 	addr &= PMD_MASK;
 	pte = pte_offset_kernel(pmd, addr);
 	first = *pte;

--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -75,8 +75,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err,
 		args->r9, args->r10, args->r11);
 }
 
-static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
-				 u64 fn, struct tdx_module_args *args)
+static __always_inline int sc_retry_prerr(sc_func_t func,
+					  sc_err_func_t err_func,
+					  u64 fn, struct tdx_module_args *args)
 {
 	u64 sret = sc_retry(func, fn, args);
 

--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -54,7 +54,7 @@ enum execmem_range_flags {
 	EXECMEM_ROX_CACHE	= (1 << 1),
 };
 
-#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM)
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
 /**
  * execmem_fill_trapping_insns - set memory to contain instructions that
  *				 will trap
@@ -94,15 +94,9 @@ int execmem_make_temp_rw(void *ptr, size_t size);
  * Return: 0 on success or negative error code on failure.
  */
 int execmem_restore_rox(void *ptr, size_t size);
-
-/*
- * Called from mark_readonly(), where the system transitions to ROX.
- */
-void execmem_cache_make_ro(void);
 #else
 static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
 static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
-static inline void execmem_cache_make_ro(void) { }
 #endif
 
 /**

--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,11 +586,6 @@ struct module {
 	atomic_t refcnt;
 #endif
 
-#ifdef CONFIG_MITIGATION_ITS
-	int its_num_pages;
-	void **its_page_array;
-#endif
-
 #ifdef CONFIG_CONSTRUCTORS
 	/* Constructor functions. */
 	ctor_fn_t *ctors;

--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,34 +254,6 @@ out_unlock:
 	return ptr;
 }
 
-static bool execmem_cache_rox = false;
-
-void execmem_cache_make_ro(void)
-{
-	struct maple_tree *free_areas = &execmem_cache.free_areas;
-	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
-	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
-	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
-	struct mutex *mutex = &execmem_cache.mutex;
-	void *area;
-
-	execmem_cache_rox = true;
-
-	mutex_lock(mutex);
-
-	mas_for_each(&mas_free, area, ULONG_MAX) {
-		unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
-		set_memory_ro(mas_free.index, pages);
-	}
-
-	mas_for_each(&mas_busy, area, ULONG_MAX) {
-		unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
-		set_memory_ro(mas_busy.index, pages);
-	}
-
-	mutex_unlock(mutex);
-}
-
 static int execmem_cache_populate(struct execmem_range *range, size_t size)
 {
 	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
 	/* fill memory with instructions that will trap */
 	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
 
-	if (execmem_cache_rox) {
-		err = set_memory_rox((unsigned long)p, vm->nr_pages);
-		if (err)
-			goto err_free_mem;
-	} else {
-		err = set_memory_x((unsigned long)p, vm->nr_pages);
-		if (err)
-			goto err_free_mem;
-	}
+	err = set_memory_rox((unsigned long)p, vm->nr_pages);
+	if (err)
+		goto err_free_mem;
 
 	err = execmem_cache_add(p, alloc_size);
 	if (err)

--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" trivial_program.c -no-pie)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
 			check_initial_reg_state sigreturn iopl ioperm \
-			test_vsyscall mov_ss_trap \
+			test_vsyscall mov_ss_trap sigtrap_loop \
 			syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_BOTHBITS += nx_stack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \

--- /dev/null
+++ b/tools/testing/selftests/x86/sigtrap_loop.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Intel Corporation
+ */
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ucontext.h>
+
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags)
+{
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+
+	return;
+}
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+	static unsigned int loop_count_on_same_ip;
+	static unsigned long last_trap_ip;
+
+	if (last_trap_ip == ctx->uc_mcontext.gregs[REG_IP]) {
+		printf("\tTrapped at %016lx\n", last_trap_ip);
+
+		/*
+		 * If the same IP is hit more than 10 times in a row, it is
+		 * _considered_ an infinite loop.
+		 */
+		if (++loop_count_on_same_ip > 10) {
+			printf("[FAIL]\tDetected SIGTRAP infinite loop\n");
+			exit(1);
+		}
+
+		return;
+	}
+
+	loop_count_on_same_ip = 0;
+	last_trap_ip = ctx->uc_mcontext.gregs[REG_IP];
+	printf("\tTrapped at %016lx\n", last_trap_ip);
+}
+
+int main(int argc, char *argv[])
+{
+	sethandler(SIGTRAP, sigtrap, 0);
+
+	/*
+	 * Set the Trap Flag (TF) to single-step the test code, therefore to
+	 * trigger a SIGTRAP signal after each instruction until the TF is
+	 * cleared.
+	 *
+	 * Because the arithmetic flags are not significant here, the TF is
+	 * set by pushing 0x302 onto the stack and then popping it into the
+	 * flags register.
+	 *
+	 * Four instructions in the following asm code are executed with the
+	 * TF set, thus the SIGTRAP handler is expected to run four times.
+	 */
+	printf("[RUN]\tSIGTRAP infinite loop detection\n");
+	asm volatile(
+#ifdef __x86_64__
+		/*
+		 * Avoid clobbering the redzone
+		 *
+		 * Equivalent to "sub $128, %rsp", however -128 can be encoded
+		 * in a single byte immediate while 128 uses 4 bytes.
+		 */
+		"add $-128, %rsp\n\t"
+#endif
+		"push $0x302\n\t"
+		"popf\n\t"
+		"nop\n\t"
+		"nop\n\t"
+		"push $0x202\n\t"
+		"popf\n\t"
+#ifdef __x86_64__
+		"sub $-128, %rsp\n\t"
+#endif
+	);
+
+	printf("[OK]\tNo SIGTRAP infinite loop detected\n");
+	return 0;
+}
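
The test is wired into the x86 selftests Makefile above, so with the usual
kselftest flow it can be built and run with something like:

	$ make -C tools/testing/selftests/x86 sigtrap_loop_64
	$ tools/testing/selftests/x86/sigtrap_loop_64

On a fixed kernel it should print four "Trapped at ..." lines followed by
the "[OK]" message; on an affected FRED system the handler keeps trapping
at the same IP and reports the "[FAIL]" message instead.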