x86/percpu/64: Use relative percpu offsets

The percpu section is currently linked at absolute address 0, because
older compilers hard-coded the stack protector canary value at a fixed
offset from the start of the GS segment.  Now that the canary is a
normal percpu variable, the percpu section does not need to be linked
at a specific address.

x86-64 will now calculate the percpu offsets as the delta between the
initial percpu address and the dynamically allocated memory, like other
architectures.  Note that GSBASE is limited to the canonical address
width (48 or 57 bits, sign-extended).  As long as the kernel text,
modules, and the dynamically allocated percpu memory are all in the
negative address space, the delta will not overflow this limit.
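
The arithmetic can be illustrated with a small userspace sketch. The
addresses below are made-up examples (not real kernel values); the point
is only that the delta stays canonical when both endpoints are negative
addresses:

  #include <inttypes.h>
  #include <stdio.h>

  int main(void)
  {
      /* Made-up sample addresses, both in the negative (sign-extended) half. */
      uint64_t initial_percpu = 0xffffffff84000000ULL; /* link-time percpu image */
      uint64_t dynamic_percpu = 0xffff88813a600000ULL; /* dynamically allocated  */

      /* The percpu offset (and the GSBASE value) is simply the delta. */
      uint64_t offset = dynamic_percpu - initial_percpu;

      /* A percpu access adds the offset back to a link-time address. */
      uint64_t var = (initial_percpu + 0x40) + offset;

      printf("offset = 0x%016" PRIx64 "\n", offset); /* top 17 bits set -> canonical */
      printf("var    = 0x%016" PRIx64 "\n", var);    /* lands in the dynamic area   */
      return 0;
  }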

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250123190747.745588-9-brgerst@gmail.com
Commit:    9d7de2aa8b (parent: 80d47defdd)
Author:    Brian Gerst, 2025-01-23 14:07:40 -05:00
Committer: Ingo Molnar

8 changed files with 27 additions and 65 deletions

arch/x86/include/asm/processor.h

@@ -431,7 +431,11 @@ DECLARE_INIT_PER_CPU(fixed_percpu_data);
 
 static inline unsigned long cpu_kernelmode_gs_base(int cpu)
 {
-	return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
+#ifdef CONFIG_SMP
+	return per_cpu_offset(cpu);
+#else
+	return 0;
+#endif
 }
 
 extern asmlinkage void entry_SYSCALL32_ignore(void);

arch/x86/kernel/head_64.S

@@ -61,11 +61,14 @@ SYM_CODE_START_NOALIGN(startup_64)
 	/* Set up the stack for verify_cpu() */
 	leaq	__top_init_kernel_stack(%rip), %rsp
 
-	/* Setup GSBASE to allow stack canary access for C code */
+	/*
+	 * Set up GSBASE.
+	 * Note that on SMP the boot CPU uses the init data section until
+	 * the per-CPU areas are set up.
+	 */
 	movl	$MSR_GS_BASE, %ecx
-	leaq	INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
-	movl	%edx, %eax
-	shrq	$32, %rdx
+	xorl	%eax, %eax
+	xorl	%edx, %edx
 	wrmsr
 
 	call	startup_64_setup_gdt_idt
@@ -359,16 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
 	movl	%eax,%fs
 	movl	%eax,%gs
 
-	/* Set up %gs.
-	 *
-	 * The base of %gs always points to fixed_percpu_data.
+	/*
+	 * Set up GSBASE.
 	 * Note that, on SMP, the boot cpu uses init data section until
 	 * the per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
-#ifndef CONFIG_SMP
-	leaq	INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
-#endif
 	movl	%edx, %eax
 	shrq	$32, %rdx
 	wrmsr

arch/x86/kernel/setup_percpu.c

@@ -23,18 +23,10 @@
 #include <asm/cpumask.h>
 #include <asm/cpu.h>
 
-#ifdef CONFIG_X86_64
-#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
-#else
-#define BOOT_PERCPU_OFFSET 0
-#endif
-
-DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off);
 EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
-	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
-};
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
 EXPORT_SYMBOL(__per_cpu_offset);
 
 /*

arch/x86/kernel/vmlinux.lds.S

@@ -112,12 +112,6 @@ ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX
 
 PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
 	data PT_LOAD FLAGS(6);          /* RW_ */
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_SMP
-	percpu PT_LOAD FLAGS(6);        /* RW_ */
-#endif
-	init PT_LOAD FLAGS(7);          /* RWE */
-#endif
 	note PT_NOTE FLAGS(0);          /* ___ */
 }
@@ -216,21 +210,7 @@ SECTIONS
 		__init_begin = .; /* paired with __init_end */
 	}
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
-	/*
-	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-	 * output PHDR, so the next output section - .init.text - should
-	 * start another segment - init.
-	 */
-	PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
-	ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
-	       "per-CPU data too large - increase CONFIG_PHYSICAL_START")
-#endif
-
 	INIT_TEXT_SECTION(PAGE_SIZE)
-#ifdef CONFIG_X86_64
-	:init
-#endif
 
 	/*
 	 * Section for code used exclusively before alternatives are run. All
@@ -347,9 +327,7 @@ SECTIONS
 		EXIT_DATA
 	}
 
-#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
 	PERCPU_SECTION(INTERNODE_CACHE_BYTES)
-#endif
 
 	RUNTIME_CONST_VARIABLES
 	RUNTIME_CONST(ptr, USER_PTR_MAX)
@@ -497,16 +475,11 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
  * Per-cpu symbols which need to be offset from __per_cpu_load
  * for the boot processor.
  */
-#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x)
 INIT_PER_CPU(gdt_page);
 INIT_PER_CPU(fixed_percpu_data);
 INIT_PER_CPU(irq_stack_backing_store);
 
-#ifdef CONFIG_SMP
-. = ASSERT((fixed_percpu_data == 0),
-           "fixed_percpu_data is not at start of per-cpu area");
-#endif
-
 #ifdef CONFIG_MITIGATION_UNRET_ENTRY
 . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
 #endif

arch/x86/platform/pvh/head.S

@@ -179,9 +179,8 @@ SYM_CODE_START(pvh_start_xen)
 	 * the per-CPU areas are set up.
 	 */
 	movl $MSR_GS_BASE,%ecx
-	leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
-	movq %edx, %eax
-	shrq $32, %rdx
+	xorl %eax, %eax
+	xorl %edx, %edx
 	wrmsr
 
 	/* Call xen_prepare_pvh() via the kernel virtual mapping */

arch/x86/tools/relocs.c

@@ -835,12 +835,7 @@ static void percpu_init(void)
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
-	int shndx = sym_index(sym);
-
-	return (shndx == per_cpu_shndx) &&
-		strcmp(symname, "__init_begin") &&
-		strcmp(symname, "__per_cpu_load") &&
-		strncmp(symname, "init_per_cpu_", 13);
+	return 0;
 }
@@ -1062,7 +1057,8 @@ static int cmp_relocs(const void *va, const void *vb)
 
 static void sort_relocs(struct relocs *r)
 {
-	qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
+	if (r->count)
+		qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
 }
 
 static int write32(uint32_t v, FILE *f)

arch/x86/xen/xen-head.S

@@ -31,15 +31,14 @@ SYM_CODE_START(startup_xen)
 
 	leaq __top_init_kernel_stack(%rip), %rsp
 
-	/* Set up %gs.
-	 *
-	 * The base of %gs always points to fixed_percpu_data.
+	/*
+	 * Set up GSBASE.
 	 * Note that, on SMP, the boot cpu uses init data section until
 	 * the per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
-	movq	$INIT_PER_CPU_VAR(fixed_percpu_data),%rax
-	cdq
+	xorl	%eax, %eax
+	xorl	%edx, %edx
 	wrmsr
 
 	mov %rsi, %rdi

init/Kconfig

@@ -1872,7 +1872,7 @@ config KALLSYMS_ALL
 config KALLSYMS_ABSOLUTE_PERCPU
 	bool
 	depends on KALLSYMS
-	default X86_64 && SMP
+	default n
 
 # end of the "standard kernel features (expert users)" menu