linux/arch/x86/kernel/relocate_kernel_64.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * relocate_kernel.S - put the kernel image in place to boot
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/asm-offsets.h>

/*
 * Must be relocatable PIC code callable as a C function, in particular
 * there must be a plain RET and not jump to return thunk.
 */

#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

/*
 * The .text..relocate_kernel and .data..relocate_kernel sections are copied
 * into the control page, and the remainder of the page is used as the stack.
 */

	.section .data..relocate_kernel,"a";
/* Minimal CPU state */
SYM_DATA_LOCAL(saved_rsp, .quad 0)
SYM_DATA_LOCAL(saved_cr0, .quad 0)
SYM_DATA_LOCAL(saved_cr3, .quad 0)
SYM_DATA_LOCAL(saved_cr4, .quad 0)
	/* other data */
SYM_DATA(kexec_va_control_page, .quad 0)
SYM_DATA(kexec_pa_table_page, .quad 0)
SYM_DATA(kexec_pa_swap_page, .quad 0)
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
SYM_DATA(kexec_debug_8250_port, .word 0)

	.balign 16
SYM_DATA_START_LOCAL(kexec_debug_gdt)
	.word   kexec_debug_gdt_end - kexec_debug_gdt - 1
	.long   0
	.word   0
	.quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
	.quad   0x00af9a000000ffff      /* __KERNEL_CS */
	.quad   0x00cf92000000ffff      /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)

	.balign 8
SYM_DATA_START(kexec_debug_idt)
	.skip 0x100, 0x00
SYM_DATA_END(kexec_debug_idt)

	.section .text..relocate_kernel,"ax";
	.code64
SYM_CODE_START_NOALIGN(relocate_kernel)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR
	/*
	 * %rdi indirection_page
	 * %rsi pa_control_page
	 * %rdx start address
	 * %rcx preserve_context
	 * %r8  host_mem_enc_active
	 */

	/* Save the CPU context, used for jumping back */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushf

	/* Invalidate GDT/IDT, zero out flags */
	pushq	$0
	pushq	$0

	lidt	(%rsp)
	lgdt	(%rsp)
	addq	$8, %rsp
	popfq

	/* Switch to the identity mapped page tables */
	movq	%cr3, %rax
	movq	kexec_pa_table_page(%rip), %r9
	movq	%r9, %cr3

	/* Leave CR4 in %r13 to enable the right paging mode later. */
	movq	%cr4, %r13

	/* Disable global pages immediately to ensure this mapping is RWX */
	movq	%r13, %r12
	andq	$~(X86_CR4_PGE), %r12
	movq	%r12, %cr4

	/* Save %rsp and CRs. */
	movq	%r13, saved_cr4(%rip)
	movq    %rsp, saved_rsp(%rip)
	movq	%rax, saved_cr3(%rip)
	movq	%cr0, %rax
	movq	%rax, saved_cr0(%rip)

	/* save indirection list for jumping back */
	movq	%rdi, pa_backup_pages_map(%rip)

	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
	movq	%rcx, %r11

	/* setup a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%rsi), %rsp

	/* jump to identity mapped page */
0:	addq	$identity_mapped - 0b, %rsi
	subq	$__relocate_kernel_start - 0b, %rsi
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(relocate_kernel)

SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi	indirection page
	 * %rdx start address
	 * %r8 host_mem_enc_active
	 * %r9 page table page
	 * %r11 preserve_context
	 * %r13 original CR4 when relocate_kernel() was invoked
	 */

	/* store the start address on the stack */
	pushq   %rdx

	/* Create a GDTR (16 bits limit, 64 bits addr) on stack */
	leaq	kexec_debug_gdt(%rip), %rax
	pushq	%rax
	pushw	(%rax)

	/* Load the GDT, put the stack back */
	lgdt	(%rsp)
	addq	$10, %rsp

	/* Test that we can load segments */
	movq	%ds, %rax
	movq	%rax, %ds

	/* Now an IDTR on the stack to load the IDT the kernel created */
	leaq	kexec_debug_idt(%rip), %rsi
	pushq	%rsi
	pushw	$0xff
	lidt	(%rsp)
	addq	$10, %rsp

	//int3

	/*
	 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
	 * below.
	 */
	movq	%cr4, %rax
	andq	$~(X86_CR4_CET), %rax
	movq	%rax, %cr4

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 *  - 5-level paging, if it was enabled before
	 *  - Machine check exception on TDX guest, if it was enabled before.
	 *    Clearing MCE might not be allowed in TDX guests, depending on setup.
	 *
	 * Use R13 that contains the original CR4 value, read in relocate_kernel().
	 * PAE is always set in the original CR4.
	 */
	andl	$(X86_CR4_PAE | X86_CR4_LA57), %r13d
	ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
	movq	%r13, %cr4

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/*
	 * If SME is active, there could be old encrypted cache line
	 * entries that will conflict with the now unencrypted memory
	 * used by kexec. Flush the caches before copying the kernel.
	 */
	testq	%r8, %r8
	jz .Lsme_off
	wbinvd
.Lsme_off:

	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB by reloading %cr3 here, it's handy,
	 * and not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	testq	%r11, %r11	/* preserve_context */
	jnz .Lrelocate

	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl    %ecx, %ecx
	xorl    %edx, %edx
	xorl    %esi, %esi
	xorl    %edi, %edi
	xorl    %ebp, %ebp
	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	xorl	%r11d, %r11d
	xorl	%r12d, %r12d
	xorl	%r13d, %r13d
	xorl	%r14d, %r14d
	xorl	%r15d, %r15d

	ANNOTATE_UNRET_SAFE
	ret
	int3

.Lrelocate:
	popq	%rdx

	/* Use the swap page for the callee's stack */
	movq	kexec_pa_swap_page(%rip), %r10
	leaq	PAGE_SIZE(%r10), %rsp

	/* push the existing entry point onto the callee's stack */
	pushq	%rdx

	ANNOTATE_RETPOLINE_SAFE
	call	*%rdx

	/* get the re-entry point of the peer system */
	popq	%rbp
	movq	kexec_pa_swap_page(%rip), %r10
	movq	pa_backup_pages_map(%rip), %rdi
	movq	kexec_pa_table_page(%rip), %rax
	movq	%rax, %cr3

	/* Find start (and end) of this physical mapping of control page */
	leaq	(%rip), %r8
	ANNOTATE_NOENDBR
	andq	$PAGE_MASK, %r8
	lea	PAGE_SIZE(%r8), %rsp
	movl	$1, %r11d	/* Ensure preserve_context flag is set */
	call	swap_pages
	movq	kexec_va_control_page(%rip), %rax
0:	addq	$virtual_mapped - 0b, %rax
	subq	$__relocate_kernel_start - 0b, %rax
	pushq	%rax
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(identity_mapped)

SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // RET target, above
	movq	saved_rsp(%rip), %rsp
	movq	saved_cr4(%rip), %rax
	movq	%rax, %cr4
	movq	saved_cr3(%rip), %rax
	movq	saved_cr0(%rip), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0

#ifdef CONFIG_KEXEC_JUMP
	/* Saved in save_processor_state. */
	movq    $saved_context, %rax
	lgdt    saved_context_gdt_desc(%rax)
#endif

	/* relocate_kernel() returns the re-entry point for next time */
	movq	%rbp, %rax

	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(virtual_mapped)

	/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	UNWIND_HINT_END_OF_STACK
	/*
	 * %rdi indirection page
	 * %r11 preserve_context
	 */
	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
	xorl	%edi, %edi
	xorl	%esi, %esi
	jmp	.Lstart		/* Should start with an indirection record */

.Lloop:	/* top, read another word for the indirection page */

	movq	(%rbx), %rcx
	addq	$8,	%rbx
.Lstart:
	testb	$0x1,	%cl   /* is it a destination page? */
	jz	.Lnotdest
	movq	%rcx,	%rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	.Lloop
.Lnotdest:
	testb	$0x2,	%cl   /* is it an indirection page? */
	jz	.Lnotind
	movq	%rcx,   %rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	.Lloop
.Lnotind:
	testb	$0x4,	%cl   /* is it the done indicator? */
	jz	.Lnotdone
	jmp	.Ldone
.Lnotdone:
	testb	$0x8,	%cl   /* is it the source indicator? */
	jz	.Lloop	      /* Ignore it otherwise */
	movq	%rcx,   %rsi  /* For ever source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	movq	%rdi, %rdx    /* Save destination page to %rdx */
	movq	%rsi, %rax    /* Save source page to %rax */

	testq	%r11, %r11    /* Only actually swap for ::preserve_context */
	jz	.Lnoswap

	/* copy source page to swap page */
	movq	kexec_pa_swap_page(%rip), %rdi
	movl	$512, %ecx
	rep movsq

	/* copy destination page to source page */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movl	$512, %ecx
	rep movsq

	/* copy swap page to destination page */
	movq	%rdx, %rdi
	movq	kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
	movl	$512, %ecx
	rep movsq

	lea	PAGE_SIZE(%rax), %rsi
	jmp	.Lloop
.Ldone:
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(swap_pages)

/*
 * Generic 'print character' routine
 *  - %al: Character to be printed (may clobber %rax)
 *  - %rdx: MMIO address or port.
 */
#define XMTRDY          0x20

#define TXR             0       /*  Transmit register (WRITE) */
#define LSR             5       /*  Line Status               */

SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	addw	$LSR, %dx
	xchg	%al, %ah
.Lxmtrdy_loop:
	inb	%dx, %al
	testb	$XMTRDY, %al
	jnz	.Lready
	pause
	jmp .Lxmtrdy_loop

.Lready:
	subw	$LSR, %dx
	xchg	%al, %ah
	outb	%al, %dx
pr_char_null:
	ANNOTATE_NOENDBR

	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250)

SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
	movb	(LSR*4)(%rdx), %ah
	testb	$XMTRDY, %ah
	jnz	.Lready_mmio
	pause
	jmp .Lxmtrdy_loop_mmio

.Lready_mmio:
	movb	%al, (%rdx)
	ANNOTATE_UNRET_SAFE
	ret
SYM_CODE_END(pr_char_8250_mmio32)

/*
 * Load pr_char function pointer into %rsi and load %rdx with whatever
 * that function wants to see there (typically port/MMIO address).
 */
.macro pr_setup
	leaq	pr_char_8250(%rip), %rsi
	movw	kexec_debug_8250_port(%rip), %dx
	testw	%dx, %dx
	jnz	1f

	leaq	pr_char_8250_mmio32(%rip), %rsi
	movq	kexec_debug_8250_mmio32(%rip), %rdx
	testq	%rdx, %rdx
	jnz	1f

	leaq	pr_char_null(%rip), %rsi
1:
.endm

/* Print the nybble in %bl, clobber %rax */
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
	UNWIND_HINT_FUNC
	movb	%bl, %al
	nop
	andb	$0x0f, %al
	addb	$0x30, %al
	cmpb	$0x3a, %al
	jb	1f
	addb	$('a' - '0' - 10), %al
	ANNOTATE_RETPOLINE_SAFE
1:	jmp	*%rsi
SYM_CODE_END(pr_nybble)

SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
	UNWIND_HINT_FUNC
	movq	$16, %rcx
1:	rolq	$4, %rbx
	call	pr_nybble
	loop	1b
	movb	$'\n', %al
	ANNOTATE_RETPOLINE_SAFE
	jmp	*%rsi
SYM_CODE_END(pr_qword)

.macro print_reg a, b, c, d, r
	movb	$\a, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\b, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\c, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movb	$\d, %al
	ANNOTATE_RETPOLINE_SAFE
	call	*%rsi
	movq	\r, %rbx
	call	pr_qword
.endm

SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
	/* Each of these is 6 bytes. */
.macro vec_err exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	nop
	nop
	pushq	$\exc
	jmp	exc_handler
.endm

.macro vec_noerr exc
	UNWIND_HINT_ENTRY
	. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
	pushq	$0
	pushq	$\exc
	jmp	exc_handler
.endm

	ANNOTATE_NOENDBR
	vec_noerr  0 // #DE
	vec_noerr  1 // #DB
	vec_noerr  2 // #NMI
	vec_noerr  3 // #BP
	vec_noerr  4 // #OF
	vec_noerr  5 // #BR
	vec_noerr  6 // #UD
	vec_noerr  7 // #NM
	vec_err    8 // #DF
	vec_noerr  9
	vec_err   10 // #TS
	vec_err   11 // #NP
	vec_err   12 // #SS
	vec_err   13 // #GP
	vec_err   14 // #PF
	vec_noerr 15
SYM_CODE_END(kexec_debug_exc_vectors)

SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
	/* No need for RET mitigations during kexec */
	VALIDATE_UNRET_END

	pushq	%rax
	pushq	%rbx
	pushq	%rcx
	pushq	%rdx
	pushq	%rsi

	/* Stack frame */
#define EXC_SS		0x58 /* Architectural... */
#define EXC_RSP		0x50
#define EXC_EFLAGS	0x48
#define EXC_CS		0x40
#define EXC_RIP		0x38
#define EXC_ERRORCODE	0x30 /* Either architectural or zero pushed by handler */
#define EXC_EXCEPTION	0x28 /* Pushed by handler entry point */
#define EXC_RAX		0x20 /* Pushed just above in exc_handler */
#define EXC_RBX		0x18
#define EXC_RCX		0x10
#define EXC_RDX		0x08
#define EXC_RSI		0x00

	/* Set up %rdx/%rsi for debug output */
	pr_setup

	/* rip and exception info */
	print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
	print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
	print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
	print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)

	/* We spilled these to the stack */
	print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
	print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
	print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
	print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
	print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)

	/* Other registers untouched */
	print_reg 'r', 'd', 'i', ':', %rdi
	print_reg 'r', '8', ' ', ':', %r8
	print_reg 'r', '9', ' ', ':', %r9
	print_reg 'r', '1', '0', ':', %r10
	print_reg 'r', '1', '1', ':', %r11
	print_reg 'r', '1', '2', ':', %r12
	print_reg 'r', '1', '3', ':', %r13
	print_reg 'r', '1', '4', ':', %r14
	print_reg 'r', '1', '5', ':', %r15
	print_reg 'c', 'r', '2', ':', %cr2

	/* Only return from INT3 */
	cmpq	$3, EXC_EXCEPTION(%rsp)
	jne	.Ldie

	popq	%rsi
	popq	%rdx
	popq	%rcx
	popq	%rbx
	popq	%rax

	addq	$16, %rsp
	iretq

.Ldie:
	hlt
	jmp	.Ldie

SYM_CODE_END(exc_handler)