linux/arch/x86/kvm/smm.c


/* SPDX-License-Identifier: GPL-2.0 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "smm.h"
#include "cpuid.h"
#include "trace.h"

#define CHECK_SMRAM32_OFFSET(field, offset) \
	ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)

#define CHECK_SMRAM64_OFFSET(field, offset) \
	ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
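
/*
 * Compile-time layout checks: every field of kvm_smram_state_{32,64} must sit
 * at its architectural offset within the SMRAM state-save area (0xFE00-0xFFFF
 * relative to SMBASE), so the structures can be copied to and from guest
 * SMRAM verbatim.
 */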
static void check_smram_offsets(void)
{
	/* 32 bit SMRAM image */
	CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
	CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
	CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
	CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
	CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
	CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
	CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
	CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
	CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
	CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
	CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
	CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
	CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
	CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
	CHECK_SMRAM32_OFFSET(fs, 0xFF38);
	CHECK_SMRAM32_OFFSET(gs, 0xFF44);
	CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
	CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
	CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
	CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
	CHECK_SMRAM32_OFFSET(es, 0xFF84);
	CHECK_SMRAM32_OFFSET(cs, 0xFF90);
	CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
	CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
	CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
	CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
	CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
	CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
	CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
	CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
	CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
	CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
	CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
	CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
	CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
	CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
	CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
	CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);

	/* 64 bit SMRAM image */
	CHECK_SMRAM64_OFFSET(es, 0xFE00);
	CHECK_SMRAM64_OFFSET(cs, 0xFE10);
	CHECK_SMRAM64_OFFSET(ss, 0xFE20);
	CHECK_SMRAM64_OFFSET(ds, 0xFE30);
	CHECK_SMRAM64_OFFSET(fs, 0xFE40);
	CHECK_SMRAM64_OFFSET(gs, 0xFE50);
	CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
	CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
	CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
	CHECK_SMRAM64_OFFSET(tr, 0xFE90);
	CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
	CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
	CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
	CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
	CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
	CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
	CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
	CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
	CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
	CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
	CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
	CHECK_SMRAM64_OFFSET(efer, 0xFED0);
	CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
	CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
	CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
	CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
	CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC);
	CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
	CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
	CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
	CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
	CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
	CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
	CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
	CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
	CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
	CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
	CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
	CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
	CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
	CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
	CHECK_SMRAM64_OFFSET(rip, 0xFF78);
	CHECK_SMRAM64_OFFSET(gprs, 0xFF80);

	BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
}

#undef CHECK_SMRAM64_OFFSET
#undef CHECK_SMRAM32_OFFSET
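
/*
 * Toggle the vCPU's SMM state.  Entering SMM only sets HF_SMM_MASK; leaving
 * SMM also clears the SMM-inside-NMI flag, requests re-evaluation of pending
 * events (a latched INIT or SMI), and forces the PDPTRs to be reloaded from
 * guest memory.  The MMU context is reset in both directions because SMM uses
 * a separate memslot address space and MMU role.
 */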
void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
{
	trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);

	if (entering_smm) {
		vcpu->arch.hflags |= HF_SMM_MASK;
	} else {
		vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);

		/* Process a latched INIT or SMI, if any. */
		kvm_make_request(KVM_REQ_EVENT, vcpu);

		/*
		 * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
		 * on SMM exit we still need to reload them from
		 * guest memory.
		 */
		vcpu->arch.pdptrs_from_userspace = false;
	}

	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_smm_changed);

void process_smi(struct kvm_vcpu *vcpu)
{
	vcpu->arch.smi_pending = true;
	kvm_make_request(KVM_REQ_EVENT, vcpu);
}
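
/*
 * Pack the attribute bits of a struct kvm_segment into the 32-bit "flags"
 * format used by the SMRAM segment images (essentially the attribute bits of
 * the upper dword of a legacy segment descriptor).  rsm_set_desc_flags()
 * below performs the inverse conversion on RSM.
 */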
static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
	u32 flags = 0;

	flags |= seg->g       << 23;
	flags |= seg->db      << 22;
	flags |= seg->l       << 21;
	flags |= seg->avl     << 20;
	flags |= seg->present << 15;
	flags |= seg->dpl     << 13;
	flags |= seg->s       << 12;
	flags |= seg->type    << 8;
	return flags;
}

static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
				  struct kvm_smm_seg_state_32 *state,
				  u32 *selector, int n)
{
	struct kvm_segment seg;

	kvm_get_segment(vcpu, &seg, n);
	*selector = seg.selector;
	state->base = seg.base;
	state->limit = seg.limit;
	state->flags = enter_smm_get_segment_flags(&seg);
}

#ifdef CONFIG_X86_64
static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
				  struct kvm_smm_seg_state_64 *state,
				  int n)
{
	struct kvm_segment seg;

	kvm_get_segment(vcpu, &seg, n);
	state->selector = seg.selector;
	state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
	state->limit = seg.limit;
	state->base = seg.base;
}
#endif

static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
				    struct kvm_smram_state_32 *smram)
{
	struct desc_ptr dt;
	int i;

	smram->cr0 = kvm_read_cr0(vcpu);
	smram->cr3 = kvm_read_cr3(vcpu);
	smram->eflags = kvm_get_rflags(vcpu);
	smram->eip = kvm_rip_read(vcpu);

	for (i = 0; i < 8; i++)
		smram->gprs[i] = kvm_register_read_raw(vcpu, i);

	smram->dr6 = (u32)vcpu->arch.dr6;
	smram->dr7 = (u32)vcpu->arch.dr7;

	enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
	enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);

	kvm_x86_call(get_gdt)(vcpu, &dt);
	smram->gdtr.base = dt.address;
	smram->gdtr.limit = dt.size;

	kvm_x86_call(get_idt)(vcpu, &dt);
	smram->idtr.base = dt.address;
	smram->idtr.limit = dt.size;

	enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
	enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
	enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);

	enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
	enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
	enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);

	smram->cr4 = kvm_read_cr4(vcpu);
	smram->smm_revision = 0x00020000;
	smram->smbase = vcpu->arch.smbase;

	smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}

#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
				    struct kvm_smram_state_64 *smram)
{
	struct desc_ptr dt;
	int i;

	for (i = 0; i < 16; i++)
		smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);

	smram->rip = kvm_rip_read(vcpu);
	smram->rflags = kvm_get_rflags(vcpu);

	smram->dr6 = vcpu->arch.dr6;
	smram->dr7 = vcpu->arch.dr7;

	smram->cr0 = kvm_read_cr0(vcpu);
	smram->cr3 = kvm_read_cr3(vcpu);
	smram->cr4 = kvm_read_cr4(vcpu);

	smram->smbase = vcpu->arch.smbase;
	smram->smm_revison = 0x00020064;

	smram->efer = vcpu->arch.efer;

	enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);

	kvm_x86_call(get_idt)(vcpu, &dt);
	smram->idtr.limit = dt.size;
	smram->idtr.base = dt.address;

	enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);

	kvm_x86_call(get_gdt)(vcpu, &dt);
	smram->gdtr.limit = dt.size;
	smram->gdtr.base = dt.address;

	enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
	enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
	enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
	enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
	enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
	enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);

	smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#endif
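
/*
 * Emulate delivery of an SMI: save the current vCPU state into the SMRAM
 * state-save area at SMBASE + 0xFE00, then switch to the architectural SMM
 * entry state (real-mode-like segments with CS.base = SMBASE, RIP = 0x8000,
 * NMIs blocked, paging and protection disabled).
 */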
void enter_smm(struct kvm_vcpu *vcpu)
{
	struct kvm_segment cs, ds;
	struct desc_ptr dt;
	unsigned long cr0;
	union kvm_smram smram;

	check_smram_offsets();

	memset(smram.bytes, 0, sizeof(smram.bytes));

#ifdef CONFIG_X86_64
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		enter_smm_save_state_64(vcpu, &smram.smram64);
	else
#endif
		enter_smm_save_state_32(vcpu, &smram.smram32);

	/*
	 * Give enter_smm() a chance to make ISA-specific changes to the vCPU
	 * state (e.g. leave guest mode) after we've saved the state into the
	 * SMM state-save area.
	 *
	 * Kill the VM in the unlikely case of failure, because the VM
	 * can be in undefined state in this case.
	 */
	if (kvm_x86_call(enter_smm)(vcpu, &smram))
		goto error;

	kvm_smm_changed(vcpu, true);

	if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
		goto error;

	if (kvm_x86_call(get_nmi_mask)(vcpu))
		vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
	else
		kvm_x86_call(set_nmi_mask)(vcpu, true);

	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
	kvm_rip_write(vcpu, 0x8000);

	kvm_x86_call(set_interrupt_shadow)(vcpu, 0);

	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
	kvm_x86_call(set_cr0)(vcpu, cr0);

	kvm_x86_call(set_cr4)(vcpu, 0);

	/* Undocumented: IDT limit is set to zero on entry to SMM. */
	dt.address = dt.size = 0;
	kvm_x86_call(set_idt)(vcpu, &dt);

	if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
		goto error;

	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
	cs.base = vcpu->arch.smbase;

	ds.selector = 0;
	ds.base = 0;

	cs.limit    = ds.limit = 0xffffffff;
	cs.type     = ds.type = 0x3;
	cs.dpl      = ds.dpl = 0;
	cs.db       = ds.db = 0;
	cs.s        = ds.s = 1;
	cs.l        = ds.l = 0;
	cs.g        = ds.g = 1;
	cs.avl      = ds.avl = 0;
	cs.present  = ds.present = 1;
	cs.unusable = ds.unusable = 0;
	cs.padding  = ds.padding = 0;

	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);

#ifdef CONFIG_X86_64
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		if (kvm_x86_call(set_efer)(vcpu, 0))
			goto error;
#endif

	vcpu->arch.cpuid_dynamic_bits_dirty = true;
	kvm_mmu_reset_context(vcpu);
	return;

error:
	kvm_vm_dead(vcpu->kvm);
}
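
/* Inverse of enter_smm_get_segment_flags(), used when loading segments on RSM. */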
static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
{
	desc->g    = (flags >> 23) & 1;
	desc->db   = (flags >> 22) & 1;
	desc->l    = (flags >> 21) & 1;
	desc->avl  = (flags >> 20) & 1;
	desc->present = (flags >> 15) & 1;
	desc->dpl  = (flags >> 13) & 3;
	desc->s    = (flags >> 12) & 1;
	desc->type = (flags >> 8) & 15;

	desc->unusable = !desc->present;
	desc->padding = 0;
}

static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
			   const struct kvm_smm_seg_state_32 *state,
			   u16 selector, int n)
{
	struct kvm_segment desc;

	desc.selector = selector;
	desc.base = state->base;
	desc.limit = state->limit;
	rsm_set_desc_flags(&desc, state->flags);
	kvm_set_segment(vcpu, &desc, n);
	return X86EMUL_CONTINUE;
}

#ifdef CONFIG_X86_64
static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
			   const struct kvm_smm_seg_state_64 *state,
			   int n)
{
	struct kvm_segment desc;

	desc.selector = state->selector;
	rsm_set_desc_flags(&desc, state->attributes << 8);
	desc.limit = state->limit;
	desc.base = state->base;
	kvm_set_segment(vcpu, &desc, n);
	return X86EMUL_CONTINUE;
}
#endif
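
/*
 * Reload CR0/CR3/CR4 from SMRAM in an order the architecture accepts; see the
 * comment below for why PCIDE and the PCID value are restored last.
 */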
static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
				    u64 cr0, u64 cr3, u64 cr4)
{
	int bad;
	u64 pcid;

	/* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
	pcid = 0;
	if (cr4 & X86_CR4_PCIDE) {
		pcid = cr3 & 0xfff;
		cr3 &= ~0xfff;
	}

	bad = kvm_set_cr3(vcpu, cr3);
	if (bad)
		return X86EMUL_UNHANDLEABLE;

	/*
	 * First enable PAE, long mode needs it before CR0.PG = 1 is set.
	 * Then enable protected mode.  However, PCID cannot be enabled
	 * if EFER.LMA=0, so set it separately.
	 */
	bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
	if (bad)
		return X86EMUL_UNHANDLEABLE;

	bad = kvm_set_cr0(vcpu, cr0);
	if (bad)
		return X86EMUL_UNHANDLEABLE;

	if (cr4 & X86_CR4_PCIDE) {
		bad = kvm_set_cr4(vcpu, cr4);
		if (bad)
			return X86EMUL_UNHANDLEABLE;
		if (pcid) {
			bad = kvm_set_cr3(vcpu, cr3 | pcid);
			if (bad)
				return X86EMUL_UNHANDLEABLE;
		}
	}

	return X86EMUL_CONTINUE;
}
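
/*
 * Reload the vCPU from the 32-bit SMRAM image.  Returns X86EMUL_UNHANDLEABLE
 * if the saved state cannot be loaded (see the shutdown handling at the end
 * of emulator_leave_smm()).
 */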
static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
			     const struct kvm_smram_state_32 *smstate)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;
	struct desc_ptr dt;
	int i, r;

	ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
	ctxt->_eip = smstate->eip;

	for (i = 0; i < 8; i++)
		*reg_write(ctxt, i) = smstate->gprs[i];

	if (kvm_set_dr(vcpu, 6, smstate->dr6))
		return X86EMUL_UNHANDLEABLE;
	if (kvm_set_dr(vcpu, 7, smstate->dr7))
		return X86EMUL_UNHANDLEABLE;

	rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
	rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);

	dt.address = smstate->gdtr.base;
	dt.size = smstate->gdtr.limit;
	kvm_x86_call(set_gdt)(vcpu, &dt);

	dt.address = smstate->idtr.base;
	dt.size = smstate->idtr.limit;
	kvm_x86_call(set_idt)(vcpu, &dt);

	rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
	rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
	rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);

	rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
	rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
	rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);

	vcpu->arch.smbase = smstate->smbase;

	r = rsm_enter_protected_mode(vcpu, smstate->cr0,
				     smstate->cr3, smstate->cr4);
	if (r != X86EMUL_CONTINUE)
		return r;

	kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
	ctxt->interruptibility = (u8)smstate->int_shadow;

	return r;
}

#ifdef CONFIG_X86_64
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
			     const struct kvm_smram_state_64 *smstate)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;
	struct desc_ptr dt;
	int i, r;

	for (i = 0; i < 16; i++)
		*reg_write(ctxt, i) = smstate->gprs[15 - i];

	ctxt->_eip = smstate->rip;
	ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;

	if (kvm_set_dr(vcpu, 6, smstate->dr6))
		return X86EMUL_UNHANDLEABLE;
	if (kvm_set_dr(vcpu, 7, smstate->dr7))
		return X86EMUL_UNHANDLEABLE;

	vcpu->arch.smbase = smstate->smbase;

	if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
		return X86EMUL_UNHANDLEABLE;

	rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);

	dt.size = smstate->idtr.limit;
	dt.address = smstate->idtr.base;
	kvm_x86_call(set_idt)(vcpu, &dt);

	rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);

	dt.size = smstate->gdtr.limit;
	dt.address = smstate->gdtr.base;
	kvm_x86_call(set_gdt)(vcpu, &dt);

	r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
	if (r != X86EMUL_CONTINUE)
		return r;

	rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
	rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
	rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
	rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
	rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
	rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);

	kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
	ctxt->interruptibility = (u8)smstate->int_shadow;

	return X86EMUL_CONTINUE;
}
#endif
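
/*
 * Emulate RSM: read the state-save area from guest SMRAM, drop NMI blocking
 * unless the SMI arrived inside an NMI handler, transition the vCPU to a
 * state from which CR0/CR3/CR4/EFER can be reloaded safely, let vendor code
 * restore any nested-virtualization state, and finally load the saved 32-bit
 * or 64-bit image.
 */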
int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;
	unsigned long cr0;
	union kvm_smram smram;
	u64 smbase;
	int ret;

	smbase = vcpu->arch.smbase;

	ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
	if (ret < 0)
		return X86EMUL_UNHANDLEABLE;

	if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
		kvm_x86_call(set_nmi_mask)(vcpu, false);

	kvm_smm_changed(vcpu, false);

	/*
	 * Get back to real mode, to prepare a safe state in which to load
	 * CR0/CR3/CR4/EFER.  It's all a bit more complicated if the vCPU
	 * supports long mode.
	 */
#ifdef CONFIG_X86_64
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
		struct kvm_segment cs_desc;
		unsigned long cr4;

		/* Zero CR4.PCIDE before CR0.PG. */
		cr4 = kvm_read_cr4(vcpu);
		if (cr4 & X86_CR4_PCIDE)
			kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);

		/* A 32-bit code segment is required to clear EFER.LMA. */
		memset(&cs_desc, 0, sizeof(cs_desc));
		cs_desc.type = 0xb;
		cs_desc.s = cs_desc.g = cs_desc.present = 1;
		kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
	}
#endif

	/* For the 64-bit case, this will clear EFER.LMA. */
	cr0 = kvm_read_cr0(vcpu);
	if (cr0 & X86_CR0_PE)
		kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));

#ifdef CONFIG_X86_64
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
		unsigned long cr4, efer;

		/* Clear CR4.PAE before clearing EFER.LME. */
		cr4 = kvm_read_cr4(vcpu);
		if (cr4 & X86_CR4_PAE)
			kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);

		/* And finally go back to 32-bit mode. */
		efer = 0;
		kvm_set_msr(vcpu, MSR_EFER, efer);
	}
#endif

	/*
	 * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
	 * mode should happen _after_ loading state from SMRAM.  However, KVM
	 * piggybacks the nested VM-Enter flows (which is wrong for many other
	 * reasons), and so nSVM/nVMX would clobber state that is loaded from
	 * SMRAM and from the VMCS/VMCB.
	 */
	if (kvm_x86_call(leave_smm)(vcpu, &smram))
		return X86EMUL_UNHANDLEABLE;

#ifdef CONFIG_X86_64
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		ret = rsm_load_state_64(ctxt, &smram.smram64);
	else
#endif
		ret = rsm_load_state_32(ctxt, &smram.smram32);

	/*
	 * If RSM fails and triggers shutdown, architecturally the shutdown
	 * occurs *before* the transition to guest mode.  But due to KVM's
	 * flawed handling of RSM to L2 (see above), the vCPU may already be
	 * in_guest_mode().  Force the vCPU out of guest mode before delivering
	 * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
	 * that architecturally shouldn't be possible.
	 */
	if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
		kvm_leave_nested(vcpu);

	return ret;
}