Merge tag 'kvm-x86-svm-6.16' of https://github.com/kvm-x86/linux into HEAD

KVM SVM changes for 6.16:

 - Wait for target vCPU to acknowledge KVM_REQ_UPDATE_PROTECTED_GUEST_STATE to
   fix a race between AP destroy and VMRUN.

 - Decrypt and dump the VMSA in dump_vmcb() if debugging is enabled for the VM.

 - Add support for ALLOWED_SEV_FEATURES.

 - Add #VMGEXIT to the set of handlers special cased for CONFIG_RETPOLINE=y.

 - Treat DEBUGCTL[5:2] as reserved to pave the way for virtualizing features
   that utilize those bits.

 - Don't account temporary allocations in sev_send_update_data().

 - Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM, via Bus Lock Threshold.
Commit 4e02d4f973 by Paolo Bonzini, 2025-05-27 12:15:49 -04:00
14 changed files with 469 additions and 31 deletions

@ -8001,6 +8001,11 @@ apply some other policy-based mitigation. When exiting to userspace, KVM sets
KVM_RUN_X86_BUS_LOCK in vcpu->run->flags, and conditionally sets the exit_reason
to KVM_EXIT_X86_BUS_LOCK.
Due to differences in the underlying hardware implementation, the vCPU's RIP at
the time of exit diverges between Intel and AMD. On Intel hosts, RIP points at
the next instruction, i.e. the exit is trap-like. On AMD hosts, RIP points at
the offending instruction, i.e. the exit is fault-like.
Note! Detected bus locks may be coincident with other exits to userspace, i.e.
KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if
userspace wants to take action on all detected bus locks.
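As a concrete illustration of the guidance above, here is a minimal userspace sketch (not part of this series): it opts the VM in to bus lock exits and then checks KVM_RUN_X86_BUS_LOCK on every exit rather than only on KVM_EXIT_X86_BUS_LOCK. The vm_fd/vcpu_fd descriptors, the mmap'd kvm_run structure, and the helper names enable_bus_lock_exits()/run_vcpu() are assumptions made for the sketch, not code from this merge.

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: opt an existing VM fd in to bus lock exits. */
static void enable_bus_lock_exits(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_BUS_LOCK_EXIT,
		.args[0] = KVM_BUS_LOCK_DETECTION_EXIT,
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP(KVM_CAP_X86_BUS_LOCK_EXIT)");
}

/* Sketch: check the flag on every exit, per the note above. */
static void run_vcpu(int vcpu_fd, struct kvm_run *run, unsigned long *nr_bus_locks)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;

		/*
		 * A detected bus lock may be coincident with any exit reason.
		 * On AMD, RIP still points at the offending instruction
		 * (fault-like); on Intel, at the next one (trap-like).
		 */
		if (run->flags & KVM_RUN_X86_BUS_LOCK)
			(*nr_bus_locks)++;

		if (run->exit_reason == KVM_EXIT_X86_BUS_LOCK)
			continue;	/* apply rate limiting or other policy here */

		/* ... dispatch the remaining exit reasons as usual ... */
	}
}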


@ -379,6 +379,7 @@
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */
#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@ -447,6 +448,7 @@
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */


@ -125,7 +125,8 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \
KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@ -411,7 +412,6 @@ struct kvm_rmap_head {
};
struct kvm_pio_request {
unsigned long linear_rip;
unsigned long count;
int in;
int port;
@ -917,6 +917,7 @@ struct kvm_vcpu_arch {
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
unsigned long cui_linear_rip;
gpa_t time;
s8 pvclock_tsc_shift;


@ -116,6 +116,7 @@ enum {
INTERCEPT_INVPCID,
INTERCEPT_MCOMMIT,
INTERCEPT_TLBSYNC,
INTERCEPT_BUSLOCK,
INTERCEPT_IDLE_HLT = 166,
};
@ -159,7 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
u8 reserved_8[720];
u8 reserved_8[16];
u16 bus_lock_counter; /* Offset 0x120 */
u8 reserved_9[22];
u64 allowed_sev_features; /* Offset 0x138 */
u64 guest_sev_features; /* Offset 0x140 */
u8 reserved_10[664];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@ -291,6 +297,8 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63)
struct vmcb_seg {
u16 selector;
u16 attrib;
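As a quick cross-check of the control-area hunk above, the following standalone snippet (purely illustrative, not kernel code; the struct and macro names are invented) mirrors the tail of the structure and asserts that carving up the old reserved_8[720] region still places the new fields at the commented offsets and keeps the hypervisor scratch area at offset 0x3e0.

#include <stddef.h>
#include <stdint.h>

/* Mirror of the tail of vmcb_control_area, starting at VMCB offset 0xf8. */
struct vmcb_control_tail {
	uint64_t avic_physical_id;	/* VMCB offset 0xf8 */
	uint8_t  reserved_7[8];
	uint64_t vmsa_pa;		/* 0x108 */
	uint8_t  reserved_8[16];
	uint16_t bus_lock_counter;	/* 0x120 */
	uint8_t  reserved_9[22];
	uint64_t allowed_sev_features;	/* 0x138 */
	uint64_t guest_sev_features;	/* 0x140 */
	uint8_t  reserved_10[664];	/* pads back out to 0x3e0 */
} __attribute__((packed));

#define VMCB_OFF(field)	(0xf8 + offsetof(struct vmcb_control_tail, field))

_Static_assert(VMCB_OFF(bus_lock_counter) == 0x120, "bus_lock_counter offset");
_Static_assert(VMCB_OFF(allowed_sev_features) == 0x138, "allowed_sev_features offset");
_Static_assert(VMCB_OFF(guest_sev_features) == 0x140, "guest_sev_features offset");
_Static_assert(VMCB_OFF(reserved_10) == 0x148 && 0x148 + 664 == 0x3e0,
	       "hypervisor scratch area still starts at 0x3e0");

In other words, the old 720-byte reserved_8 region is split as 16 + 2 + 22 + 8 + 8 + 664 bytes, which is why the scratch area at 0x3e0 is untouched.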


@ -95,6 +95,7 @@
#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
#define SVM_EXIT_BUS_LOCK 0x0a5
#define SVM_EXIT_IDLE_HLT 0x0a6
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
@ -225,6 +226,7 @@
{ SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
{ SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
{ SVM_EXIT_BUS_LOCK, "buslock" }, \
{ SVM_EXIT_IDLE_HLT, "idle-halt" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \


@ -678,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
* instruction; otherwise, reset the counter to '0'.
*
* To detect whether L2 has made forward progress, track the RIP at
* which a bus lock occurred on a per-vmcb12 basis. If RIP has changed,
* the guest has clearly made forward progress, so reset bus_lock_counter
* to '0' even if it is still '1'. E.g. if a bus lock happened in L1
* before VMRUN, the bus lock firmly happened on an instruction in the
* past; even if vmcb01's counter is still '1' (because the guilty
* instruction got patched), the vCPU has clearly made forward progress
* and so KVM should reset vmcb02's counter to '0'.
*
* If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
* to prevent the same guilty instruction from triggering another
* VM-Exit. E.g. if userspace rate-limits the vCPU, it's entirely
* possible that L1's tick interrupt is pending by the time userspace
* re-runs the vCPU. If KVM unconditionally cleared the counter on
* VMRUN, then when L1 re-enters L2, the same instruction would trigger
* a VM-Exit and the entire cycle would start over.
*/
if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
vmcb02->control.bus_lock_counter = 1;
else
vmcb02->control.bus_lock_counter = 0;
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
@ -1039,6 +1066,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
/*
* Invalidate bus_lock_rip unless KVM is still waiting for the guest
* to make forward progress before re-enabling bus lock detection.
*/
if (!vmcb02->control.bus_lock_counter)
svm->nested.ctl.bus_lock_rip = INVALID_GPA;
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
kvm_nested_vmexit_handle_ibrs(vcpu);


@ -560,6 +560,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
sev->policy = params.policy;
memset(&start, 0, sizeof(start));
dh_blob = NULL;
@ -1592,11 +1594,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* allocate memory for header and transport buffer */
ret = -ENOMEM;
hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
hdr = kzalloc(params.hdr_len, GFP_KERNEL);
if (!hdr)
goto e_unpin;
trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
trans_data = kzalloc(params.trans_len, GFP_KERNEL);
if (!trans_data)
goto e_free_hdr;
@ -2199,6 +2201,8 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
return -EINVAL;
sev->policy = params.policy;
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
return -ENOTTY;
@ -3994,10 +3998,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
* Unless Creation is deferred until INIT, signal the vCPU to update
* its state.
*/
if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) {
kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
kvm_vcpu_kick(target_vcpu);
}
if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
return 0;
}
@ -4455,6 +4457,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
@ -4470,6 +4473,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
VMCB_ALLOWED_SEV_FEATURES_VALID;
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
svm_clr_intercept(svm, INTERCEPT_CR4_READ);
@ -4930,3 +4937,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return level;
}
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_save_area *vmsa;
struct kvm_sev_info *sev;
int error = 0;
int ret;
if (!sev_es_guest(vcpu->kvm))
return NULL;
/*
* If the VMSA has not yet been encrypted, return a pointer to the
* current un-encrypted VMSA.
*/
if (!vcpu->arch.guest_state_protected)
return (struct vmcb_save_area *)svm->sev_es.vmsa;
sev = to_kvm_sev_info(vcpu->kvm);
/* Check if the SEV policy allows debugging */
if (sev_snp_guest(vcpu->kvm)) {
if (!(sev->policy & SNP_POLICY_DEBUG))
return NULL;
} else {
if (sev->policy & SEV_POLICY_NODBG)
return NULL;
}
if (sev_snp_guest(vcpu->kvm)) {
struct sev_data_snp_dbg dbg = {0};
vmsa = snp_alloc_firmware_page(__GFP_ZERO);
if (!vmsa)
return NULL;
dbg.gctx_paddr = __psp_pa(sev->snp_context);
dbg.src_addr = svm->vmcb->control.vmsa_pa;
dbg.dst_addr = __psp_pa(vmsa);
ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
/*
* Return the target page to a hypervisor page no matter what.
* If this fails, the page can't be used, so leak it and don't
* try to use it.
*/
if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
return NULL;
if (ret) {
pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
ret, error, error);
free_page((unsigned long)vmsa);
return NULL;
}
} else {
struct sev_data_dbg dbg = {0};
struct page *vmsa_page;
vmsa_page = alloc_page(GFP_KERNEL);
if (!vmsa_page)
return NULL;
vmsa = page_address(vmsa_page);
dbg.handle = sev->handle;
dbg.src_addr = svm->vmcb->control.vmsa_pa;
dbg.dst_addr = __psp_pa(vmsa);
dbg.len = PAGE_SIZE;
ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
if (ret) {
pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
ret, error, error);
__free_page(vmsa_page);
return NULL;
}
}
return vmsa;
}
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
{
/* If the VMSA has not yet been encrypted, nothing was allocated */
if (!vcpu->arch.guest_state_protected || !vmsa)
return;
free_page((unsigned long)vmsa);
}


@ -29,6 +29,7 @@
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>
#include <linux/mutex.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@ -251,6 +252,8 @@ static unsigned long iopm_base;
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
static DEFINE_MUTEX(vmcb_dump_mutex);
/*
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
@ -1377,6 +1380,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
if (vcpu->kvm->arch.bus_lock_detection_enabled)
svm_set_intercept(svm, INTERCEPT_BUSLOCK);
if (sev_guest(vcpu->kvm))
sev_init_vmcb(svm);
@ -3210,17 +3216,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
}
/*
* AMD changed the architectural behavior of bits 5:2. On CPUs
* without BusLockTrap, bits 5:2 control "external pins", but
* on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
* and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
* the guest to set bits 5:2 despite not actually virtualizing
* Performance-Monitoring/Breakpoint external pins. Drop bits
* 5:2 for backwards compatibility.
*/
data &= ~GENMASK(5, 2);
/*
* Suppress BTF as KVM doesn't virtualize BTF, but there's no
* way to communicate lack of support to the guest.
@ -3351,6 +3346,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
return kvm_handle_invpcid(vcpu, type, gva);
}
static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
/*
* If userspace has NOT changed RIP, then KVM's ABI is to let the guest
* execute the bus-locking instruction. Set the bus lock counter to '1'
* to effectively step past the bus lock.
*/
if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
svm->vmcb->control.bus_lock_counter = 1;
return 1;
}
static int bus_lock_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_userspace_buslock;
if (is_guest_mode(vcpu))
svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
return 0;
}
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@ -3420,6 +3446,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_BUS_LOCK] = bus_lock_exit,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@ -3434,14 +3461,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
char *vm_type;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
guard(mutex)(&vmcb_dump_mutex);
vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
sev_es_guest(vcpu->kvm) ? "SEV-ES" :
sev_guest(vcpu->kvm) ? "SEV" : "SVM";
pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@ -3479,6 +3513,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
if (sev_es_guest(vcpu->kvm)) {
save = sev_decrypt_vmsa(vcpu);
if (!save)
goto no_vmsa;
save01 = save;
}
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@ -3549,6 +3594,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"excp_from:", save->last_excp_from,
"excp_to:", save->last_excp_to);
if (sev_es_guest(vcpu->kvm)) {
struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
pr_err("%-15s %016llx\n",
"sev_features", vmsa->sev_features);
pr_err("%-15s %016llx %-13s %016llx\n",
"rax:", vmsa->rax, "rbx:", vmsa->rbx);
pr_err("%-15s %016llx %-13s %016llx\n",
"rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
pr_err("%-15s %016llx %-13s %016llx\n",
"rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
pr_err("%-15s %016llx %-13s %016llx\n",
"rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
pr_err("%-15s %016llx %-13s %016llx\n",
"r8:", vmsa->r8, "r9:", vmsa->r9);
pr_err("%-15s %016llx %-13s %016llx\n",
"r10:", vmsa->r10, "r11:", vmsa->r11);
pr_err("%-15s %016llx %-13s %016llx\n",
"r12:", vmsa->r12, "r13:", vmsa->r13);
pr_err("%-15s %016llx %-13s %016llx\n",
"r14:", vmsa->r14, "r15:", vmsa->r15);
pr_err("%-15s %016llx %-13s %016llx\n",
"xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
} else {
pr_err("%-15s %016llx %-13s %016lx\n",
"rax:", save->rax, "rbx:",
vcpu->arch.regs[VCPU_REGS_RBX]);
pr_err("%-15s %016lx %-13s %016lx\n",
"rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
"rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
pr_err("%-15s %016lx %-13s %016lx\n",
"rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
"rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
pr_err("%-15s %016lx %-13s %016llx\n",
"rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
"rsp:", save->rsp);
#ifdef CONFIG_X86_64
pr_err("%-15s %016lx %-13s %016lx\n",
"r8:", vcpu->arch.regs[VCPU_REGS_R8],
"r9:", vcpu->arch.regs[VCPU_REGS_R9]);
pr_err("%-15s %016lx %-13s %016lx\n",
"r10:", vcpu->arch.regs[VCPU_REGS_R10],
"r11:", vcpu->arch.regs[VCPU_REGS_R11]);
pr_err("%-15s %016lx %-13s %016lx\n",
"r12:", vcpu->arch.regs[VCPU_REGS_R12],
"r13:", vcpu->arch.regs[VCPU_REGS_R13]);
pr_err("%-15s %016lx %-13s %016lx\n",
"r14:", vcpu->arch.regs[VCPU_REGS_R14],
"r15:", vcpu->arch.regs[VCPU_REGS_R15]);
#endif
}
no_vmsa:
if (sev_es_guest(vcpu->kvm))
sev_free_decrypted_vmsa(vcpu, save);
}
static bool svm_check_exit_valid(u64 exit_code)
@ -3585,6 +3687,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
return npf_interception(vcpu);
#ifdef CONFIG_KVM_AMD_SEV
else if (exit_code == SVM_EXIT_VMGEXIT)
return sev_handle_vmgexit(vcpu);
#endif
#endif
return svm_exit_handlers[exit_code](vcpu);
}
@ -5346,6 +5452,9 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
kvm_caps.has_bus_lock_exit = true;
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))


@ -98,6 +98,7 @@ struct kvm_sev_info {
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long policy;
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
@ -114,6 +115,9 @@ struct kvm_sev_info {
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
};
#define SEV_POLICY_NODBG BIT_ULL(0)
#define SNP_POLICY_DEBUG BIT_ULL(19)
struct kvm_svm {
struct kvm kvm;
@ -169,6 +173,7 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
@ -783,6 +788,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
@ -814,6 +821,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return 0;
}
static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
{
return NULL;
}
static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
#endif
/* vmenter.S */


@ -9382,7 +9382,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
return 1;
return kvm_skip_emulated_instruction(vcpu);
@ -9407,7 +9407,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
@ -9420,7 +9420,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
vcpu->arch.pio.count = 0;
return 1;
}
@ -9449,7 +9449,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;


@ -1505,7 +1505,16 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
#ifndef CONFIG_S390
void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait);
static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
__kvm_vcpu_kick(vcpu, false);
}
#endif
int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
@ -2253,6 +2262,14 @@ static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
__kvm_make_request(req, vcpu);
}
#ifndef CONFIG_S390
static inline void kvm_make_request_and_kick(int req, struct kvm_vcpu *vcpu)
{
kvm_make_request(req, vcpu);
__kvm_vcpu_kick(vcpu, req & KVM_REQUEST_WAIT);
}
#endif
static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
{
return READ_ONCE(vcpu->requests);


@ -84,6 +84,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_svm_test
TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush
TEST_GEN_PROGS_x86 += x86/kvm_clock_test
TEST_GEN_PROGS_x86 += x86/kvm_pv_test
TEST_GEN_PROGS_x86 += x86/kvm_buslock_test
TEST_GEN_PROGS_x86 += x86/monitor_mwait_test
TEST_GEN_PROGS_x86 += x86/nested_emulation_test
TEST_GEN_PROGS_x86 += x86/nested_exceptions_test


@ -0,0 +1,135 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2024 Advanced Micro Devices, Inc.
*/
#include <linux/atomic.h>
#include "kvm_util.h"
#include "processor.h"
#include "svm_util.h"
#include "vmx.h"
#include "test_util.h"
#define NR_BUS_LOCKS_PER_LEVEL 100
#define CACHE_LINE_SIZE 64
/*
* To generate a bus lock, carve out a buffer that precisely occupies two cache
* lines and perform an atomic access that splits the two lines.
*/
static u8 buffer[CACHE_LINE_SIZE * 2] __aligned(CACHE_LINE_SIZE);
static atomic_t *val = (void *)&buffer[CACHE_LINE_SIZE - (sizeof(*val) / 2)];
static void guest_generate_buslocks(void)
{
for (int i = 0; i < NR_BUS_LOCKS_PER_LEVEL; i++)
atomic_inc(val);
}
#define L2_GUEST_STACK_SIZE 64
static void l2_guest_code(void)
{
guest_generate_buslocks();
GUEST_DONE();
}
static void l1_svm_code(struct svm_test_data *svm)
{
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
run_guest(vmcb, svm->vmcb_gpa);
}
static void l1_vmx_code(struct vmx_pages *vmx)
{
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true);
GUEST_ASSERT_EQ(load_vmcs(vmx), true);
prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code));
GUEST_ASSERT(!vmlaunch());
}
static void guest_code(void *test_data)
{
guest_generate_buslocks();
if (this_cpu_has(X86_FEATURE_SVM))
l1_svm_code(test_data);
else if (this_cpu_has(X86_FEATURE_VMX))
l1_vmx_code(test_data);
else
GUEST_DONE();
TEST_FAIL("L2 should have signaled 'done'");
}
int main(int argc, char *argv[])
{
const bool has_nested = kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX);
vm_vaddr_t nested_test_data_gva;
struct kvm_vcpu *vcpu;
struct kvm_run *run;
struct kvm_vm *vm;
int i, bus_locks = 0;
TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_BUS_LOCK_EXIT));
vm = vm_create(1);
vm_enable_cap(vm, KVM_CAP_X86_BUS_LOCK_EXIT, KVM_BUS_LOCK_DETECTION_EXIT);
vcpu = vm_vcpu_add(vm, 0, guest_code);
if (kvm_cpu_has(X86_FEATURE_SVM))
vcpu_alloc_svm(vm, &nested_test_data_gva);
else
vcpu_alloc_vmx(vm, &nested_test_data_gva);
vcpu_args_set(vcpu, 1, nested_test_data_gva);
run = vcpu->run;
for (i = 0; i <= NR_BUS_LOCKS_PER_LEVEL * (1 + has_nested); i++) {
struct ucall uc;
vcpu_run(vcpu);
if (run->exit_reason == KVM_EXIT_IO) {
switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
REPORT_GUEST_ASSERT(uc);
goto done;
case UCALL_SYNC:
continue;
case UCALL_DONE:
goto done;
default:
TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
}
}
TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_BUS_LOCK);
/*
* Verify the counter is actually getting incremented, e.g. that
* KVM isn't skipping the instruction. On Intel, the exit is
* trap-like, i.e. the counter should already have been
* incremented. On AMD, it's fault-like, i.e. the counter will
* be incremented when the guest re-executes the instruction.
*/
sync_global_from_guest(vm, *val);
TEST_ASSERT_EQ(atomic_read(val), bus_locks + host_cpu_is_intel);
bus_locks++;
}
TEST_FAIL("Didn't receive UCALL_DONE, took %u bus lock exits\n", bus_locks);
done:
TEST_ASSERT_EQ(i, bus_locks);
kvm_vm_free(vm);
return 0;
}


@ -3739,7 +3739,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
/*
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
*/
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait)
{
int me, cpu;
@ -3768,13 +3768,24 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
*/
if (kvm_arch_vcpu_should_kick(vcpu)) {
cpu = READ_ONCE(vcpu->cpu);
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
smp_send_reschedule(cpu);
if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) {
/*
* Use a reschedule IPI to kick the vCPU if the caller
* doesn't need to wait for a response, as KVM allows
* kicking vCPUs while IRQs are disabled, but using the
* SMP function call framework with IRQs disabled can
* deadlock due to taking cross-CPU locks.
*/
if (wait)
smp_call_function_single(cpu, ack_kick, NULL, wait);
else
smp_send_reschedule(cpu);
}
}
out:
put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
EXPORT_SYMBOL_GPL(__kvm_vcpu_kick);
#endif /* !CONFIG_S390 */
int kvm_vcpu_yield_to(struct kvm_vcpu *target)