linux/arch/x86/kvm/vmx/capabilities.h

402 lines
9.2 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_CAPS_H
#define __KVM_X86_VMX_CAPS_H
#include <asm/vmx.h>
#include "../lapic.h"
#include "../x86.h"
Merge remote-tracking branch 'kvm/next' into kvm-next-5.20 KVM/s390, KVM/x86 and common infrastructure changes for 5.20 x86: * Permit guests to ignore single-bit ECC errors * Fix races in gfn->pfn cache refresh; do not pin pages tracked by the cache * Intel IPI virtualization * Allow getting/setting pending triple fault with KVM_GET/SET_VCPU_EVENTS * PEBS virtualization * Simplify PMU emulation by just using PERF_TYPE_RAW events * More accurate event reinjection on SVM (avoid retrying instructions) * Allow getting/setting the state of the speaker port data bit * Refuse starting the kvm-intel module if VM-Entry/VM-Exit controls are inconsistent * "Notify" VM exit (detect microarchitectural hangs) for Intel * Cleanups for MCE MSR emulation s390: * add an interface to provide a hypervisor dump for secure guests * improve selftests to use TAP interface * enable interpretive execution of zPCI instructions (for PCI passthrough) * First part of deferred teardown * CPU Topology * PV attestation * Minor fixes Generic: * new selftests API using struct kvm_vcpu instead of a (vm, id) tuple x86: * Use try_cmpxchg64 instead of cmpxchg64 * Bugfixes * Ignore benign host accesses to PMU MSRs when PMU is disabled * Allow disabling KVM's "MONITOR/MWAIT are NOPs!" behavior * x86/MMU: Allow NX huge pages to be disabled on a per-vm basis * Port eager page splitting to shadow MMU as well * Enable CMCI capability by default and handle injected UCNA errors * Expose pid of vcpu threads in debugfs * x2AVIC support for AMD * cleanup PIO emulation * Fixes for LLDT/LTR emulation * Don't require refcounted "struct page" to create huge SPTEs x86 cleanups: * Use separate namespaces for guest PTEs and shadow PTEs bitmasks * PIO emulation * Reorganize rmap API, mostly around rmap destruction * Do not workaround very old KVM bugs for L0 that runs with nesting enabled * new selftests API for CPUID
2022-07-29 09:46:01 -04:00
#include "../pmu.h"
#include "../cpuid.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
#define PT_MODE_HOST_GUEST 1
#define PMU_CAP_FW_WRITES (1ULL << 13)
#define PMU_CAP_LBR_FMT 0x3f
struct nested_vmx_msrs {
/*
* We only store the "true" versions of the VMX capability MSRs. We
* generate the "non-true" versions by setting the must-be-1 bits
* according to the SDM.
*/
u32 procbased_ctls_low;
u32 procbased_ctls_high;
u32 secondary_ctls_low;
u32 secondary_ctls_high;
u32 pinbased_ctls_low;
u32 pinbased_ctls_high;
u32 exit_ctls_low;
u32 exit_ctls_high;
u32 entry_ctls_low;
u32 entry_ctls_high;
u32 misc_low;
u32 misc_high;
u32 ept_caps;
u32 vpid_caps;
u64 basic;
u64 cr0_fixed0;
u64 cr0_fixed1;
u64 cr4_fixed0;
u64 cr4_fixed1;
u64 vmcs_enum;
u64 vmfunc_controls;
};
struct vmcs_config {
u64 basic;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
u64 cpu_based_3rd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
u64 misc;
struct nested_vmx_msrs nested;
};
extern struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability {
u32 ept;
u32 vpid;
};
extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
return vmcs_config.basic & VMX_BASIC_INOUT;
}
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING;
}
static inline bool cpu_has_vmx_preemption_timer(void)
{
return vmcs_config.pin_based_exec_ctrl &
PIN_BASED_VMX_PREEMPTION_TIMER;
}
static inline bool cpu_has_vmx_posted_intr(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_load_ia32_efer(void)
{
return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER;
}
static inline bool cpu_has_load_perf_global_ctrl(void)
{
return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
}
static inline bool cpu_has_vmx_mpx(void)
{
return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
}
static inline bool cpu_has_vmx_tpr_shadow(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
{
return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_vmx_msr_bitmap(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
static inline bool cpu_has_secondary_exec_ctrls(void)
{
return vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}
static inline bool cpu_has_tertiary_exec_ctrls(void)
{
return vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
}
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_EPT;
}
static inline bool vmx_umip_emulated(void)
{
return !boot_cpu_has(X86_FEATURE_UMIP) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC);
}
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_RDTSCP;
}
static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
}
static inline bool cpu_has_vmx_vpid(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID;
}
static inline bool cpu_has_vmx_wbinvd_exit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_WBINVD_EXITING;
}
static inline bool cpu_has_vmx_unrestricted_guest(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
static inline bool cpu_has_vmx_apic_register_virt(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_APIC_REGISTER_VIRT;
}
static inline bool cpu_has_vmx_virtual_intr_delivery(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}
static inline bool cpu_has_vmx_ple(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
static inline bool cpu_has_vmx_rdrand(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDRAND_EXITING;
}
static inline bool cpu_has_vmx_invpcid(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_INVPCID;
}
static inline bool cpu_has_vmx_vmfunc(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VMFUNC;
}
static inline bool cpu_has_vmx_shadow_vmcs(void)
{
/* check if the cpu supports writing r/o exit information fields */
if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
return false;
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_SHADOW_VMCS;
}
static inline bool cpu_has_vmx_encls_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENCLS_EXITING;
}
static inline bool cpu_has_vmx_rdseed(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDSEED_EXITING;
}
static inline bool cpu_has_vmx_pml(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_XSAVES;
}
static inline bool cpu_has_vmx_waitpkg(void)
KVM: x86: Add support for user wait instructions UMONITOR, UMWAIT and TPAUSE are a set of user wait instructions. This patch adds support for user wait instructions in KVM. Availability of the user wait instructions is indicated by the presence of the CPUID feature flag WAITPKG CPUID.0x07.0x0:ECX[5]. User wait instructions may be executed at any privilege level, and use 32bit IA32_UMWAIT_CONTROL MSR to set the maximum time. The behavior of user wait instructions in VMX non-root operation is determined first by the setting of the "enable user wait and pause" secondary processor-based VM-execution control bit 26. If the VM-execution control is 0, UMONITOR/UMWAIT/TPAUSE cause an invalid-opcode exception (#UD). If the VM-execution control is 1, treatment is based on the setting of the “RDTSC exiting†VM-execution control. Because KVM never enables RDTSC exiting, if the instruction causes a delay, the amount of time delayed is called here the physical delay. The physical delay is first computed by determining the virtual delay. If IA32_UMWAIT_CONTROL[31:2] is zero, the virtual delay is the value in EDX:EAX minus the value that RDTSC would return; if IA32_UMWAIT_CONTROL[31:2] is not zero, the virtual delay is the minimum of that difference and AND(IA32_UMWAIT_CONTROL,FFFFFFFCH). Because umwait and tpause can put a (psysical) CPU into a power saving state, by default we dont't expose it to kvm and enable it only when guest CPUID has it. Detailed information about user wait instructions can be found in the latest Intel 64 and IA-32 Architectures Software Developer's Manual. Co-developed-by: Jingqi Liu <jingqi.liu@intel.com> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com> Signed-off-by: Tao Xu <tao3.xu@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2019-07-16 14:55:49 +08:00
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
static inline bool cpu_has_vmx_tsc_scaling(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_TSC_SCALING;
}
KVM: VMX: Enable bus lock VM exit Virtual Machine can exploit bus locks to degrade the performance of system. Bus lock can be caused by split locked access to writeback(WB) memory or by using locks on uncacheable(UC) memory. The bus lock is typically >1000 cycles slower than an atomic operation within a cache line. It also disrupts performance on other cores (which must wait for the bus lock to be released before their memory operations can complete). To address the threat, bus lock VM exit is introduced to notify the VMM when a bus lock was acquired, allowing it to enforce throttling or other policy based mitigations. A VMM can enable VM exit due to bus locks by setting a new "Bus Lock Detection" VM-execution control(bit 30 of Secondary Processor-based VM execution controls). If delivery of this VM exit was preempted by a higher priority VM exit (e.g. EPT misconfiguration, EPT violation, APIC access VM exit, APIC write VM exit, exception bitmap exiting), bit 26 of exit reason in vmcs field is set to 1. In current implementation, the KVM exposes this capability through KVM_CAP_X86_BUS_LOCK_EXIT. The user can get the supported mode bitmap (i.e. off and exit) and enable it explicitly (disabled by default). If bus locks in guest are detected by KVM, exit to user space even when current exit reason is handled by KVM internally. Set a new field KVM_RUN_BUS_LOCK in vcpu->run->flags to inform the user space that there is a bus lock detected in guest. Document for Bus Lock VM exit is now available at the latest "Intel Architecture Instruction Set Extensions Programming Reference". Document Link: https://software.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> Message-Id: <20201106090315.18606-4-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-11-06 17:03:14 +08:00
static inline bool cpu_has_vmx_bus_lock_detection(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_BUS_LOCK_DETECTION;
}
static inline bool cpu_has_vmx_apicv(void)
{
return cpu_has_vmx_apic_register_virt() &&
cpu_has_vmx_virtual_intr_delivery() &&
cpu_has_vmx_posted_intr();
}
KVM: VMX: enable IPI virtualization With IPI virtualization enabled, the processor emulates writes to APIC registers that would send IPIs. The processor sets the bit corresponding to the vector in target vCPU's PIR and may send a notification (IPI) specified by NDST and NV fields in target vCPU's Posted-Interrupt Descriptor (PID). It is similar to what IOMMU engine does when dealing with posted interrupt from devices. A PID-pointer table is used by the processor to locate the PID of a vCPU with the vCPU's APIC ID. The table size depends on maximum APIC ID assigned for current VM session from userspace. Allocating memory for PID-pointer table is deferred to vCPU creation, because irqchip mode and VM-scope maximum APIC ID is settled at that point. KVM can skip PID-pointer table allocation if !irqchip_in_kernel(). Like VT-d PI, if a vCPU goes to blocked state, VMM needs to switch its notification vector to wakeup vector. This can ensure that when an IPI for blocked vCPUs arrives, VMM can get control and wake up blocked vCPUs. And if a VCPU is preempted, its posted interrupt notification is suppressed. Note that IPI virtualization can only virualize physical-addressing, flat mode, unicast IPIs. Sending other IPIs would still cause a trap-like APIC-write VM-exit and need to be handled by VMM. Signed-off-by: Chao Gao <chao.gao@intel.com> Signed-off-by: Zeng Guang <guang.zeng@intel.com> Message-Id: <20220419154510.11938-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-04-19 23:45:10 +08:00
static inline bool cpu_has_vmx_ipiv(void)
{
return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
}
static inline bool cpu_has_vmx_flexpriority(void)
{
return cpu_has_vmx_tpr_shadow() &&
cpu_has_vmx_virtualize_apic_accesses();
}
static inline bool cpu_has_vmx_ept_execute_only(void)
{
return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
static inline bool cpu_has_vmx_ept_4levels(void)
{
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}
static inline bool cpu_has_vmx_ept_5levels(void)
{
return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
}
static inline bool cpu_has_vmx_ept_mt_wb(void)
{
return vmx_capability.ept & VMX_EPTP_WB_BIT;
}
static inline bool cpu_has_vmx_ept_2m_page(void)
{
return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}
static inline bool cpu_has_vmx_ept_1g_page(void)
{
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
static inline int ept_caps_to_lpage_level(u32 ept_caps)
{
if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
return PG_LEVEL_1G;
if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
return PG_LEVEL_2M;
return PG_LEVEL_4K;
}
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
}
static inline bool cpu_has_vmx_invept_context(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_invept_global(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
static inline bool cpu_has_vmx_invvpid(void)
{
return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
}
static inline bool cpu_has_vmx_invvpid_individual_addr(void)
{
return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
}
static inline bool cpu_has_vmx_invvpid_single(void)
{
return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_invvpid_global(void)
{
return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_intel_pt(void)
{
return (vmcs_config.misc & VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
/*
* Processor Trace can operate in one of three modes:
* a. system-wide: trace both host/guest and output to host buffer
* b. host-only: only trace host and output to host buffer
* c. host-guest: trace host and guest simultaneously and output to their
* respective buffer
*
* KVM currently only supports (a) and (c).
*/
static inline bool vmx_pt_mode_is_system(void)
{
return pt_mode == PT_MODE_SYSTEM;
}
static inline bool vmx_pt_mode_is_host_guest(void)
{
return pt_mode == PT_MODE_HOST_GUEST;
}
static inline bool vmx_pebs_supported(void)
{
return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
}
KVM: VMX: Enable Notify VM exit There are cases that malicious virtual machines can cause CPU stuck (due to event windows don't open up), e.g., infinite loop in microcode when nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. It leads the CPU to be unavailable to host or other VMs. VMM can enable notify VM exit that a VM exit generated if no event window occurs in VM non-root mode for a specified amount of time (notify window). Feature enabling: - The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to enable this feature. VMM can set NOTIFY_WINDOW vmcs field to adjust the expected notify window. - Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space can query and enable this feature in per-VM scope. The argument is a 64bit value: bits 63:32 are used for notify window, and bits 31:0 are for flags. Current supported flags: - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify window provided. - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace once the exits happen. - It's safe to even set notify window to zero since an internal hardware threshold is added to vmcs.notify_window. VM exit handling: - Introduce a vcpu state notify_window_exits to records the count of notify VM exits and expose it through the debugfs. - Notify VM exit can happen incident to delivery of a vector event. Allow it in KVM. - Exit to userspace unconditionally for handling when VM_CONTEXT_INVALID bit is set. Nested handling - Nested notify VM exits are not supported yet. Keep the same notify window control in vmcs02 as vmcs01, so that L1 can't escape the restriction of notify VM exits through launching L2 VM. Notify VM exit is defined in latest Intel Architecture Instruction Set Extensions Programming Reference, chapter 9.2. Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com> Signed-off-by: Tao Xu <tao3.xu@intel.com> Co-developed-by: Chenyi Qiang <chenyi.qiang@intel.com> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-05-24 21:56:24 +08:00
static inline bool cpu_has_notify_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_NOTIFY_VM_EXITING;
}
#endif /* __KVM_X86_VMX_CAPS_H */