linux/arch/x86/kvm/svm/svm.h

941 lines
27 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*/
#ifndef __SVM_SVM_H
#define __SVM_SVM_H
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/bits.h>
#include <asm/svm.h>
#include <asm/sev-common.h>
#include "cpuid.h"
#include "kvm_cache_regs.h"
/*
* Helpers to convert to/from physical addresses for pages whose address is
* consumed directly by hardware. Even though it's a physical address, SVM
* often restricts the address to the natural width, hence 'unsigned long'
* instead of 'hpa_t'.
*/
static inline unsigned long __sme_page_pa(struct page *page)
{
return __sme_set(page_to_pfn(page) << PAGE_SHIFT);
}
static inline struct page *__sme_pa_to_page(unsigned long pa)
{
return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT);
}
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
extern bool npt_enabled;
extern int nrips;
extern int vgif;
extern bool intercept_smi;
extern bool x2avic_enabled;
KVM: x86: Add support for SVM's Virtual NMI Add support for SVM's Virtual NMIs implementation, which adds proper tracking of virtual NMI blocking, and an intr_ctrl flag that software can set to mark a virtual NMI as pending. Pending virtual NMIs are serviced by hardware if/when virtual NMIs become unblocked, i.e. act more or less like real NMIs. Introduce two new kvm_x86_ops callbacks so to support SVM's vNMI, as KVM needs to treat a pending vNMI as partially injected. Specifically, if two NMIs (for L1) arrive concurrently in KVM's software model, KVM's ABI is to inject one and pend the other. Without vNMI, KVM manually tracks the pending NMI and uses NMI windows to detect when the NMI should be injected. With vNMI, the pending NMI is simply stuffed into the VMCB and handed off to hardware. This means that KVM needs to be able to set a vNMI pending on-demand, and also query if a vNMI is pending, e.g. to honor the "at most one NMI pending" rule and to preserve all NMIs across save and restore. Warn if KVM attempts to open an NMI window when vNMI is fully enabled, as the above logic should prevent KVM from ever getting to kvm_check_and_inject_events() with two NMIs pending _in software_, and the "at most one NMI pending" logic should prevent having an NMI pending in hardware and an NMI pending in software if NMIs are also blocked, i.e. if KVM can't immediately inject the second NMI. Signed-off-by: Santosh Shukla <Santosh.Shukla@amd.com> Co-developed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> Link: https://lore.kernel.org/r/20230227084016.3368-11-santosh.shukla@amd.com [sean: rewrite shortlog and changelog, massage code comments] Signed-off-by: Sean Christopherson <seanjc@google.com>
2023-02-27 14:10:15 +05:30
extern bool vnmi;
extern int lbrv;
/*
* Clean bits in VMCB.
* VMCB_ALL_CLEAN_MASK might also need to
* be updated if this enum is modified.
*/
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
pause filter count */
VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
VMCB_ASID, /* ASID */
VMCB_INTR, /* int_ctl, int_vector */
VMCB_NPT, /* npt_en, nCR3, gPAT */
VMCB_CR, /* CR0, CR3, CR4, EFER */
VMCB_DR, /* DR6, DR7 */
VMCB_DT, /* GDT, IDT */
VMCB_SEG, /* CS, DS, SS, ES, CPL */
VMCB_CR2, /* CR2 only */
VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
* AVIC PHYSICAL_TABLE pointer,
* AVIC LOGICAL_TABLE pointer
*/
VMCB_SW = 31, /* Reserved for hypervisor/software use */
};
#define VMCB_ALL_CLEAN_MASK ( \
(1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \
(1U << VMCB_ASID) | (1U << VMCB_INTR) | \
(1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \
(1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \
(1U << VMCB_LBR) | (1U << VMCB_AVIC) | \
(1U << VMCB_SW))
/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
struct kvm_sev_info {
bool active; /* SEV enabled guest */
bool es_active; /* SEV-ES enabled guest */
bool need_init; /* waiting for SEV_INIT2 */
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long policy;
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
u64 vmsa_features;
u16 ghcb_version; /* Highest guest GHCB protocol version allowed */
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct list_head mirror_vms; /* List of VMs mirroring */
struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
void *snp_context; /* SNP guest context page */
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
#define SEV_POLICY_NODBG BIT_ULL(0)
#define SNP_POLICY_DEBUG BIT_ULL(19)
struct kvm_svm {
struct kvm kvm;
/* Struct members for AVIC */
u32 avic_vm_id;
u32 *avic_logical_id_table;
u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
};
struct kvm_vcpu;
struct kvm_vmcb_info {
struct vmcb *ptr;
unsigned long pa;
int cpu;
uint64_t asid_generation;
};
struct vmcb_save_area_cached {
u64 efer;
u64 cr4;
u64 cr3;
u64 cr0;
u64 dr7;
u64 dr6;
};
struct vmcb_ctrl_area_cached {
u32 intercepts[MAX_INTERCEPT];
u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
u32 asid;
u8 tlb_ctl;
u32 int_ctl;
u32 int_vector;
u32 int_state;
u32 exit_code;
u32 exit_code_hi;
u64 exit_info_1;
u64 exit_info_2;
u32 exit_int_info;
u32 exit_int_info_err;
u64 nested_ctl;
u32 event_inj;
u32 event_inj_err;
u64 next_rip;
u64 nested_cr3;
u64 virt_ext;
u32 clean;
KVM: SVM: Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs with Bus Lock Threshold, which is close enough to VMX's Bus Lock Detection VM-Exit to allow reusing KVM_CAP_X86_BUS_LOCK_EXIT. The biggest difference between the two features is that Threshold is fault-like, whereas Detection is trap-like. To allow the guest to make forward progress, Threshold provides a per-VMCB counter which is decremented every time a bus lock occurs, and a VM-Exit is triggered if and only if the counter is '0'. To provide Detection-like semantics, initialize the counter to '0', i.e. exit on every bus lock, and when re-executing the guilty instruction, set the counter to '1' to effectively step past the instruction. Note, in the unlikely scenario that re-executing the instruction doesn't trigger a bus lock, e.g. because the guest has changed memory types or patched the guilty instruction, the bus lock counter will be left at '1', i.e. the guest will be able to do a bus lock on a different instruction. In a perfect world, KVM would ensure the counter is '0' if the guest has made forward progress, e.g. if RIP has changed. But trying to close that hole would incur non-trivial complexity, for marginal benefit; the intent of KVM_CAP_X86_BUS_LOCK_EXIT is to allow userspace rate-limit bus locks, not to allow for precise detection of problematic guest code. And, it's simply not feasible to fully close the hole, e.g. if an interrupt arrives before the original instruction can re-execute, the guest could step past a different bus lock. Suggested-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Manali Shukla <manali.shukla@amd.com> Link: https://lore.kernel.org/r/20250502050346.14274-5-manali.shukla@amd.com [sean: fix typo in comment] Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-02 05:03:45 +00:00
u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
#endif
u8 reserved_sw[32];
};
};
struct svm_nested_state {
struct kvm_vmcb_info vmcb02;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb12_gpa;
u64 last_vmcb12_gpa;
/*
* The MSR permissions map used for vmcb02, which is the merge result
* of vmcb01 and vmcb12
*/
void *msrpm;
/* A VMRUN has started but has not yet been performed, so
* we cannot inject a nested vmexit yet. */
bool nested_run_pending;
/* cache for control fields of the guest */
struct vmcb_ctrl_area_cached ctl;
/*
* Note: this struct is not kept up-to-date while L2 runs; it is only
* valid within nested_svm_vmrun.
*/
struct vmcb_save_area_cached save;
bool initialized;
/*
* Indicates whether MSR bitmap for L2 needs to be rebuilt due to
* changes in MSR bitmap for L1 or switching to a different L2. Note,
* this flag can only be used reliably in conjunction with a paravirt L1
* which informs L0 whether any changes to MSR bitmap for L2 were done
* on its side.
*/
bool force_msr_bitmap_recalc;
};
struct vcpu_sev_es_state {
/* SEV-ES support */
struct sev_es_save_area *vmsa;
struct ghcb *ghcb;
u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
unsigned int ap_reset_hold_type;
/* SEV-ES scratch area support */
u64 sw_scratch;
void *ghcb_sa;
u32 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
/* SNP Page-State-Change buffer entries currently being processed */
u16 psc_idx;
u16 psc_inflight;
bool psc_2m;
u64 ghcb_registered_gpa;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
gpa_t snp_vmsa_gpa;
bool snp_ap_waiting_for_reset;
bool snp_has_guest_vmsa;
};
struct vcpu_svm {
struct kvm_vcpu vcpu;
/* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
struct vmcb *vmcb;
struct kvm_vmcb_info vmcb01;
struct kvm_vmcb_info *current_vmcb;
u32 asid;
KVM: nSVM: improve SYSENTER emulation on AMD Currently to support Intel->AMD migration, if CPU vendor is GenuineIntel, we emulate the full 64 value for MSR_IA32_SYSENTER_{EIP|ESP} msrs, and we also emulate the sysenter/sysexit instruction in long mode. (Emulator does still refuse to emulate sysenter in 64 bit mode, on the ground that the code for that wasn't tested and likely has no users) However when virtual vmload/vmsave is enabled, the vmload instruction will update these 32 bit msrs without triggering their msr intercept, which will lead to having stale values in kvm's shadow copy of these msrs, which relies on the intercept to be up to date. Fix/optimize this by doing the following: 1. Enable the MSR intercepts for SYSENTER MSRs iff vendor=GenuineIntel (This is both a tiny optimization and also ensures that in case the guest cpu vendor is AMD, the msrs will be 32 bit wide as AMD defined). 2. Store only high 32 bit part of these msrs on interception and combine it with hardware msr value on intercepted read/writes iff vendor=GenuineIntel. 3. Disable vmload/vmsave virtualization if vendor=GenuineIntel. (It is somewhat insane to set vendor=GenuineIntel and still enable SVM for the guest but well whatever). Then zero the high 32 bit parts when kvm intercepts and emulates vmload. Thanks a lot to Paulo Bonzini for helping me with fixing this in the most correct way. This patch fixes nested migration of 32 bit nested guests, that was broken because incorrect cached values of SYSENTER msrs were stored in the migration stream if L1 changed these msrs with vmload prior to L2 entry. Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> Message-Id: <20210401111928.996871-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-01 14:19:28 +03:00
u32 sysenter_esp_hi;
u32 sysenter_eip_hi;
uint64_t tsc_aux;
u64 msr_decfg;
u64 next_rip;
u64 spec_ctrl;
u64 tsc_ratio_msr;
/*
* Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
* translated into the appropriate L2_CFG bits on the host to
* perform speculative control.
*/
u64 virt_spec_ctrl;
void *msrpm;
ulong nmi_iret_rip;
struct svm_nested_state nested;
/* NMI mask value, used when vNMI is not enabled */
bool nmi_masked;
/*
* True when NMIs are still masked but guest IRET was just intercepted
* and KVM is waiting for RIP to change, which will signal that the
* intercepted IRET was retired and thus NMI can be unmasked.
*/
bool awaiting_iret_completion;
/*
* Set when KVM is awaiting IRET completion and needs to inject NMIs as
* soon as the IRET completes (e.g. NMI is pending injection). KVM
* temporarily steals RFLAGS.TF to single-step the guest in this case
* in order to regain control as soon as the NMI-blocking condition
* goes away.
*/
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
bool nmi_l1_to_l2;
KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction Re-inject INT3/INTO instead of retrying the instruction if the CPU encountered an intercepted exception while vectoring the software exception, e.g. if vectoring INT3 encounters a #PF and KVM is using shadow paging. Retrying the instruction is architecturally wrong, e.g. will result in a spurious #DB if there's a code breakpoint on the INT3/O, and lack of re-injection also breaks nested virtualization, e.g. if L1 injects a software exception and vectoring the injected exception encounters an exception that is intercepted by L0 but not L1. Due to, ahem, deficiencies in the SVM architecture, acquiring the next RIP may require flowing through the emulator even if NRIPS is supported, as the CPU clears next_rip if the VM-Exit is due to an exception other than "exceptions caused by the INT3, INTO, and BOUND instructions". To deal with this, "skip" the instruction to calculate next_rip (if it's not already known), and then unwind the RIP write and any side effects (RFLAGS updates). Save the computed next_rip and use it to re-stuff next_rip if injection doesn't complete. This allows KVM to do the right thing if next_rip was known prior to injection, e.g. if L1 injects a soft event into L2, and there is no backing INTn instruction, e.g. if L1 is injecting an arbitrary event. Note, it's impossible to guarantee architectural correctness given SVM's architectural flaws. E.g. if the guest executes INTn (no KVM injection), an exit occurs while vectoring the INTn, and the guest modifies the code stream while the exit is being handled, KVM will compute the incorrect next_rip due to "skipping" the wrong instruction. A future enhancement to make this less awful would be for KVM to detect that the decoded instruction is not the correct INTn and drop the to-be-injected soft event (retrying is a lesser evil compared to shoving the wrong RIP on the exception stack). Reported-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com> Message-Id: <65cb88deab40bc1649d509194864312a89bbe02e.1651440202.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-05-02 00:07:29 +02:00
unsigned long soft_int_csbase;
unsigned long soft_int_old_rip;
unsigned long soft_int_next_rip;
bool soft_int_injected;
u32 ldr_reg;
u32 dfr_reg;
KVM: SVM: Add enable_ipiv param, never set IsRunning if disabled Let userspace "disable" IPI virtualization for AVIC via the enable_ipiv module param, by never setting IsRunning. SVM doesn't provide a way to disable IPI virtualization in hardware, but by ensuring CPUs never see IsRunning=1, every IPI in the guest (except for self-IPIs) will generate a VM-Exit. To avoid setting the real IsRunning bit, while still allowing KVM to use each vCPU's entry to update GA log entries, simply maintain a shadow of the entry, without propagating IsRunning updates to the real table when IPI virtualization is disabled. Providing a way to effectively disable IPI virtualization will allow KVM to safely enable AVIC on hardware that is susceptible to erratum #1235, which causes hardware to sometimes fail to detect that the IsRunning bit has been cleared by software. Note, the table _must_ be fully populated, as broadcast IPIs skip invalid entries, i.e. won't generate VM-Exit if every entry is invalid, and so simply pointing the VMCB at a common dummy table won't work. Alternatively, KVM could allocate a shadow of the entire table, but that'd be a waste of 4KiB since the per-vCPU entry doesn't actually consume an additional 8 bytes of memory (vCPU structures are large enough that they are backed by order-N pages). Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> [sean: keep "entry" variables, reuse enable_ipiv, split from erratum] Link: https://lore.kernel.org/r/20250611224604.313496-19-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-06-11 15:45:20 -07:00
/* This is essentially a shadow of the vCPU's actual entry in the
* Physical ID table that is programmed into the VMCB, i.e. that is
* seen by the CPU. If IPI virtualization is disabled, IsRunning is
* only ever set in the shadow, i.e. is never propagated to the "real"
* table, so that hardware never sees IsRunning=1.
*/
u64 avic_physical_id_entry;
/*
* Per-vCPU list of irqfds that are eligible to post IRQs directly to
* the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
* is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
* target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
* and if the irqfd becomes ineligible for posting (to put the IRTE
* back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
/* Guest GIF value, used when vGIF is not enabled */
bool guest_gif;
};
struct svm_cpu_data {
u64 asid_generation;
u32 max_asid;
u32 next_asid;
u32 min_asid;
bool bp_spec_reduce_set;
struct vmcb *save_area;
unsigned long save_area_pa;
/* index = sev_asid, value = vmcb pointer */
struct vmcb **sev_vmcbs;
};
DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
void recalc_intercepts(struct vcpu_svm *svm);
static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
return container_of(kvm, struct kvm_svm, kvm);
}
static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
{
return &to_kvm_svm(kvm)->sev_info;
}
#ifdef CONFIG_KVM_AMD_SEV
static __always_inline bool sev_guest(struct kvm *kvm)
{
return to_kvm_sev_info(kvm)->active;
}
static __always_inline bool sev_es_guest(struct kvm *kvm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
return sev->es_active && !WARN_ON_ONCE(!sev->active);
}
static __always_inline bool sev_snp_guest(struct kvm *kvm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
!WARN_ON_ONCE(!sev_es_guest(kvm));
}
#else
#define sev_guest(kvm) false
#define sev_es_guest(kvm) false
#define sev_snp_guest(kvm) false
#endif
static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val)
{
return svm->sev_es.ghcb_registered_gpa == val;
}
static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
}
static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
{
vmcb->control.clean = VMCB_ALL_CLEAN_MASK
& ~VMCB_ALWAYS_DIRTY_MASK;
}
static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
}
static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
{
return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
}
static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
/*
* Only the PDPTRs are loaded on demand into the shadow MMU. All other
* fields are synchronized on VM-Exit, because accessing the VMCB is cheap.
*
* CR3 might be out of date in the VMCB but it is not marked dirty; instead,
* KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
* is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
*/
#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
__set_bit(bit, (unsigned long *)&control->intercepts);
}
static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
__clear_bit(bit, (unsigned long *)&control->intercepts);
}
static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
return test_bit(bit, (unsigned long *)&control->intercepts);
}
static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
return test_bit(bit, (unsigned long *)&control->intercepts);
}
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
recalc_intercepts(svm);
}
static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
WARN_ON_ONCE(bit >= 32);
vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
recalc_intercepts(svm);
}
static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_set_intercept(&vmcb->control, bit);
recalc_intercepts(svm);
}
static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
vmcb_clr_intercept(&vmcb->control, bit);
recalc_intercepts(svm);
}
static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
{
return vmcb_is_intercept(&svm->vmcb->control, bit);
}
static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) &&
(svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
}
static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
{
if (!vgif)
return NULL;
if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
return svm->nested.vmcb02.ptr;
else
return svm->vmcb01.ptr;
}
static inline void enable_gif(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_vgif_vmcb(svm);
if (vmcb)
vmcb->control.int_ctl |= V_GIF_MASK;
else
svm->guest_gif = true;
}
static inline void disable_gif(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_vgif_vmcb(svm);
if (vmcb)
vmcb->control.int_ctl &= ~V_GIF_MASK;
else
svm->guest_gif = false;
}
static inline bool gif_set(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_vgif_vmcb(svm);
if (vmcb)
return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
return svm->guest_gif;
}
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
{
return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) &&
(svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
}
static inline bool is_x2apic_msrpm_offset(u32 offset)
{
/* 4 msrs per u8, and 4 u8 in u32 */
u32 msr = offset * 16;
return (msr >= APIC_BASE_MSR) &&
(msr < (APIC_BASE_MSR + 0x100));
}
KVM: x86: Add support for SVM's Virtual NMI Add support for SVM's Virtual NMIs implementation, which adds proper tracking of virtual NMI blocking, and an intr_ctrl flag that software can set to mark a virtual NMI as pending. Pending virtual NMIs are serviced by hardware if/when virtual NMIs become unblocked, i.e. act more or less like real NMIs. Introduce two new kvm_x86_ops callbacks so to support SVM's vNMI, as KVM needs to treat a pending vNMI as partially injected. Specifically, if two NMIs (for L1) arrive concurrently in KVM's software model, KVM's ABI is to inject one and pend the other. Without vNMI, KVM manually tracks the pending NMI and uses NMI windows to detect when the NMI should be injected. With vNMI, the pending NMI is simply stuffed into the VMCB and handed off to hardware. This means that KVM needs to be able to set a vNMI pending on-demand, and also query if a vNMI is pending, e.g. to honor the "at most one NMI pending" rule and to preserve all NMIs across save and restore. Warn if KVM attempts to open an NMI window when vNMI is fully enabled, as the above logic should prevent KVM from ever getting to kvm_check_and_inject_events() with two NMIs pending _in software_, and the "at most one NMI pending" logic should prevent having an NMI pending in hardware and an NMI pending in software if NMIs are also blocked, i.e. if KVM can't immediately inject the second NMI. Signed-off-by: Santosh Shukla <Santosh.Shukla@amd.com> Co-developed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> Link: https://lore.kernel.org/r/20230227084016.3368-11-santosh.shukla@amd.com [sean: rewrite shortlog and changelog, massage code comments] Signed-off-by: Sean Christopherson <seanjc@google.com>
2023-02-27 14:10:15 +05:30
static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
{
if (!vnmi)
return NULL;
if (is_guest_mode(&svm->vcpu))
return NULL;
else
return svm->vmcb01.ptr;
}
static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_vnmi_vmcb_l1(svm);
if (vmcb)
return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
else
return false;
}
static inline void svm_vmgexit_set_return_code(struct vcpu_svm *svm,
u64 response, u64 data)
{
ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response);
ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data);
}
static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector)
{
u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector;
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data);
}
static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror)
{
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror);
}
static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data)
{
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
}
static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
{
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
}
/*
* The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range
* is reserved). Each MSR within a range is covered by two bits, one each for
* read (bit 0) and write (bit 1), where a bit value of '1' means intercepted.
*/
#define SVM_MSRPM_BYTES_PER_RANGE 2048
#define SVM_BITS_PER_MSR 2
#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR)
#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE)
static_assert(SVM_MSRS_PER_RANGE == 8192);
#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1)
static __always_inline int svm_msrpm_bit_nr(u32 msr)
{
int range_nr;
switch (msr & ~SVM_MSRPM_OFFSET_MASK) {
case 0:
range_nr = 0;
break;
case 0xc0000000:
range_nr = 1;
break;
case 0xc0010000:
range_nr = 2;
break;
default:
return -EINVAL;
}
return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
(msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR;
}
#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \
static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \
u32 msr) \
{ \
int bit_nr; \
\
bit_nr = svm_msrpm_bit_nr(msr); \
if (bit_nr < 0) \
return (rtype)true; \
\
return bitop##_bit(bit_nr + bit_rw, bitmap); \
}
#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
__BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \
__BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1)
BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test)
BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear)
BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set)
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
/* svm.c */
extern bool dump_invalid_vmcb;
void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask);
static inline void *svm_vcpu_alloc_msrpm(void)
{
return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
}
void svm_vcpu_free_msrpm(void *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
void svm_set_gif(struct vcpu_svm *svm, bool value);
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vec);
void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
svm_set_intercept_for_msr(vcpu, msr, type, false);
}
static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
svm_set_intercept_for_msr(vcpu, msr, type, true);
}
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK);
}
static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}
static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}
static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
int __init nested_svm_init_msrpm_merge_offsets(void);
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
KVM: x86: Forcibly leave nested virt when SMM state is toggled Forcibly leave nested virtualization operation if userspace toggles SMM state via KVM_SET_VCPU_EVENTS or KVM_SYNC_X86_EVENTS. If userspace forces the vCPU out of SMM while it's post-VMXON and then injects an SMI, vmx_enter_smm() will overwrite vmx->nested.smm.vmxon and end up with both vmxon=false and smm.vmxon=false, but all other nVMX state allocated. Don't attempt to gracefully handle the transition as (a) most transitions are nonsencial, e.g. forcing SMM while L2 is running, (b) there isn't sufficient information to handle all transitions, e.g. SVM wants access to the SMRAM save state, and (c) KVM_SET_VCPU_EVENTS must precede KVM_SET_NESTED_STATE during state restore as the latter disallows putting the vCPU into L2 if SMM is active, and disallows tagging the vCPU as being post-VMXON in SMM if SMM is not active. Abuse of KVM_SET_VCPU_EVENTS manifests as a WARN and memory leak in nVMX due to failure to free vmcs01's shadow VMCS, but the bug goes far beyond just a memory leak, e.g. toggling SMM on while L2 is active puts the vCPU in an architecturally impossible state. WARNING: CPU: 0 PID: 3606 at free_loaded_vmcs arch/x86/kvm/vmx/vmx.c:2665 [inline] WARNING: CPU: 0 PID: 3606 at free_loaded_vmcs+0x158/0x1a0 arch/x86/kvm/vmx/vmx.c:2656 Modules linked in: CPU: 1 PID: 3606 Comm: syz-executor725 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:free_loaded_vmcs arch/x86/kvm/vmx/vmx.c:2665 [inline] RIP: 0010:free_loaded_vmcs+0x158/0x1a0 arch/x86/kvm/vmx/vmx.c:2656 Code: <0f> 0b eb b3 e8 8f 4d 9f 00 e9 f7 fe ff ff 48 89 df e8 92 4d 9f 00 Call Trace: <TASK> kvm_arch_vcpu_destroy+0x72/0x2f0 arch/x86/kvm/x86.c:11123 kvm_vcpu_destroy arch/x86/kvm/../../../virt/kvm/kvm_main.c:441 [inline] kvm_destroy_vcpus+0x11f/0x290 arch/x86/kvm/../../../virt/kvm/kvm_main.c:460 kvm_free_vcpus arch/x86/kvm/x86.c:11564 [inline] kvm_arch_destroy_vm+0x2e8/0x470 arch/x86/kvm/x86.c:11676 kvm_destroy_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1217 [inline] kvm_put_kvm+0x4fa/0xb00 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1250 kvm_vm_release+0x3f/0x50 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1273 __fput+0x286/0x9f0 fs/file_table.c:311 task_work_run+0xdd/0x1a0 kernel/task_work.c:164 exit_task_work include/linux/task_work.h:32 [inline] do_exit+0xb29/0x2a30 kernel/exit.c:806 do_group_exit+0xd2/0x2f0 kernel/exit.c:935 get_signal+0x4b0/0x28c0 kernel/signal.c:2862 arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae </TASK> Cc: stable@vger.kernel.org Reported-by: syzbot+8112db3ab20e70d50c31@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220125220358.2091737-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-01-25 22:03:58 +00:00
void svm_leave_nested(struct kvm_vcpu *vcpu);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct kvm_vcpu *vcpu);
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
struct vmcb_save_area *from_save);
void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
{
svm->vmcb->control.exit_code = exit_code;
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
return nested_svm_vmexit(svm);
}
int nested_svm_exit_handled(struct vcpu_svm *svm);
int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu);
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control);
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
struct vmcb_save_area *save);
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
#define AVIC_REQUIRED_APICV_INHIBITS \
( \
BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_NESTED) | \
BIT(APICV_INHIBIT_REASON_IRQWIN) | \
BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
BIT(APICV_INHIBIT_REASON_SEV) | \
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
KVM: SVM: Don't put/load AVIC when setting virtual APIC mode Move the VMCB updates from avic_refresh_apicv_exec_ctrl() into avic_set_virtual_apic_mode() and invert the dependency being said functions to avoid calling avic_vcpu_{load,put}() and avic_set_pi_irte_mode() when "only" setting the virtual APIC mode. avic_set_virtual_apic_mode() is invoked from common x86 with preemption enabled, which makes avic_vcpu_{load,put}() unhappy. Luckily, calling those and updating IRTE stuff is unnecessary as the only reason avic_set_virtual_apic_mode() is called is to handle transitions between xAPIC and x2APIC that don't also toggle APICv activation. And if activation doesn't change, there's no need to fiddle with the physical APIC ID table or update IRTE. The "full" refresh is guaranteed to be called if activation changes in this case as the only call to the "set" path is: kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); and kvm_vcpu_update_apicv() invokes the refresh if activation changes: if (apic->apicv_active == activate) goto out; apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); Rename the helper to reflect that it is also called during "refresh". WARNING: CPU: 183 PID: 49186 at arch/x86/kvm/svm/avic.c:1081 avic_vcpu_put+0xde/0xf0 [kvm_amd] CPU: 183 PID: 49186 Comm: stable Tainted: G O 6.0.0-smp--fcddbca45f0a-sink #34 Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 RIP: 0010:avic_vcpu_put+0xde/0xf0 [kvm_amd] avic_refresh_apicv_exec_ctrl+0x142/0x1c0 [kvm_amd] avic_set_virtual_apic_mode+0x5a/0x70 [kvm_amd] kvm_lapic_set_base+0x149/0x1a0 [kvm] kvm_set_apic_base+0x8f/0xd0 [kvm] kvm_set_msr_common+0xa3a/0xdc0 [kvm] svm_set_msr+0x364/0x6b0 [kvm_amd] __kvm_set_msr+0xb8/0x1c0 [kvm] kvm_emulate_wrmsr+0x58/0x1d0 [kvm] msr_interception+0x1c/0x30 [kvm_amd] svm_invoke_exit_handler+0x31/0x100 [kvm_amd] svm_handle_exit+0xfc/0x160 [kvm_amd] vcpu_enter_guest+0x21bb/0x23e0 [kvm] vcpu_run+0x92/0x450 [kvm] kvm_arch_vcpu_ioctl_run+0x43e/0x6e0 [kvm] kvm_vcpu_ioctl+0x559/0x620 [kvm] Fixes: 05c4fe8c1bd9 ("KVM: SVM: Refresh AVIC configuration when changing APIC mode") Cc: stable@vger.kernel.org Cc: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20230106011306.85230-8-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-01-06 01:12:40 +00:00
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
int pre_sev_run(struct vcpu_svm *svm, int cpu);
void sev_init_vmcb(struct vcpu_svm *svm);
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
#ifdef CONFIG_KVM_AMD_SEV
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
int sev_mem_enc_register_region(struct kvm *kvm,
struct kvm_enc_region *range);
int sev_mem_enc_unregister_region(struct kvm *kvm,
struct kvm_enc_region *range);
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson <seanjc@google.com> Reported-by: Mingwei Zhang <mizhang@google.com> Signed-off-by: Mingwei Zhang <mizhang@google.com> Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-04-21 03:14:07 +00:00
void sev_guest_memory_reclaimed(struct kvm *kvm);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson <seanjc@google.com> Reported-by: Mingwei Zhang <mizhang@google.com> Signed-off-by: Mingwei Zhang <mizhang@google.com> Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-04-21 03:14:07 +00:00
/* These symbols are used in common code and are stubbed below. */
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp);
static inline struct page *snp_safe_alloc_page(void)
{
return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
}
void sev_free_vcpu(struct kvm_vcpu *vcpu);
void sev_vm_destroy(struct kvm *kvm);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
extern unsigned int max_sev_asid;
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
}
static inline struct page *snp_safe_alloc_page(void)
{
return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
}
static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
static inline void sev_vm_destroy(struct kvm *kvm) {}
static inline void __init sev_set_cpu_caps(void) {}
static inline void __init sev_hardware_setup(void) {}
static inline void sev_hardware_unsetup(void) {}
static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
#define max_sev_asid 0
static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {}
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {}
static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{
return 0;
}
static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{
return 0;
}
static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
{
return NULL;
}
static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
#endif
/* vmenter.S */
void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted,
struct sev_es_save_area *hostsa);
void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#define DEFINE_KVM_GHCB_ACCESSORS(field) \
static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
{ \
return test_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&svm->sev_es.valid_bitmap); \
} \
\
static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
{ \
return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
} \
DEFINE_KVM_GHCB_ACCESSORS(cpl)
DEFINE_KVM_GHCB_ACCESSORS(rax)
DEFINE_KVM_GHCB_ACCESSORS(rcx)
DEFINE_KVM_GHCB_ACCESSORS(rdx)
DEFINE_KVM_GHCB_ACCESSORS(rbx)
DEFINE_KVM_GHCB_ACCESSORS(rsi)
DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
DEFINE_KVM_GHCB_ACCESSORS(xcr0)
#endif