linux/arch/x86/kvm/svm/sev.c

5058 lines
136 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
* AMD SVM-SEV support
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
#include <uapi/linux/sev-guest.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/sev.h>
#include "mmu.h"
#include "x86.h"
#include "svm.h"
#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"
#define GHCB_VERSION_MAX 2ULL
#define GHCB_VERSION_DEFAULT 2ULL
#define GHCB_VERSION_MIN 1ULL
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);
/* enable/disable SEV-ES support */
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
/* enable/disable SEV-SNP support */
static bool sev_snp_enabled = true;
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
/* enable/disable SEV-ES DebugSwap support */
static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
static u64 sev_supported_vmsa_features;
#define AP_RESET_HOLD_NONE 0
#define AP_RESET_HOLD_NAE_EVENT 1
#define AP_RESET_HOLD_MSR_PROTO 2
/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
#define SNP_POLICY_MASK_SMT BIT_ULL(16)
#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
SNP_POLICY_MASK_API_MAJOR | \
SNP_POLICY_MASK_SMT | \
SNP_POLICY_MASK_RSVD_MBO | \
SNP_POLICY_MASK_DEBUG | \
SNP_POLICY_MASK_SINGLE_SOCKET)
#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long sev_me_mask;
static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
static int snp_decommission_context(struct kvm *kvm);
struct enc_region {
struct list_head list;
unsigned long npages;
struct page **pages;
unsigned long uaddr;
unsigned long size;
};
/* Called with the sev_bitmap_lock held, or on shutdown */
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
int ret, error = 0;
unsigned int asid;
/* Check if there are any ASIDs to reclaim before performing a flush */
asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
if (asid > max_asid)
return -EBUSY;
/*
* DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
* so it must be guarded.
*/
down_write(&sev_deactivate_lock);
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
/* SNP firmware requires use of WBINVD for ASID recycling. */
wbinvd_on_all_cpus();
if (sev_snp_enabled)
ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
else
ret = sev_guest_df_flush(&error);
up_write(&sev_deactivate_lock);
if (ret)
pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
sev_snp_enabled ? "-SNP" : "", ret, error);
return ret;
}
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
return !!to_kvm_sev_info(kvm)->enc_context_owner;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
}
static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
if (sev_flush_asids(min_asid, max_asid))
return false;
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
nr_asids);
bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
return true;
}
static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
{
enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
return misc_cg_try_charge(type, sev->misc_cg, 1);
}
static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
{
enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
misc_cg_uncharge(type, sev->misc_cg, 1);
}
static int sev_asid_new(struct kvm_sev_info *sev)
{
/*
* SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
* SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
* Note: min ASID can end up larger than the max if basic SEV support is
* effectively disabled by disallowing use of ASIDs for SEV guests.
*/
unsigned int min_asid = sev->es_active ? 1 : min_sev_asid;
unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
unsigned int asid;
bool retry = true;
int ret;
if (min_asid > max_asid)
return -ENOTTY;
WARN_ON(sev->misc_cg);
sev->misc_cg = get_current_misc_cg();
ret = sev_misc_cg_try_charge(sev);
if (ret) {
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL;
return ret;
}
mutex_lock(&sev_bitmap_lock);
again:
asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
if (asid > max_asid) {
if (retry && __sev_recycle_asids(min_asid, max_asid)) {
retry = false;
goto again;
}
mutex_unlock(&sev_bitmap_lock);
ret = -EBUSY;
goto e_uncharge;
}
__set_bit(asid, sev_asid_bitmap);
mutex_unlock(&sev_bitmap_lock);
sev->asid = asid;
return 0;
e_uncharge:
sev_misc_cg_uncharge(sev);
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL;
return ret;
}
static unsigned int sev_get_asid(struct kvm *kvm)
{
return to_kvm_sev_info(kvm)->asid;
}
static void sev_asid_free(struct kvm_sev_info *sev)
{
struct svm_cpu_data *sd;
int cpu;
mutex_lock(&sev_bitmap_lock);
__set_bit(sev->asid, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
sd = per_cpu_ptr(&svm_data, cpu);
sd->sev_vmcbs[sev->asid] = NULL;
}
mutex_unlock(&sev_bitmap_lock);
sev_misc_cg_uncharge(sev);
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL;
}
static void sev_decommission(unsigned int handle)
{
struct sev_data_decommission decommission;
if (!handle)
return;
decommission.handle = handle;
sev_guest_decommission(&decommission, NULL);
}
/*
* Transition a page to hypervisor-owned/shared state in the RMP table. This
* should not fail under normal conditions, but leak the page should that
* happen since it will no longer be usable by the host due to RMP protections.
*/
static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
{
if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
return -EIO;
}
return 0;
}
/*
* Certain page-states, such as Pre-Guest and Firmware pages (as documented
* in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
* directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
* unless they are reclaimed first.
*
* Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
* might not be usable by the host due to being set as immutable or still
* being associated with a guest ASID.
*
* Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
* converted back to shared, as the page is no longer usable due to RMP
* protections, and it's infeasible for the guest to continue on.
*/
static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
{
struct sev_data_snp_page_reclaim data = {0};
int fw_err, rc;
data.paddr = __sme_set(pfn << PAGE_SHIFT);
rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
snp_leak_pages(pfn, 1);
return -EIO;
}
if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
return -EIO;
return rc;
}
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
struct sev_data_deactivate deactivate;
if (!handle)
return;
deactivate.handle = handle;
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
down_read(&sev_deactivate_lock);
sev_guest_deactivate(&deactivate, NULL);
up_read(&sev_deactivate_lock);
sev_decommission(handle);
}
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
/*
* This sets up bounce buffers/firmware pages to handle SNP Guest Request
* messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
* 2.0 specification for more details.
*
* Technically, when an SNP Guest Request is issued, the guest will provide its
* own request/response pages, which could in theory be passed along directly
* to firmware rather than using bounce pages. However, these pages would need
* special care:
*
* - Both pages are from shared guest memory, so they need to be protected
* from migration/etc. occurring while firmware reads/writes to them. At a
* minimum, this requires elevating the ref counts and potentially needing
* an explicit pinning of the memory. This places additional restrictions
* on what type of memory backends userspace can use for shared guest
* memory since there is some reliance on using refcounted pages.
*
* - The response page needs to be switched to Firmware-owned[1] state
* before the firmware can write to it, which can lead to potential
* host RMP #PFs if the guest is misbehaved and hands the host a
* guest page that KVM might write to for other reasons (e.g. virtio
* buffers/etc.).
*
* Both of these issues can be avoided completely by using separately-allocated
* bounce pages for both the request/response pages and passing those to
* firmware instead. So that's what is being set up here.
*
* Guest requests rely on message sequence numbers to ensure requests are
* issued to firmware in the order the guest issues them, so concurrent guest
* requests generally shouldn't happen. But a misbehaved guest could issue
* concurrent guest requests in theory, so a mutex is used to serialize
* access to the bounce buffers.
*
* [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
* details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
* in the APM for details on the related RMP restrictions.
*/
static int snp_guest_req_init(struct kvm *kvm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct page *req_page;
req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!req_page)
return -ENOMEM;
sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!sev->guest_resp_buf) {
__free_page(req_page);
return -EIO;
}
sev->guest_req_buf = page_address(req_page);
mutex_init(&sev->guest_req_mutex);
return 0;
}
static void snp_guest_req_cleanup(struct kvm *kvm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
if (sev->guest_resp_buf)
snp_free_firmware_page(sev->guest_resp_buf);
if (sev->guest_req_buf)
__free_page(virt_to_page(sev->guest_req_buf));
sev->guest_req_buf = NULL;
sev->guest_resp_buf = NULL;
}
static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_init *data,
unsigned long vm_type)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
crypto: ccp: Add support to initialize the AMD-SP for SEV-SNP Before SNP VMs can be launched, the platform must be appropriately configured and initialized via the SNP_INIT command. During the execution of SNP_INIT command, the firmware configures and enables SNP security policy enforcement in many system components. Some system components write to regions of memory reserved by early x86 firmware (e.g. UEFI). Other system components write to regions provided by the operation system, hypervisor, or x86 firmware. Such system components can only write to HV-fixed pages or Default pages. They will error when attempting to write to pages in other page states after SNP_INIT enables their SNP enforcement. Starting in SNP firmware v1.52, the SNP_INIT_EX command takes a list of system physical address ranges to convert into the HV-fixed page states during the RMP initialization. If INIT_RMP is 1, hypervisors should provide all system physical address ranges that the hypervisor will never assign to a guest until the next RMP re-initialization. For instance, the memory that UEFI reserves should be included in the range list. This allows system components that occasionally write to memory (e.g. logging to UEFI reserved regions) to not fail due to RMP initialization and SNP enablement. Note that SNP_INIT(_EX) must not be executed while non-SEV guests are executing, otherwise it is possible that the system could reset or hang. The psp_init_on_probe module parameter was added for SEV/SEV-ES support and the init_ex_path module parameter to allow for time for the necessary file system to be mounted/available. SNP_INIT(_EX) does not use the file associated with init_ex_path. So, to avoid running into issues where SNP_INIT(_EX) is called while there are other running guests, issue it during module probe regardless of the psp_init_on_probe setting, but maintain the previous deferrable handling for SEV/SEV-ES initialization. [ mdr: Squash in psp_init_on_probe changes from Tom, reduce proliferation of 'probe' function parameter where possible. bp: Fix 32-bit allmodconfig build. ] Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Co-developed-by: Jarkko Sakkinen <jarkko@profian.com> Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20240126041126.1927228-14-michael.roth@amd.com
2024-01-25 22:11:13 -06:00
struct sev_platform_init_args init_args = {0};
bool es_active = vm_type != KVM_X86_SEV_VM;
u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
int ret;
if (kvm->created_vcpus)
return -EINVAL;
if (data->flags)
return -EINVAL;
if (data->vmsa_features & ~valid_vmsa_features)
return -EINVAL;
if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
return -EINVAL;
if (unlikely(sev->active))
return -EINVAL;
sev->active = true;
sev->es_active = es_active;
sev->vmsa_features = data->vmsa_features;
sev->ghcb_version = data->ghcb_version;
/*
* Currently KVM supports the full range of mandatory features defined
* by version 2 of the GHCB protocol, so default to that for SEV-ES
* guests created via KVM_SEV_INIT2.
*/
if (sev->es_active && !sev->ghcb_version)
sev->ghcb_version = GHCB_VERSION_DEFAULT;
if (vm_type == KVM_X86_SNP_VM)
sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
crypto: ccp: Add support to initialize the AMD-SP for SEV-SNP Before SNP VMs can be launched, the platform must be appropriately configured and initialized via the SNP_INIT command. During the execution of SNP_INIT command, the firmware configures and enables SNP security policy enforcement in many system components. Some system components write to regions of memory reserved by early x86 firmware (e.g. UEFI). Other system components write to regions provided by the operation system, hypervisor, or x86 firmware. Such system components can only write to HV-fixed pages or Default pages. They will error when attempting to write to pages in other page states after SNP_INIT enables their SNP enforcement. Starting in SNP firmware v1.52, the SNP_INIT_EX command takes a list of system physical address ranges to convert into the HV-fixed page states during the RMP initialization. If INIT_RMP is 1, hypervisors should provide all system physical address ranges that the hypervisor will never assign to a guest until the next RMP re-initialization. For instance, the memory that UEFI reserves should be included in the range list. This allows system components that occasionally write to memory (e.g. logging to UEFI reserved regions) to not fail due to RMP initialization and SNP enablement. Note that SNP_INIT(_EX) must not be executed while non-SEV guests are executing, otherwise it is possible that the system could reset or hang. The psp_init_on_probe module parameter was added for SEV/SEV-ES support and the init_ex_path module parameter to allow for time for the necessary file system to be mounted/available. SNP_INIT(_EX) does not use the file associated with init_ex_path. So, to avoid running into issues where SNP_INIT(_EX) is called while there are other running guests, issue it during module probe regardless of the psp_init_on_probe setting, but maintain the previous deferrable handling for SEV/SEV-ES initialization. [ mdr: Squash in psp_init_on_probe changes from Tom, reduce proliferation of 'probe' function parameter where possible. bp: Fix 32-bit allmodconfig build. ] Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Co-developed-by: Jarkko Sakkinen <jarkko@profian.com> Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20240126041126.1927228-14-michael.roth@amd.com
2024-01-25 22:11:13 -06:00
init_args.probe = false;
ret = sev_platform_init(&init_args);
if (ret)
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
goto e_free_asid;
if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM;
goto e_free_asid;
}
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
/* This needs to happen after SEV/SNP firmware initialization. */
if (vm_type == KVM_X86_SNP_VM) {
ret = snp_guest_req_init(kvm);
if (ret)
goto e_free;
}
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
sev->need_init = false;
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
return 0;
e_free:
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
free_cpumask_var(sev->have_run_cpus);
e_free_asid:
crypto: ccp: Add support to initialize the AMD-SP for SEV-SNP Before SNP VMs can be launched, the platform must be appropriately configured and initialized via the SNP_INIT command. During the execution of SNP_INIT command, the firmware configures and enables SNP security policy enforcement in many system components. Some system components write to regions of memory reserved by early x86 firmware (e.g. UEFI). Other system components write to regions provided by the operation system, hypervisor, or x86 firmware. Such system components can only write to HV-fixed pages or Default pages. They will error when attempting to write to pages in other page states after SNP_INIT enables their SNP enforcement. Starting in SNP firmware v1.52, the SNP_INIT_EX command takes a list of system physical address ranges to convert into the HV-fixed page states during the RMP initialization. If INIT_RMP is 1, hypervisors should provide all system physical address ranges that the hypervisor will never assign to a guest until the next RMP re-initialization. For instance, the memory that UEFI reserves should be included in the range list. This allows system components that occasionally write to memory (e.g. logging to UEFI reserved regions) to not fail due to RMP initialization and SNP enablement. Note that SNP_INIT(_EX) must not be executed while non-SEV guests are executing, otherwise it is possible that the system could reset or hang. The psp_init_on_probe module parameter was added for SEV/SEV-ES support and the init_ex_path module parameter to allow for time for the necessary file system to be mounted/available. SNP_INIT(_EX) does not use the file associated with init_ex_path. So, to avoid running into issues where SNP_INIT(_EX) is called while there are other running guests, issue it during module probe regardless of the psp_init_on_probe setting, but maintain the previous deferrable handling for SEV/SEV-ES initialization. [ mdr: Squash in psp_init_on_probe changes from Tom, reduce proliferation of 'probe' function parameter where possible. bp: Fix 32-bit allmodconfig build. ] Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Co-developed-by: Jarkko Sakkinen <jarkko@profian.com> Signed-off-by: Jarkko Sakkinen <jarkko@profian.com> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20240126041126.1927228-14-michael.roth@amd.com
2024-01-25 22:11:13 -06:00
argp->error = init_args.error;
sev_asid_free(sev);
sev->asid = 0;
e_no_asid:
sev->vmsa_features = 0;
sev->es_active = false;
sev->active = false;
return ret;
}
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_init data = {
.vmsa_features = 0,
.ghcb_version = 0,
};
unsigned long vm_type;
if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
return -EINVAL;
vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
/*
* KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
* continue to only ever support the minimal GHCB protocol version.
*/
if (vm_type == KVM_X86_SEV_ES_VM)
data.ghcb_version = GHCB_VERSION_MIN;
return __sev_guest_init(kvm, argp, &data, vm_type);
}
static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_init data;
if (!to_kvm_sev_info(kvm)->need_init)
return -EINVAL;
if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
kvm->arch.vm_type != KVM_X86_SNP_VM)
return -EINVAL;
if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
return -EFAULT;
return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
}
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
unsigned int asid = sev_get_asid(kvm);
struct sev_data_activate activate;
int ret;
/* activate ASID on the given handle */
activate.handle = handle;
activate.asid = asid;
ret = sev_guest_activate(&activate, error);
return ret;
}
static int __sev_issue_cmd(int fd, int id, void *data, int *error)
{
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
return sev_issue_cmd_external_user(fd_file(f), id, data, error);
}
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
return __sev_issue_cmd(sev->fd, id, data, error);
}
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_launch_start start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
int *error = &argp->error;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
sev->policy = params.policy;
memset(&start, 0, sizeof(start));
dh_blob = NULL;
if (params.dh_uaddr) {
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
if (IS_ERR(dh_blob))
return PTR_ERR(dh_blob);
start.dh_cert_address = __sme_set(__pa(dh_blob));
start.dh_cert_len = params.dh_len;
}
session_blob = NULL;
if (params.session_uaddr) {
session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
if (IS_ERR(session_blob)) {
ret = PTR_ERR(session_blob);
goto e_free_dh;
}
start.session_address = __sme_set(__pa(session_blob));
start.session_len = params.session_len;
}
start.handle = params.handle;
start.policy = params.policy;
/* create memory encryption context */
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
if (ret)
goto e_free_session;
/* Bind ASID to this guest */
ret = sev_bind_asid(kvm, start.handle, error);
if (ret) {
sev_decommission(start.handle);
goto e_free_session;
}
/* return handle to userspace */
params.handle = start.handle;
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
}
sev->handle = start.handle;
sev->fd = argp->sev_fd;
e_free_session:
kfree(session_blob);
e_free_dh:
kfree(dh_blob);
return ret;
}
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long ulen, unsigned long *n,
unsigned int flags)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
unsigned long npages, size;
int npinned;
unsigned long locked, lock_limit;
struct page **pages;
unsigned long first, last;
int ret;
lockdep_assert_held(&kvm->lock);
if (ulen == 0 || uaddr + ulen < uaddr)
return ERR_PTR(-EINVAL);
/* Calculate number of pages. */
first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
npages = (last - first + 1);
locked = sev->pages_locked + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
return ERR_PTR(-ENOMEM);
}
if (WARN_ON_ONCE(npages > INT_MAX))
return ERR_PTR(-EINVAL);
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
else
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
if (!pages)
return ERR_PTR(-ENOMEM);
/* Pin the user virtual address. */
npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
ret = -ENOMEM;
goto err;
}
*n = npages;
sev->pages_locked = locked;
return pages;
err:
if (npinned > 0)
unpin_user_pages(pages, npinned);
kvfree(pages);
return ERR_PTR(ret);
}
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
unsigned long npages)
{
unpin_user_pages(pages, npages);
kvfree(pages);
to_kvm_sev_info(kvm)->pages_locked -= npages;
}
static void sev_clflush_pages(struct page *pages[], unsigned long npages)
{
uint8_t *page_virtual;
unsigned long i;
if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
pages == NULL)
return;
for (i = 0; i < npages; i++) {
KVM: SVM: Replace kmap_atomic() with kmap_local_page() The use of kmap_atomic() is being deprecated in favor of kmap_local_page()[1]. The main difference between atomic and local mappings is that local mappings don't disable page faults or preemption. There're 2 reasons we can use kmap_local_page() here: 1. SEV is 64-bit only and kmap_local_page() doesn't disable migration in this case, but here the function clflush_cache_range() uses CLFLUSHOPT instruction to flush, and on x86 CLFLUSHOPT is not CPU-local and flushes the page out of the entire cache hierarchy on all CPUs (APM volume 3, chapter 3, CLFLUSHOPT). So there's no need to disable preemption to ensure CPU-local. 2. clflush_cache_range() doesn't need to disable pagefault and the mapping is still valid even if sleeps. This is also true for sched out/in when preempted. In addition, though kmap_local_page() is a thin wrapper around page_address() on 64-bit, kmap_local_page() should still be used here in preference to page_address() since page_address() isn't suitable to be used in a generic function (like sev_clflush_pages()) where the page passed in is not easy to determine the source of allocation. Keeping the kmap* API in place means it can be used for things other than highmem mappings[2]. Therefore, sev_clflush_pages() is a function that should use kmap_local_page() in place of kmap_atomic(). Convert the calls of kmap_atomic() / kunmap_atomic() to kmap_local_page() / kunmap_local(). [1]: https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com [2]: https://lore.kernel.org/lkml/5d667258-b58b-3d28-3609-e7914c99b31b@intel.com/ Suggested-by: Dave Hansen <dave.hansen@intel.com> Suggested-by: Ira Weiny <ira.weiny@intel.com> Suggested-by: Fabio M. De Francesco <fmdefrancesco@gmail.com> Signed-off-by: Zhao Liu <zhao1.liu@intel.com> Reviewed-by: Sean Christopherson <seanjc@google.com> Link: https://lore.kernel.org/r/20220928092748.463631-1-zhao1.liu@linux.intel.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2022-09-28 17:27:48 +08:00
page_virtual = kmap_local_page(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
KVM: SVM: Replace kmap_atomic() with kmap_local_page() The use of kmap_atomic() is being deprecated in favor of kmap_local_page()[1]. The main difference between atomic and local mappings is that local mappings don't disable page faults or preemption. There're 2 reasons we can use kmap_local_page() here: 1. SEV is 64-bit only and kmap_local_page() doesn't disable migration in this case, but here the function clflush_cache_range() uses CLFLUSHOPT instruction to flush, and on x86 CLFLUSHOPT is not CPU-local and flushes the page out of the entire cache hierarchy on all CPUs (APM volume 3, chapter 3, CLFLUSHOPT). So there's no need to disable preemption to ensure CPU-local. 2. clflush_cache_range() doesn't need to disable pagefault and the mapping is still valid even if sleeps. This is also true for sched out/in when preempted. In addition, though kmap_local_page() is a thin wrapper around page_address() on 64-bit, kmap_local_page() should still be used here in preference to page_address() since page_address() isn't suitable to be used in a generic function (like sev_clflush_pages()) where the page passed in is not easy to determine the source of allocation. Keeping the kmap* API in place means it can be used for things other than highmem mappings[2]. Therefore, sev_clflush_pages() is a function that should use kmap_local_page() in place of kmap_atomic(). Convert the calls of kmap_atomic() / kunmap_atomic() to kmap_local_page() / kunmap_local(). [1]: https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com [2]: https://lore.kernel.org/lkml/5d667258-b58b-3d28-3609-e7914c99b31b@intel.com/ Suggested-by: Dave Hansen <dave.hansen@intel.com> Suggested-by: Ira Weiny <ira.weiny@intel.com> Suggested-by: Fabio M. De Francesco <fmdefrancesco@gmail.com> Signed-off-by: Zhao Liu <zhao1.liu@intel.com> Reviewed-by: Sean Christopherson <seanjc@google.com> Link: https://lore.kernel.org/r/20220928092748.463631-1-zhao1.liu@linux.intel.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2022-09-28 17:27:48 +08:00
kunmap_local(page_virtual);
cond_resched();
}
}
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
static void sev_writeback_caches(struct kvm *kvm)
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
{
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
/*
* Note, the caller is responsible for ensuring correctness if the mask
* can be modified, e.g. if a CPU could be doing VMRUN.
*/
if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
return;
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
/*
* Ensure that all dirty guest tagged cache entries are written back
* before releasing the pages back to the system for use. CLFLUSH will
* not do this without SME_COHERENT, and flushing many cache lines
* individually is slower than blasting WBINVD for large VMs, so issue
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
* WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
* on CPUs that have done VMRUN, i.e. may have dirtied data using the
* VM's ASID.
*
* For simplicity, never remove CPUs from the bitmap. Ideally, KVM
* would clear the mask when flushing caches, but doing so requires
* serializing multiple calls and having responding CPUs (to the IPI)
* mark themselves as still running if they are running (or about to
* run) a vCPU for the VM.
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
*/
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
}
static unsigned long get_num_contig_pages(unsigned long idx,
struct page **inpages, unsigned long npages)
{
unsigned long paddr, next_paddr;
unsigned long i = idx + 1, pages = 1;
/* find the number of contiguous pages starting from idx */
paddr = __sme_page_pa(inpages[idx]);
while (i < npages) {
next_paddr = __sme_page_pa(inpages[i++]);
if ((paddr + PAGE_SIZE) == next_paddr) {
pages++;
paddr = next_paddr;
continue;
}
break;
}
return pages;
}
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data data;
struct page **inpages;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
vaddr = params.uaddr;
size = params.len;
vaddr_end = vaddr + size;
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
if (IS_ERR(inpages))
return PTR_ERR(inpages);
/*
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
* place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(inpages, npages);
data.reserved = 0;
data.handle = to_kvm_sev_info(kvm)->handle;
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
int offset, len;
/*
* If the user buffer is not page-aligned, calculate the offset
* within the page.
*/
offset = vaddr & (PAGE_SIZE - 1);
/* Calculate the number of pages that can be encrypted in one go. */
pages = get_num_contig_pages(i, inpages, npages);
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
data.len = len;
data.address = __sme_page_pa(inpages[i]) + offset;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
if (ret)
goto e_unpin;
size -= len;
next_vaddr = vaddr + len;
}
e_unpin:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < npages; i++) {
set_page_dirty_lock(inpages[i]);
mark_page_accessed(inpages[i]);
}
/* unlock the user pages */
sev_unpin_memory(kvm, inpages, npages);
return ret;
}
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
struct sev_es_save_area *save = svm->sev_es.vmsa;
struct xregs_state *xsave;
const u8 *s;
u8 *d;
int i;
/* Check some debug related fields before encrypting the VMSA */
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
return -EINVAL;
/*
* SEV-ES will use a VMSA that is pointed to by the VMCB, not
* the traditional VMSA that is part of the VMCB. Copy the
* traditional VMSA as it has been built so far (in prep
* for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
*/
memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
/* Sync registgers */
save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
#ifdef CONFIG_X86_64
save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
#endif
save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
/* Sync some non-GPR registers before encrypting */
save->xcr0 = svm->vcpu.arch.xcr0;
save->pkru = svm->vcpu.arch.pkru;
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
save->sev_features = sev->vmsa_features;
/*
* Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
* breaking older measurements.
*/
if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
save->x87_dp = xsave->i387.rdp;
save->mxcsr = xsave->i387.mxcsr;
save->x87_ftw = xsave->i387.twd;
save->x87_fsw = xsave->i387.swd;
save->x87_fcw = xsave->i387.cwd;
save->x87_fop = xsave->i387.fop;
save->x87_ds = 0;
save->x87_cs = 0;
save->x87_rip = xsave->i387.rip;
for (i = 0; i < 8; i++) {
/*
* The format of the x87 save area is undocumented and
* definitely not what you would expect. It consists of
* an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
* area with bytes 8-9 of each register.
*/
d = save->fpreg_x87 + i * 8;
s = ((u8 *)xsave->i387.st_space) + i * 16;
memcpy(d, s, 8);
save->fpreg_x87[64 + i * 2] = s[8];
save->fpreg_x87[64 + i * 2 + 1] = s[9];
}
memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
s = get_xsave_addr(xsave, XFEATURE_YMM);
if (s)
memcpy(save->fpreg_ymm, s, 256);
else
memset(save->fpreg_ymm, 0, 256);
}
pr_debug("Virtual Machine Save Area (VMSA):\n");
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
return 0;
}
static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
int *error)
{
struct sev_data_launch_update_vmsa vmsa;
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
if (vcpu->guest_debug) {
pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
return -EINVAL;
}
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm);
if (ret)
return ret;
/*
* The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
* the VMSA memory content (i.e it will write the same memory region
* with the guest's key), so invalidate it first.
*/
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
vmsa.reserved = 0;
vmsa.handle = to_kvm_sev_info(kvm)->handle;
vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.len = PAGE_SIZE;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
if (ret)
return ret;
/*
* SEV-ES guests maintain an encrypted version of their FPU
* state which is restored and saved on VMRUN and VMEXIT.
* Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
* do xsave/xrstor on it.
*/
fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.guest_state_protected = true;
/*
* SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it
* only after setting guest_state_protected because KVM_SET_MSRS allows
* dynamic toggling of LBRV (for performance reason) on write access to
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
*/
svm_enable_lbrv(vcpu);
return 0;
}
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_vcpu *vcpu;
unsigned long i;
int ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
kvm_for_each_vcpu(i, vcpu, kvm) {
ret = mutex_lock_killable(&vcpu->mutex);
if (ret)
return ret;
ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
mutex_unlock(&vcpu->mutex);
if (ret)
return ret;
}
return 0;
}
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = u64_to_user_ptr(argp->data);
struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
void *blob = NULL;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
goto cmd;
p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
return -ENOMEM;
data.address = __psp_pa(blob);
data.len = params.len;
}
cmd:
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
*/
if (!params.len)
goto done;
if (ret)
goto e_free_blob;
if (blob) {
if (copy_to_user(p, blob, params.len))
ret = -EFAULT;
}
done:
params.len = data.len;
if (copy_to_user(measure, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
return ret;
}
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_launch_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_guest_status params;
struct sev_data_guest_status data;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
memset(&data, 0, sizeof(data));
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
if (ret)
return ret;
params.policy = data.policy;
params.state = data.state;
params.handle = data.handle;
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
ret = -EFAULT;
return ret;
}
static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
unsigned long dst, int size,
int *error, bool enc)
{
struct sev_data_dbg data;
data.reserved = 0;
data.handle = to_kvm_sev_info(kvm)->handle;
data.dst_addr = dst;
data.src_addr = src;
data.len = size;
return sev_issue_cmd(kvm,
enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
&data, error);
}
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
unsigned long dst_paddr, int sz, int *err)
{
int offset;
/*
* Its safe to read more than we are asked, caller should ensure that
* destination has enough space.
*/
offset = src_paddr & 15;
src_paddr = round_down(src_paddr, 16);
sz = round_up(sz + offset, 16);
return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
}
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
void __user *dst_uaddr,
unsigned long dst_paddr,
int size, int *err)
{
struct page *tpage = NULL;
int ret, offset;
/* if inputs are not 16-byte then use intermediate buffer */
if (!IS_ALIGNED(dst_paddr, 16) ||
!IS_ALIGNED(paddr, 16) ||
!IS_ALIGNED(size, 16)) {
tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!tpage)
return -ENOMEM;
dst_paddr = __sme_page_pa(tpage);
}
ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
if (ret)
goto e_free;
if (tpage) {
offset = paddr & 15;
if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
ret = -EFAULT;
}
e_free:
if (tpage)
__free_page(tpage);
return ret;
}
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
void __user *vaddr,
unsigned long dst_paddr,
void __user *dst_vaddr,
int size, int *error)
{
struct page *src_tpage = NULL;
struct page *dst_tpage = NULL;
int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */
if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
if (!src_tpage)
return -ENOMEM;
if (copy_from_user(page_address(src_tpage), vaddr, size)) {
__free_page(src_tpage);
return -EFAULT;
}
paddr = __sme_page_pa(src_tpage);
}
/*
* If destination buffer or length is not aligned then do read-modify-write:
* - decrypt destination in an intermediate buffer
* - copy the source buffer in an intermediate buffer
* - use the intermediate buffer as source buffer
*/
if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
if (!dst_tpage) {
ret = -ENOMEM;
goto e_free;
}
ret = __sev_dbg_decrypt(kvm, dst_paddr,
__sme_page_pa(dst_tpage), size, error);
if (ret)
goto e_free;
/*
* If source is kernel buffer then use memcpy() otherwise
* copy_from_user().
*/
dst_offset = dst_paddr & 15;
if (src_tpage)
memcpy(page_address(dst_tpage) + dst_offset,
page_address(src_tpage), size);
else {
if (copy_from_user(page_address(dst_tpage) + dst_offset,
vaddr, size)) {
ret = -EFAULT;
goto e_free;
}
}
paddr = __sme_page_pa(dst_tpage);
dst_paddr = round_down(dst_paddr, 16);
len = round_up(size, 16);
}
ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
e_free:
if (src_tpage)
__free_page(src_tpage);
if (dst_tpage)
__free_page(dst_tpage);
return ret;
}
static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
{
unsigned long vaddr, vaddr_end, next_vaddr;
unsigned long dst_vaddr;
struct page **src_p, **dst_p;
struct kvm_sev_dbg debug;
unsigned long n;
unsigned int size;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
return -EFAULT;
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
return -EINVAL;
if (!debug.dst_uaddr)
return -EINVAL;
vaddr = debug.src_uaddr;
size = debug.len;
vaddr_end = vaddr + size;
dst_vaddr = debug.dst_uaddr;
for (; vaddr < vaddr_end; vaddr = next_vaddr) {
int len, s_off, d_off;
/* lock userspace source and destination page */
src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
if (IS_ERR(src_p))
return PTR_ERR(src_p);
dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
if (IS_ERR(dst_p)) {
sev_unpin_memory(kvm, src_p, n);
return PTR_ERR(dst_p);
}
/*
* Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
* the pages; flush the destination too so that future accesses do not
* see stale data.
*/
sev_clflush_pages(src_p, 1);
sev_clflush_pages(dst_p, 1);
/*
* Since user buffer may not be page aligned, calculate the
* offset within the page.
*/
s_off = vaddr & ~PAGE_MASK;
d_off = dst_vaddr & ~PAGE_MASK;
len = min_t(size_t, (PAGE_SIZE - s_off), size);
if (dec)
ret = __sev_dbg_decrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
(void __user *)dst_vaddr,
__sme_page_pa(dst_p[0]) + d_off,
len, &argp->error);
else
ret = __sev_dbg_encrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off,
(void __user *)vaddr,
__sme_page_pa(dst_p[0]) + d_off,
(void __user *)dst_vaddr,
len, &argp->error);
sev_unpin_memory(kvm, src_p, n);
sev_unpin_memory(kvm, dst_p, n);
if (ret)
goto err;
next_vaddr = vaddr + len;
dst_vaddr = dst_vaddr + len;
size -= len;
}
err:
return ret;
}
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_launch_secret data;
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
unsigned long n, i;
int ret, offset;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
if (IS_ERR(pages))
return PTR_ERR(pages);
/*
* Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
* place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(pages, n);
/*
* The secret must be copied into contiguous memory region, lets verify
* that userspace memory pages are contiguous before we issue command.
*/
if (get_num_contig_pages(0, pages, n) != n) {
ret = -EINVAL;
goto e_unpin_memory;
}
memset(&data, 0, sizeof(data));
offset = params.guest_uaddr & (PAGE_SIZE - 1);
data.guest_address = __sme_page_pa(pages[0]) + offset;
data.guest_len = params.guest_len;
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
if (IS_ERR(blob)) {
ret = PTR_ERR(blob);
goto e_unpin_memory;
}
data.trans_address = __psp_pa(blob);
data.trans_len = params.trans_len;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
if (IS_ERR(hdr)) {
ret = PTR_ERR(hdr);
goto e_free_blob;
}
data.hdr_address = __psp_pa(hdr);
data.hdr_len = params.hdr_len;
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
kfree(hdr);
e_free_blob:
kfree(blob);
e_unpin_memory:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < n; i++) {
set_page_dirty_lock(pages[i]);
mark_page_accessed(pages[i]);
}
sev_unpin_memory(kvm, pages, n);
return ret;
}
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *report = u64_to_user_ptr(argp->data);
struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
void __user *p;
void *blob = NULL;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
goto cmd;
p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
return -ENOMEM;
data.address = __psp_pa(blob);
data.len = params.len;
memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
}
cmd:
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
*/
if (!params.len)
goto done;
if (ret)
goto e_free_blob;
if (blob) {
if (copy_to_user(p, blob, params.len))
ret = -EFAULT;
}
done:
params.len = data.len;
if (copy_to_user(report, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
return ret;
}
/* Userspace wants to query session length. */
static int
__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_send_start *params)
{
struct sev_data_send_start data;
int ret;
memset(&data, 0, sizeof(data));
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
params->session_len = data.session_len;
if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
return ret;
}
static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_send_start data;
struct kvm_sev_send_start params;
void *amd_certs, *session_data;
void *pdh_cert, *plat_certs;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_start)))
return -EFAULT;
/* if session_len is zero, userspace wants to query the session length */
if (!params.session_len)
return __sev_send_start_query_session_length(kvm, argp,
&params);
/* some sanity checks */
if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
!params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
/* allocate the memory to hold the session data blob */
session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
if (!session_data)
return -ENOMEM;
/* copy the certificate blobs from userspace */
pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
params.pdh_cert_len);
if (IS_ERR(pdh_cert)) {
ret = PTR_ERR(pdh_cert);
goto e_free_session;
}
plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
params.plat_certs_len);
if (IS_ERR(plat_certs)) {
ret = PTR_ERR(plat_certs);
goto e_free_pdh;
}
amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
params.amd_certs_len);
if (IS_ERR(amd_certs)) {
ret = PTR_ERR(amd_certs);
goto e_free_plat_cert;
}
/* populate the FW SEND_START field with system physical address */
memset(&data, 0, sizeof(data));
data.pdh_cert_address = __psp_pa(pdh_cert);
data.pdh_cert_len = params.pdh_cert_len;
data.plat_certs_address = __psp_pa(plat_certs);
data.plat_certs_len = params.plat_certs_len;
data.amd_certs_address = __psp_pa(amd_certs);
data.amd_certs_len = params.amd_certs_len;
data.session_address = __psp_pa(session_data);
data.session_len = params.session_len;
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
session_data, params.session_len)) {
ret = -EFAULT;
goto e_free_amd_cert;
}
params.policy = data.policy;
params.session_len = data.session_len;
if (copy_to_user(u64_to_user_ptr(argp->data), &params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
e_free_amd_cert:
kfree(amd_certs);
e_free_plat_cert:
kfree(plat_certs);
e_free_pdh:
kfree(pdh_cert);
e_free_session:
kfree(session_data);
return ret;
}
/* Userspace wants to query either header or trans length. */
static int
__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct kvm_sev_send_update_data *params)
{
struct sev_data_send_update_data data;
int ret;
memset(&data, 0, sizeof(data));
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_update_data)))
ret = -EFAULT;
return ret;
}
static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_send_update_data data;
struct kvm_sev_send_update_data params;
void *hdr, *trans_data;
struct page **guest_page;
unsigned long n;
int ret, offset;
if (!sev_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_update_data)))
return -EFAULT;
/* userspace wants to query either header or trans length */
if (!params.trans_len || !params.hdr_len)
return __sev_send_update_data_query_lengths(kvm, argp, &params);
if (!params.trans_uaddr || !params.guest_uaddr ||
!params.guest_len || !params.hdr_uaddr)
return -EINVAL;
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
/* Pin guest memory */
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
PAGE_SIZE, &n, 0);
if (IS_ERR(guest_page))
return PTR_ERR(guest_page);
/* allocate memory for header and transport buffer */
ret = -ENOMEM;
hdr = kzalloc(params.hdr_len, GFP_KERNEL);
if (!hdr)
goto e_unpin;
trans_data = kzalloc(params.trans_len, GFP_KERNEL);
if (!trans_data)
goto e_free_hdr;
memset(&data, 0, sizeof(data));
data.hdr_address = __psp_pa(hdr);
data.hdr_len = params.hdr_len;
data.trans_address = __psp_pa(trans_data);
data.trans_len = params.trans_len;
/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
if (ret)
goto e_free_trans_data;
/* copy transport buffer to user space */
if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
trans_data, params.trans_len)) {
ret = -EFAULT;
goto e_free_trans_data;
}
/* Copy packet header to userspace. */
if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
params.hdr_len))
ret = -EFAULT;
e_free_trans_data:
kfree(trans_data);
e_free_hdr:
kfree(hdr);
e_unpin:
sev_unpin_memory(kvm, guest_page, n);
return ret;
}
static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_send_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
}
static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_send_cancel data;
if (!sev_guest(kvm))
return -ENOTTY;
data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
}
static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_receive_start start;
struct kvm_sev_receive_start params;
int *error = &argp->error;
void *session_data;
void *pdh_data;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
/* Get parameter from the userspace */
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_start)))
return -EFAULT;
/* some sanity checks */
if (!params.pdh_uaddr || !params.pdh_len ||
!params.session_uaddr || !params.session_len)
return -EINVAL;
pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
if (IS_ERR(pdh_data))
return PTR_ERR(pdh_data);
session_data = psp_copy_user_blob(params.session_uaddr,
params.session_len);
if (IS_ERR(session_data)) {
ret = PTR_ERR(session_data);
goto e_free_pdh;
}
memset(&start, 0, sizeof(start));
start.handle = params.handle;
start.policy = params.policy;
start.pdh_cert_address = __psp_pa(pdh_data);
start.pdh_cert_len = params.pdh_len;
start.session_address = __psp_pa(session_data);
start.session_len = params.session_len;
/* create memory encryption context */
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
error);
if (ret)
goto e_free_session;
/* Bind ASID to this guest */
ret = sev_bind_asid(kvm, start.handle, error);
KVM: SVM: fix missing sev_decommission in sev_receive_start DECOMMISSION the current SEV context if binding an ASID fails after RECEIVE_START. Per AMD's SEV API, RECEIVE_START generates a new guest context and thus needs to be paired with DECOMMISSION: The RECEIVE_START command is the only command other than the LAUNCH_START command that generates a new guest context and guest handle. The missing DECOMMISSION can result in subsequent SEV launch failures, as the firmware leaks memory and might not able to allocate more SEV guest contexts in the future. Note, LAUNCH_START suffered the same bug, but was previously fixed by commit 934002cd660b ("KVM: SVM: Call SEV Guest Decommission if ASID binding fails"). Cc: Alper Gun <alpergun@google.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brijesh Singh <brijesh.singh@amd.com> Cc: David Rienjes <rientjes@google.com> Cc: Marc Orr <marcorr@google.com> Cc: John Allen <john.allen@amd.com> Cc: Peter Gonda <pgonda@google.com> Cc: Sean Christopherson <seanjc@google.com> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Vipin Sharma <vipinsh@google.com> Cc: stable@vger.kernel.org Reviewed-by: Marc Orr <marcorr@google.com> Acked-by: Brijesh Singh <brijesh.singh@amd.com> Fixes: af43cbbf954b ("KVM: SVM: Add support for KVM_SEV_RECEIVE_START command") Signed-off-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Sean Christopherson <seanjc@google.com> Message-Id: <20210912181815.3899316-1-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-09-12 18:18:15 +00:00
if (ret) {
sev_decommission(start.handle);
goto e_free_session;
KVM: SVM: fix missing sev_decommission in sev_receive_start DECOMMISSION the current SEV context if binding an ASID fails after RECEIVE_START. Per AMD's SEV API, RECEIVE_START generates a new guest context and thus needs to be paired with DECOMMISSION: The RECEIVE_START command is the only command other than the LAUNCH_START command that generates a new guest context and guest handle. The missing DECOMMISSION can result in subsequent SEV launch failures, as the firmware leaks memory and might not able to allocate more SEV guest contexts in the future. Note, LAUNCH_START suffered the same bug, but was previously fixed by commit 934002cd660b ("KVM: SVM: Call SEV Guest Decommission if ASID binding fails"). Cc: Alper Gun <alpergun@google.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brijesh Singh <brijesh.singh@amd.com> Cc: David Rienjes <rientjes@google.com> Cc: Marc Orr <marcorr@google.com> Cc: John Allen <john.allen@amd.com> Cc: Peter Gonda <pgonda@google.com> Cc: Sean Christopherson <seanjc@google.com> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Vipin Sharma <vipinsh@google.com> Cc: stable@vger.kernel.org Reviewed-by: Marc Orr <marcorr@google.com> Acked-by: Brijesh Singh <brijesh.singh@amd.com> Fixes: af43cbbf954b ("KVM: SVM: Add support for KVM_SEV_RECEIVE_START command") Signed-off-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Sean Christopherson <seanjc@google.com> Message-Id: <20210912181815.3899316-1-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-09-12 18:18:15 +00:00
}
params.handle = start.handle;
if (copy_to_user(u64_to_user_ptr(argp->data),
&params, sizeof(struct kvm_sev_receive_start))) {
ret = -EFAULT;
sev_unbind_asid(kvm, start.handle);
goto e_free_session;
}
sev->handle = start.handle;
sev->fd = argp->sev_fd;
e_free_session:
kfree(session_data);
e_free_pdh:
kfree(pdh_data);
return ret;
}
static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_receive_update_data params;
struct sev_data_receive_update_data data;
void *hdr = NULL, *trans = NULL;
struct page **guest_page;
unsigned long n;
int ret, offset;
if (!sev_guest(kvm))
return -EINVAL;
if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_update_data)))
return -EFAULT;
if (!params.hdr_uaddr || !params.hdr_len ||
!params.guest_uaddr || !params.guest_len ||
!params.trans_uaddr || !params.trans_len)
return -EINVAL;
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
if (IS_ERR(hdr))
return PTR_ERR(hdr);
trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto e_free_hdr;
}
memset(&data, 0, sizeof(data));
data.hdr_address = __psp_pa(hdr);
data.hdr_len = params.hdr_len;
data.trans_address = __psp_pa(trans);
data.trans_len = params.trans_len;
/* Pin guest memory */
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
PAGE_SIZE, &n, FOLL_WRITE);
if (IS_ERR(guest_page)) {
ret = PTR_ERR(guest_page);
goto e_free_trans;
}
/*
* Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
* encrypts the written data with the guest's key, and the cache may
* contain dirty, unencrypted data.
*/
sev_clflush_pages(guest_page, n);
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
&argp->error);
sev_unpin_memory(kvm, guest_page, n);
e_free_trans:
kfree(trans);
e_free_hdr:
kfree(hdr);
return ret;
}
static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_receive_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
data.handle = to_kvm_sev_info(kvm)->handle;
return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
}
static bool is_cmd_allowed_from_mirror(u32 cmd_id)
{
/*
* Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
* active mirror VMs. Also allow the debugging and status commands.
*/
if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
cmd_id == KVM_SEV_DBG_ENCRYPT)
return true;
return false;
}
static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
int r = -EBUSY;
if (dst_kvm == src_kvm)
return -EINVAL;
/*
* Bail if these VMs are already involved in a migration to avoid
* deadlock between two VMs trying to migrate to/from each other.
*/
if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
return -EBUSY;
if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
goto release_dst;
r = -EINTR;
if (mutex_lock_killable(&dst_kvm->lock))
goto release_src;
if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
goto unlock_dst;
return 0;
unlock_dst:
mutex_unlock(&dst_kvm->lock);
release_src:
atomic_set_release(&src_sev->migration_in_progress, 0);
release_dst:
atomic_set_release(&dst_sev->migration_in_progress, 0);
return r;
}
static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
mutex_unlock(&dst_kvm->lock);
mutex_unlock(&src_kvm->lock);
atomic_set_release(&dst_sev->migration_in_progress, 0);
atomic_set_release(&src_sev->migration_in_progress, 0);
}
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
struct kvm_vcpu *dst_vcpu, *src_vcpu;
struct vcpu_svm *dst_svm, *src_svm;
struct kvm_sev_info *mirror;
unsigned long i;
dst->active = true;
dst->asid = src->asid;
dst->handle = src->handle;
dst->pages_locked = src->pages_locked;
dst->enc_context_owner = src->enc_context_owner;
dst->es_active = src->es_active;
dst->vmsa_features = src->vmsa_features;
src->asid = 0;
src->active = false;
src->handle = 0;
src->pages_locked = 0;
src->enc_context_owner = NULL;
src->es_active = false;
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
/*
* If this VM has mirrors, "transfer" each mirror's refcount of the
* source to the destination (this KVM). The caller holds a reference
* to the source, so there's no danger of use-after-free.
*/
list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
kvm_get_kvm(dst_kvm);
kvm_put_kvm(src_kvm);
mirror->enc_context_owner = dst_kvm;
}
/*
* If this VM is a mirror, remove the old mirror from the owners list
* and add the new mirror to the list.
*/
if (is_mirroring_enc_context(dst_kvm)) {
struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
list_del(&src->mirror_entry);
list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
}
kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
dst_svm = to_svm(dst_vcpu);
sev_init_vmcb(dst_svm);
if (!dst->es_active)
continue;
/*
* Note, the source is not required to have the same number of
* vCPUs as the destination when migrating a vanilla SEV VM.
*/
KVM: SVM: Get source vCPUs from source VM for SEV-ES intrahost migration Fix a goof where KVM tries to grab source vCPUs from the destination VM when doing intrahost migration. Grabbing the wrong vCPU not only hoses the guest, it also crashes the host due to the VMSA pointer being left NULL. BUG: unable to handle page fault for address: ffffe38687000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 39 PID: 17143 Comm: sev_migrate_tes Tainted: GO 6.5.0-smp--fff2e47e6c3b-next #151 Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 34.28.0 07/10/2023 RIP: 0010:__free_pages+0x15/0xd0 RSP: 0018:ffff923fcf6e3c78 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffe38687000000 RCX: 0000000000000100 RDX: 0000000000000100 RSI: 0000000000000000 RDI: ffffe38687000000 RBP: ffff923fcf6e3c88 R08: ffff923fcafb0000 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff83619b90 R12: ffff923fa9540000 R13: 0000000000080007 R14: ffff923f6d35d000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff929d0d7c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffe38687000000 CR3: 0000005224c34005 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: <TASK> sev_free_vcpu+0xcb/0x110 [kvm_amd] svm_vcpu_free+0x75/0xf0 [kvm_amd] kvm_arch_vcpu_destroy+0x36/0x140 [kvm] kvm_destroy_vcpus+0x67/0x100 [kvm] kvm_arch_destroy_vm+0x161/0x1d0 [kvm] kvm_put_kvm+0x276/0x560 [kvm] kvm_vm_release+0x25/0x30 [kvm] __fput+0x106/0x280 ____fput+0x12/0x20 task_work_run+0x86/0xb0 do_exit+0x2e3/0x9c0 do_group_exit+0xb1/0xc0 __x64_sys_exit_group+0x1b/0x20 do_syscall_64+0x41/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd </TASK> CR2: ffffe38687000000 Fixes: 6defa24d3b12 ("KVM: SEV: Init target VMCBs in sev_migrate_from") Cc: stable@vger.kernel.org Cc: Peter Gonda <pgonda@google.com> Reviewed-by: Peter Gonda <pgonda@google.com> Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com> Link: https://lore.kernel.org/r/20230825022357.2852133-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2023-08-24 19:23:56 -07:00
src_vcpu = kvm_get_vcpu(src_kvm, i);
src_svm = to_svm(src_vcpu);
/*
* Transfer VMSA and GHCB state to the destination. Nullify and
* clear source fields as appropriate, the state now belongs to
* the destination.
*/
memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
dst_vcpu->arch.guest_state_protected = true;
memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
src_vcpu->arch.guest_state_protected = false;
}
}
static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
{
struct kvm_vcpu *src_vcpu;
unsigned long i;
KVM: SVM: Reject SEV{-ES} intra host migration if vCPU creation is in-flight Reject migration of SEV{-ES} state if either the source or destination VM is actively creating a vCPU, i.e. if kvm_vm_ioctl_create_vcpu() is in the section between incrementing created_vcpus and online_vcpus. The bulk of vCPU creation runs _outside_ of kvm->lock to allow creating multiple vCPUs in parallel, and so sev_info.es_active can get toggled from false=>true in the destination VM after (or during) svm_vcpu_create(), resulting in an SEV{-ES} VM effectively having a non-SEV{-ES} vCPU. The issue manifests most visibly as a crash when trying to free a vCPU's NULL VMSA page in an SEV-ES VM, but any number of things can go wrong. BUG: unable to handle page fault for address: ffffebde00000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP KASAN NOPTI CPU: 227 UID: 0 PID: 64063 Comm: syz.5.60023 Tainted: G U O 6.15.0-smp-DEV #2 NONE Tainted: [U]=USER, [O]=OOT_MODULE Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 12.52.0-0 10/28/2024 RIP: 0010:constant_test_bit arch/x86/include/asm/bitops.h:206 [inline] RIP: 0010:arch_test_bit arch/x86/include/asm/bitops.h:238 [inline] RIP: 0010:_test_bit include/asm-generic/bitops/instrumented-non-atomic.h:142 [inline] RIP: 0010:PageHead include/linux/page-flags.h:866 [inline] RIP: 0010:___free_pages+0x3e/0x120 mm/page_alloc.c:5067 Code: <49> f7 06 40 00 00 00 75 05 45 31 ff eb 0c 66 90 4c 89 f0 4c 39 f0 RSP: 0018:ffff8984551978d0 EFLAGS: 00010246 RAX: 0000777f80000001 RBX: 0000000000000000 RCX: ffffffff918aeb98 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffebde00000000 RBP: 0000000000000000 R08: ffffebde00000007 R09: 1ffffd7bc0000000 R10: dffffc0000000000 R11: fffff97bc0000001 R12: dffffc0000000000 R13: ffff8983e19751a8 R14: ffffebde00000000 R15: 1ffffd7bc0000000 FS: 0000000000000000(0000) GS:ffff89ee661d3000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffebde00000000 CR3: 000000793ceaa000 CR4: 0000000000350ef0 DR0: 0000000000000000 DR1: 0000000000000b5f DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: <TASK> sev_free_vcpu+0x413/0x630 arch/x86/kvm/svm/sev.c:3169 svm_vcpu_free+0x13a/0x2a0 arch/x86/kvm/svm/svm.c:1515 kvm_arch_vcpu_destroy+0x6a/0x1d0 arch/x86/kvm/x86.c:12396 kvm_vcpu_destroy virt/kvm/kvm_main.c:470 [inline] kvm_destroy_vcpus+0xd1/0x300 virt/kvm/kvm_main.c:490 kvm_arch_destroy_vm+0x636/0x820 arch/x86/kvm/x86.c:12895 kvm_put_kvm+0xb8e/0xfb0 virt/kvm/kvm_main.c:1310 kvm_vm_release+0x48/0x60 virt/kvm/kvm_main.c:1369 __fput+0x3e4/0x9e0 fs/file_table.c:465 task_work_run+0x1a9/0x220 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0x7f0/0x25b0 kernel/exit.c:953 do_group_exit+0x203/0x2d0 kernel/exit.c:1102 get_signal+0x1357/0x1480 kernel/signal.c:3034 arch_do_signal_or_restart+0x40/0x690 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x67/0xb0 kernel/entry/common.c:218 do_syscall_64+0x7c/0x150 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f87a898e969 </TASK> Modules linked in: gq(O) gsmi: Log Shutdown Reason 0x03 CR2: ffffebde00000000 ---[ end trace 0000000000000000 ]--- Deliberately don't check for a NULL VMSA when freeing the vCPU, as crashing the host is likely desirable due to the VMSA being consumed by hardware. E.g. if KVM manages to allow VMRUN on the vCPU, hardware may read/write a bogus VMSA page. Accessing PFN 0 is "fine"-ish now that it's sequestered away thanks to L1TF, but panicking in this scenario is preferable to potentially running with corrupted state. Reported-by: Alexander Potapenko <glider@google.com> Tested-by: Alexander Potapenko <glider@google.com> Fixes: 0b020f5af092 ("KVM: SEV: Add support for SEV-ES intra host migration") Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration") Cc: stable@vger.kernel.org Cc: James Houghton <jthoughton@google.com> Cc: Peter Gonda <pgonda@google.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> Tested-by: Liam Merwick <liam.merwick@oracle.com> Reviewed-by: James Houghton <jthoughton@google.com> Link: https://lore.kernel.org/r/20250602224459.41505-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-06-02 15:44:58 -07:00
if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
dst->created_vcpus != atomic_read(&dst->online_vcpus))
return -EBUSY;
if (!sev_es_guest(src))
return 0;
if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
return -EINVAL;
kvm_for_each_vcpu(i, src_vcpu, src) {
if (!src_vcpu->arch.guest_state_protected)
return -EINVAL;
}
return 0;
}
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
CLASS(fd, f)(source_fd);
struct kvm *source_kvm;
bool charged = false;
int ret;
if (fd_empty(f))
return -EBADF;
if (!file_is_kvm(fd_file(f)))
return -EBADF;
source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
return ret;
if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
goto out_unlock;
}
src_sev = to_kvm_sev_info(source_kvm);
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
ret = sev_misc_cg_try_charge(dst_sev);
if (ret)
goto out_dst_cgroup;
charged = true;
}
ret = kvm_lock_all_vcpus(kvm);
if (ret)
goto out_dst_cgroup;
ret = kvm_lock_all_vcpus(source_kvm);
if (ret)
goto out_dst_vcpu;
ret = sev_check_source_vcpus(kvm, source_kvm);
if (ret)
goto out_source_vcpu;
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
/*
* Allocate a new have_run_cpus for the destination, i.e. don't copy
* the set of CPUs from the source. If a CPU was used to run a vCPU in
* the source VM but is never used for the destination VM, then the CPU
* can only have cached memory that was accessible to the source VM.
*/
if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM;
goto out_source_vcpu;
}
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
out_source_vcpu:
kvm_unlock_all_vcpus(source_kvm);
out_dst_vcpu:
kvm_unlock_all_vcpus(kvm);
out_dst_cgroup:
/* Operates on the source on success, on the destination on failure. */
if (charged)
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
out_unlock:
sev_unlock_two_vms(kvm, source_kvm);
return ret;
}
int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
{
if (group != KVM_X86_GRP_SEV)
return -ENXIO;
switch (attr) {
case KVM_X86_SEV_VMSA_FEATURES:
*val = sev_supported_vmsa_features;
return 0;
default:
return -ENXIO;
}
}
/*
* The guest context contains all the information, keys and metadata
* associated with the guest that the firmware tracks to implement SEV
* and SNP features. The firmware stores the guest context in hypervisor
* provide page via the SNP_GCTX_CREATE command.
*/
static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct sev_data_snp_addr data = {};
void *context;
int rc;
/* Allocate memory for context page */
context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
if (!context)
return NULL;
data.address = __psp_pa(context);
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
if (rc) {
pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
rc, argp->error);
snp_free_firmware_page(context);
return NULL;
}
return context;
}
static int snp_bind_asid(struct kvm *kvm, int *error)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_activate data = {0};
data.gctx_paddr = __psp_pa(sev->snp_context);
data.asid = sev_get_asid(kvm);
return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
}
static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_launch_start start = {0};
struct kvm_sev_snp_launch_start params;
int rc;
if (!sev_snp_guest(kvm))
return -ENOTTY;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
/* Don't allow userspace to allocate memory for more than 1 SNP context. */
if (sev->snp_context)
return -EINVAL;
if (params.flags)
return -EINVAL;
if (params.policy & ~SNP_POLICY_MASK_VALID)
return -EINVAL;
/* Check for policy bits that must be set */
if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
return -EINVAL;
sev->policy = params.policy;
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
return -ENOTTY;
start.gctx_paddr = __psp_pa(sev->snp_context);
start.policy = params.policy;
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
if (rc) {
pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
__func__, rc);
goto e_free_context;
}
sev->fd = argp->sev_fd;
rc = snp_bind_asid(kvm, &argp->error);
if (rc) {
pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
__func__, rc);
goto e_free_context;
}
return 0;
e_free_context:
snp_decommission_context(kvm);
return rc;
}
struct sev_gmem_populate_args {
__u8 type;
int sev_fd;
int fw_error;
};
static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
void __user *src, int order, void *opaque)
{
struct sev_gmem_populate_args *sev_populate_args = opaque;
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
int n_private = 0, ret, i;
int npages = (1 << order);
gfn_t gfn;
if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
return -EINVAL;
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
struct sev_data_snp_launch_update fw_args = {0};
bool assigned = false;
int level;
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
if (ret || assigned) {
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
__func__, gfn, ret, assigned);
ret = ret ? -EINVAL : -EEXIST;
goto err;
}
if (src) {
void *vaddr = kmap_local_pfn(pfn + i);
if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
ret = -EFAULT;
goto err;
}
kunmap_local(vaddr);
}
ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
sev_get_asid(kvm), true);
if (ret)
goto err;
n_private++;
fw_args.gctx_paddr = __psp_pa(sev->snp_context);
fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
fw_args.page_type = sev_populate_args->type;
ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
&fw_args, &sev_populate_args->fw_error);
if (ret)
goto fw_err;
}
return 0;
fw_err:
/*
* If the firmware command failed handle the reclaim and cleanup of that
* PFN specially vs. prior pages which can be cleaned up below without
* needing to reclaim in advance.
*
* Additionally, when invalid CPUID function entries are detected,
* firmware writes the expected values into the page and leaves it
* unencrypted so it can be used for debugging and error-reporting.
*
* Copy this page back into the source buffer so userspace can use this
* information to provide information on which CPUID leaves/fields
* failed CPUID validation.
*/
if (!snp_page_reclaim(kvm, pfn + i) &&
sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
void *vaddr = kmap_local_pfn(pfn + i);
if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
pr_debug("Failed to write CPUID page back to userspace\n");
kunmap_local(vaddr);
}
/* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
n_private--;
err:
pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
__func__, ret, sev_populate_args->fw_error, n_private);
for (i = 0; i < n_private; i++)
kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
return ret;
}
static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_gmem_populate_args sev_populate_args = {0};
struct kvm_sev_snp_launch_update params;
struct kvm_memory_slot *memslot;
long npages, count;
void __user *src;
int ret = 0;
if (!sev_snp_guest(kvm) || !sev->snp_context)
return -EINVAL;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
params.gfn_start, params.len, params.type, params.flags);
if (!PAGE_ALIGNED(params.len) || params.flags ||
(params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
return -EINVAL;
npages = params.len / PAGE_SIZE;
/*
* For each GFN that's being prepared as part of the initial guest
* state, the following pre-conditions are verified:
*
* 1) The backing memslot is a valid private memslot.
* 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
* beforehand.
* 3) The PFN of the guest_memfd has not already been set to private
* in the RMP table.
*
* The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
* faults if there's a race between a fault and an attribute update via
* KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
* here. However, kvm->slots_lock guards against both this as well as
* concurrent memslot updates occurring while these checks are being
* performed, so use that here to make it easier to reason about the
* initial expected state and better guard against unexpected
* situations.
*/
mutex_lock(&kvm->slots_lock);
memslot = gfn_to_memslot(kvm, params.gfn_start);
if (!kvm_slot_can_be_private(memslot)) {
ret = -EINVAL;
goto out;
}
sev_populate_args.sev_fd = argp->sev_fd;
sev_populate_args.type = params.type;
src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
sev_gmem_post_populate, &sev_populate_args);
if (count < 0) {
argp->error = sev_populate_args.fw_error;
pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
__func__, count, argp->error);
ret = -EIO;
} else {
params.gfn_start += count;
params.len -= count * PAGE_SIZE;
if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
params.uaddr += count * PAGE_SIZE;
ret = 0;
if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
ret = -EFAULT;
}
out:
mutex_unlock(&kvm->slots_lock);
return ret;
}
static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_launch_update data = {};
struct kvm_vcpu *vcpu;
unsigned long i;
int ret;
data.gctx_paddr = __psp_pa(sev->snp_context);
data.page_type = SNP_PAGE_TYPE_VMSA;
kvm_for_each_vcpu(i, vcpu, kvm) {
struct vcpu_svm *svm = to_svm(vcpu);
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
ret = sev_es_sync_vmsa(svm);
if (ret)
return ret;
/* Transition the VMSA page to a firmware state. */
ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
if (ret)
return ret;
/* Issue the SNP command to encrypt the VMSA */
data.address = __sme_pa(svm->sev_es.vmsa);
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
&data, &argp->error);
if (ret) {
snp_page_reclaim(kvm, pfn);
return ret;
}
svm->vcpu.arch.guest_state_protected = true;
/*
* SEV-ES (and thus SNP) guest mandates LBR Virtualization to
* be _always_ ON. Enable it only after setting
* guest_state_protected because KVM_SET_MSRS allows dynamic
* toggling of LBRV (for performance reason) on write access to
* MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
*/
svm_enable_lbrv(vcpu);
}
return 0;
}
static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct kvm_sev_snp_launch_finish params;
struct sev_data_snp_launch_finish *data;
void *id_block = NULL, *id_auth = NULL;
int ret;
if (!sev_snp_guest(kvm))
return -ENOTTY;
if (!sev->snp_context)
return -EINVAL;
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
if (params.flags)
return -EINVAL;
/* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
ret = snp_launch_update_vmsa(kvm, argp);
if (ret)
return ret;
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
if (params.id_block_en) {
id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
if (IS_ERR(id_block)) {
ret = PTR_ERR(id_block);
goto e_free;
}
data->id_block_en = 1;
data->id_block_paddr = __sme_pa(id_block);
id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
if (IS_ERR(id_auth)) {
ret = PTR_ERR(id_auth);
goto e_free_id_block;
}
data->id_auth_paddr = __sme_pa(id_auth);
if (params.auth_key_en)
data->auth_key_en = 1;
}
data->vcek_disabled = params.vcek_disabled;
memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
data->gctx_paddr = __psp_pa(sev->snp_context);
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
/*
* Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
* can be given to the guest simply by marking the RMP entry as private.
* This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
*/
if (!ret)
kvm->arch.pre_fault_allowed = true;
kfree(id_auth);
e_free_id_block:
kfree(id_block);
e_free:
kfree(data);
return ret;
}
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
int r;
if (!sev_enabled)
return -ENOTTY;
if (!argp)
return 0;
if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
return -EFAULT;
mutex_lock(&kvm->lock);
/* Only the enc_context_owner handles some memory enc operations. */
if (is_mirroring_enc_context(kvm) &&
!is_cmd_allowed_from_mirror(sev_cmd.id)) {
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
r = -EINVAL;
goto out;
}
/*
* Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
* allow the use of SNP-specific commands.
*/
if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
r = -EPERM;
goto out;
}
switch (sev_cmd.id) {
case KVM_SEV_ES_INIT:
if (!sev_es_enabled) {
r = -ENOTTY;
goto out;
}
fallthrough;
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
case KVM_SEV_INIT2:
r = sev_guest_init2(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_UPDATE_DATA:
r = sev_launch_update_data(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_UPDATE_VMSA:
r = sev_launch_update_vmsa(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_MEASURE:
r = sev_launch_measure(kvm, &sev_cmd);
break;
case KVM_SEV_LAUNCH_FINISH:
r = sev_launch_finish(kvm, &sev_cmd);
break;
case KVM_SEV_GUEST_STATUS:
r = sev_guest_status(kvm, &sev_cmd);
break;
case KVM_SEV_DBG_DECRYPT:
r = sev_dbg_crypt(kvm, &sev_cmd, true);
break;
case KVM_SEV_DBG_ENCRYPT:
r = sev_dbg_crypt(kvm, &sev_cmd, false);
break;
case KVM_SEV_LAUNCH_SECRET:
r = sev_launch_secret(kvm, &sev_cmd);
break;
case KVM_SEV_GET_ATTESTATION_REPORT:
r = sev_get_attestation_report(kvm, &sev_cmd);
break;
case KVM_SEV_SEND_START:
r = sev_send_start(kvm, &sev_cmd);
break;
case KVM_SEV_SEND_UPDATE_DATA:
r = sev_send_update_data(kvm, &sev_cmd);
break;
case KVM_SEV_SEND_FINISH:
r = sev_send_finish(kvm, &sev_cmd);
break;
case KVM_SEV_SEND_CANCEL:
r = sev_send_cancel(kvm, &sev_cmd);
break;
case KVM_SEV_RECEIVE_START:
r = sev_receive_start(kvm, &sev_cmd);
break;
case KVM_SEV_RECEIVE_UPDATE_DATA:
r = sev_receive_update_data(kvm, &sev_cmd);
break;
case KVM_SEV_RECEIVE_FINISH:
r = sev_receive_finish(kvm, &sev_cmd);
break;
case KVM_SEV_SNP_LAUNCH_START:
r = snp_launch_start(kvm, &sev_cmd);
break;
case KVM_SEV_SNP_LAUNCH_UPDATE:
r = snp_launch_update(kvm, &sev_cmd);
break;
case KVM_SEV_SNP_LAUNCH_FINISH:
r = snp_launch_finish(kvm, &sev_cmd);
break;
default:
r = -EINVAL;
goto out;
}
if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
r = -EFAULT;
out:
mutex_unlock(&kvm->lock);
return r;
}
int sev_mem_enc_register_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct enc_region *region;
int ret = 0;
if (!sev_guest(kvm))
return -ENOTTY;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
/* If kvm is mirroring encryption context it isn't responsible for it */
if (is_mirroring_enc_context(kvm))
return -EINVAL;
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
if (!region)
return -ENOMEM;
mutex_lock(&kvm->lock);
region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
FOLL_WRITE | FOLL_LONGTERM);
if (IS_ERR(region->pages)) {
ret = PTR_ERR(region->pages);
mutex_unlock(&kvm->lock);
goto e_free;
}
/*
* The guest may change the memory encryption attribute from C=0 -> C=1
* or vice versa for this memory range. Lets make sure caches are
* flushed to ensure that guest data gets written into memory with
* correct C-bit. Note, this must be done before dropping kvm->lock,
* as region and its array of pages can be freed by a different task
* once kvm->lock is released.
*/
sev_clflush_pages(region->pages, region->npages);
region->uaddr = range->addr;
region->size = range->size;
list_add_tail(&region->list, &sev->regions_list);
mutex_unlock(&kvm->lock);
return ret;
e_free:
kfree(region);
return ret;
}
static struct enc_region *
find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct list_head *head = &sev->regions_list;
struct enc_region *i;
list_for_each_entry(i, head, list) {
if (i->uaddr == range->addr &&
i->size == range->size)
return i;
}
return NULL;
}
static void __unregister_enc_region_locked(struct kvm *kvm,
struct enc_region *region)
{
sev_unpin_memory(kvm, region->pages, region->npages);
list_del(&region->list);
kfree(region);
}
int sev_mem_enc_unregister_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
struct enc_region *region;
int ret;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
/* If kvm is mirroring encryption context it isn't responsible for it */
if (is_mirroring_enc_context(kvm))
return -EINVAL;
mutex_lock(&kvm->lock);
if (!sev_guest(kvm)) {
ret = -ENOTTY;
goto failed;
}
region = find_enc_region(kvm, range);
if (!region) {
ret = -EINVAL;
goto failed;
}
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
sev_writeback_caches(kvm);
__unregister_enc_region_locked(kvm, region);
mutex_unlock(&kvm->lock);
return 0;
failed:
mutex_unlock(&kvm->lock);
return ret;
}
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
{
CLASS(fd, f)(source_fd);
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
struct kvm *source_kvm;
struct kvm_sev_info *source_sev, *mirror_sev;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
int ret;
if (fd_empty(f))
return -EBADF;
if (!file_is_kvm(fd_file(f)))
return -EBADF;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
return ret;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
/*
* Mirrors of mirrors should work, but let's not get silly. Also
* disallow out-of-band SEV/SEV-ES init if the target is already an
* SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
* created after SEV/SEV-ES initialization, e.g. to init intercepts.
*/
if (sev_guest(kvm) || !sev_guest(source_kvm) ||
is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
ret = -EINVAL;
goto e_unlock;
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
}
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
mirror_sev = to_kvm_sev_info(kvm);
if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM;
goto e_unlock;
}
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
source_sev = to_kvm_sev_info(source_kvm);
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
kvm_get_kvm(source_kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
/* Set enc_context_owner and copy its encryption context over */
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
mirror_sev->need_init = false;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
ret = 0;
/*
* Do not copy ap_jump_table. Since the mirror does not share the same
* KVM contexts as the original, and they may have different
* memory-views.
*/
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
e_unlock:
sev_unlock_two_vms(kvm, source_kvm);
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
return ret;
}
static int snp_decommission_context(struct kvm *kvm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct sev_data_snp_addr data = {};
int ret;
/* If context is not created then do nothing */
if (!sev->snp_context)
return 0;
/* Do the decommision, which will unbind the ASID from the SNP context */
data.address = __sme_pa(sev->snp_context);
down_write(&sev_deactivate_lock);
ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
up_write(&sev_deactivate_lock);
if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
return ret;
snp_free_firmware_page(sev->snp_context);
sev->snp_context = NULL;
return 0;
}
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
if (!sev_guest(kvm))
return;
WARN_ON(!list_empty(&sev->mirror_vms));
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
free_cpumask_var(sev->have_run_cpus);
/*
* If this is a mirror VM, remove it from the owner's list of a mirrors
* and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
* Note, mirror VMs don't support registering encrypted regions.
*/
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
mutex_lock(&owner_kvm->lock);
list_del(&sev->mirror_entry);
mutex_unlock(&owner_kvm->lock);
kvm_put_kvm(owner_kvm);
KVM: x86: Support KVM VMs sharing SEV context Add a capability for userspace to mirror SEV encryption context from one vm to another. On our side, this is intended to support a Migration Helper vCPU, but it can also be used generically to support other in-guest workloads scheduled by the host. The intention is for the primary guest and the mirror to have nearly identical memslots. The primary benefits of this are that: 1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they can't accidentally clobber each other. 2) The VMs can have different memory-views, which is necessary for post-copy migration (the migration vCPUs on the target need to read and write to pages, when the primary guest would VMEXIT). This does not change the threat model for AMD SEV. Any memory involved is still owned by the primary guest and its initial state is still attested to through the normal SEV_LAUNCH_* flows. If userspace wanted to circumvent SEV, they could achieve the same effect by simply attaching a vCPU to the primary VM. This patch deliberately leaves userspace in charge of the memslots for the mirror, as it already has the power to mess with them in the primary guest. This patch does not support SEV-ES (much less SNP), as it does not handle handing off attested VMSAs to the mirror. For additional context, we need a Migration Helper because SEV PSP migration is far too slow for our live migration on its own. Using an in-guest migrator lets us speed this up significantly. Signed-off-by: Nathan Tempelman <natet@google.com> Message-Id: <20210408223214.2582277-1-natet@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-08 22:32:14 +00:00
return;
}
/*
* if userspace was terminated before unregistering the memory regions
* then lets unpin all the registered memory.
*/
if (!list_empty(head)) {
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
cond_resched();
}
}
if (sev_snp_guest(kvm)) {
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
snp_guest_req_cleanup(kvm);
/*
* Decomission handles unbinding of the ASID. If it fails for
* some unexpected reason, just leak the ASID.
*/
if (snp_decommission_context(kvm))
return;
} else {
sev_unbind_asid(kvm, sev->handle);
}
sev_asid_free(sev);
}
void __init sev_set_cpu_caps(void)
{
if (sev_enabled) {
kvm_cpu_cap_set(X86_FEATURE_SEV);
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
}
if (sev_es_enabled) {
kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
}
if (sev_snp_enabled) {
kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
}
}
static bool is_sev_snp_initialized(void)
{
struct sev_user_data_snp_status *status;
struct sev_data_snp_addr buf;
bool initialized = false;
int ret, error = 0;
status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
if (!status)
return false;
buf.address = __psp_pa(status);
ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
if (ret) {
pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
ret, error, error);
goto out;
}
initialized = !!status->state;
out:
snp_free_firmware_page(status);
return initialized;
}
void __init sev_hardware_setup(void)
{
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
struct sev_platform_init_args init_args = {0};
bool sev_snp_supported = false;
bool sev_es_supported = false;
bool sev_supported = false;
if (!sev_enabled || !npt_enabled || !nrips)
goto out;
/*
* SEV must obviously be supported in hardware. Sanity check that the
* CPU supports decode assists, which is mandatory for SEV guests to
* support instruction emulation. Ditto for flushing by ASID, as SEV
* guests are bound to a single ASID, i.e. KVM can't rotate to a new
* ASID to effect a TLB flush.
*/
if (!boot_cpu_has(X86_FEATURE_SEV) ||
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
goto out;
/*
* The kernel's initcall infrastructure lacks the ability to express
* dependencies between initcalls, whereas the modules infrastructure
* automatically handles dependencies via symbol loading. Ensure the
* PSP SEV driver is initialized before proceeding if KVM is built-in,
* as the dependency isn't handled by the initcall infrastructure.
*/
if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
goto out;
/* Retrieve SEV CPUID information */
cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
/* Set encryption bit location for SEV-ES guests */
sev_enc_bit = ebx & 0x3f;
/* Maximum number of encrypted guests supported simultaneously */
max_sev_asid = ecx;
if (!max_sev_asid)
goto out;
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = edx;
sev_me_mask = 1UL << (ebx & 0x3f);
/*
* Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
* even though it's never used, so that the bitmap is indexed by the
* actual ASID.
*/
nr_asids = max_sev_asid + 1;
sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
if (!sev_asid_bitmap)
goto out;
sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
if (!sev_reclaim_asid_bitmap) {
bitmap_free(sev_asid_bitmap);
sev_asid_bitmap = NULL;
goto out;
}
if (min_sev_asid <= max_sev_asid) {
sev_asid_count = max_sev_asid - min_sev_asid + 1;
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
}
sev_supported = true;
/* SEV-ES support requested? */
if (!sev_es_enabled)
goto out;
/*
* SEV-ES requires MMIO caching as KVM doesn't have access to the guest
* instruction stream, i.e. can't emulate in response to a #NPF and
* instead relies on #NPF(RSVD) being reflected into the guest as #VC
* (the guest can then do a #VMGEXIT to request MMIO emulation).
*/
if (!enable_mmio_caching)
goto out;
/* Does the CPU support SEV-ES? */
if (!boot_cpu_has(X86_FEATURE_SEV_ES))
goto out;
if (!lbrv) {
WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
"LBRV must be present for SEV-ES support");
goto out;
}
/* Has the system been allocated ASIDs for SEV-ES? */
if (min_sev_asid == 1)
goto out;
sev_es_asid_count = min_sev_asid - 1;
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
sev_es_supported = true;
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
if (sev_enabled) {
init_args.probe = true;
if (sev_platform_init(&init_args))
sev_supported = sev_es_supported = sev_snp_supported = false;
else if (sev_snp_supported)
sev_snp_supported = is_sev_snp_initialized();
}
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
"unusable" :
"disabled",
min_sev_asid, max_sev_asid);
if (boot_cpu_has(X86_FEATURE_SEV_ES))
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
str_enabled_disabled(sev_es_supported),
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
if (boot_cpu_has(X86_FEATURE_SEV_SNP))
pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
str_enabled_disabled(sev_snp_supported),
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
sev_snp_enabled = sev_snp_supported;
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
sev_supported_vmsa_features = 0;
if (sev_es_debug_swap_enabled)
sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
}
void sev_hardware_unsetup(void)
{
if (!sev_enabled)
return;
/* No need to take sev_bitmap_lock, all VMs have been destroyed. */
sev_flush_asids(1, max_sev_asid);
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
sev_platform_shutdown();
}
int sev_cpu_init(struct svm_cpu_data *sd)
{
if (!sev_enabled)
return 0;
sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
if (!sd->sev_vmcbs)
return -ENOMEM;
return 0;
}
/*
* Pages used by hardware to hold guest encrypted state must be flushed before
* returning them to the system.
*/
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
unsigned int asid = sev_get_asid(vcpu->kvm);
/*
* Note! The address must be a kernel address, as regular page walk
* checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
* address is non-deterministic and unsafe. This function deliberately
* takes a pointer to deter passing in a user address.
*/
unsigned long addr = (unsigned long)va;
/*
* If CPU enforced cache coherency for encrypted mappings of the
* same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
* flush is still needed in order to work properly with DMA devices.
*/
if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
clflush_cache_range(va, PAGE_SIZE);
return;
}
/*
* VM Page Flush takes a host virtual address and a guest ASID. Fall
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
* back to full writeback of caches if this faults so as not to make
* any problems worse by leaving stale encrypted data in the cache.
*/
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
goto do_sev_writeback_caches;
return;
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
do_sev_writeback_caches:
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
sev_writeback_caches(vcpu->kvm);
}
KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson <seanjc@google.com> Reported-by: Mingwei Zhang <mizhang@google.com> Signed-off-by: Mingwei Zhang <mizhang@google.com> Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-04-21 03:14:07 +00:00
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
/*
* With SNP+gmem, private/encrypted memory is unreachable via the
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
* hva-based mmu notifiers, i.e. these events are explicitly scoped to
* shared pages, where there's no need to flush caches.
*/
if (!sev_guest(kvm) || sev_snp_guest(kvm))
KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson <seanjc@google.com> Reported-by: Mingwei Zhang <mizhang@google.com> Signed-off-by: Mingwei Zhang <mizhang@google.com> Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-04-21 03:14:07 +00:00
return;
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
sev_writeback_caches(kvm);
KVM: SEV: add cache flush to solve SEV cache incoherency issues Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned. Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines, generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by VM and reallocated to another user, the cachelines may corrupt the new user at a later time. KVM takes a shortcut by assuming all confidential memory remain pinned until the end of VM lifetime. Therefore, KVM does not flush cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicous userspace can crash the host kernel: creating a malicious VM and continuously allocates/releases unpinned confidential memory pages when the VM is running. Add cache flush operations to mmu_notifier operations to ensure that any physical memory leaving the guest VM get flushed. In particular, hook mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush cache accordingly. The hook after releasing the mmu lock to avoid contention with other vCPUs. Cc: stable@vger.kernel.org Suggested-by: Sean Christpherson <seanjc@google.com> Reported-by: Mingwei Zhang <mizhang@google.com> Signed-off-by: Mingwei Zhang <mizhang@google.com> Message-Id: <20220421031407.2516575-4-mizhang@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-04-21 03:14:07 +00:00
}
void sev_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
if (!sev_es_guest(vcpu->kvm))
return;
svm = to_svm(vcpu);
/*
* If it's an SNP guest, then the VMSA was marked in the RMP table as
* a guest-owned page. Transition the page to hypervisor state before
* releasing it back to the system.
*/
if (sev_snp_guest(vcpu->kvm)) {
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
goto skip_vmsa_free;
}
if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
skip_vmsa_free:
if (svm->sev_es.ghcb_sa_free)
kvfree(svm->sev_es.ghcb_sa);
}
static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
{
return (((u64)control->exit_code_hi) << 32) | control->exit_code;
}
static void dump_ghcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
unsigned int nbits;
/* Re-use the dump_invalid_vmcb module parameter */
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
/*
* Print KVM's snapshot of the GHCB values that were (unsuccessfully)
* used to handle the exit. If the guest has since modified the GHCB
* itself, dumping the raw GHCB won't help debug why KVM was unable to
* handle the VMGEXIT that KVM observed.
*/
pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
}
static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct ghcb *ghcb = svm->sev_es.ghcb;
/*
* The GHCB protocol so far allows for the following data
* to be returned:
* GPRs RAX, RBX, RCX, RDX
*
* Copy their values, even if they may not have been written during the
* VM-Exit. It's the guest's responsibility to not consume random data.
*/
ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
}
static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
struct ghcb *ghcb = svm->sev_es.ghcb;
u64 exit_code;
/*
* The GHCB protocol so far allows for the following data
* to be supplied:
* GPRs RAX, RBX, RCX, RDX
* XCR0
* CPL
*
* VMMCALL allows the guest to provide extra registers. KVM also
* expects RSI for hypercalls, so include that, too.
*
* Copy their values to the appropriate location if supplied.
*/
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
if (kvm_ghcb_xcr0_is_valid(svm)) {
vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
KVM: x86: Defer runtime updates of dynamic CPUID bits until CPUID emulation Defer runtime CPUID updates until the next non-faulting CPUID emulation or KVM_GET_CPUID2, which are the only paths in KVM that consume the dynamic entries. Deferring the updates is especially beneficial to nested VM-Enter/VM-Exit, as KVM will almost always detect multiple state changes, not to mention the updates don't need to be realized while L2 is active if CPUID is being intercepted by L1 (CPUID is a mandatory intercept on Intel, but not AMD). Deferring CPUID updates shaves several hundred cycles from nested VMX roundtrips, as measured from L2 executing CPUID in a tight loop: SKX 6850 => 6450 ICX 9000 => 8800 EMR 7900 => 7700 Alternatively, KVM could update only the CPUID leaves that are affected by the state change, e.g. update XSAVE info only if XCR0 or XSS changes, but that adds non-trivial complexity and doesn't solve the underlying problem of nested transitions potentially changing both XCR0 and XSS, on both nested VM-Enter and VM-Exit. Skipping updates entirely if L2 is active and CPUID is being intercepted by L1 could work for the common case. However, simply skipping updates if L2 is active is *very* subtly dangerous and complex. Most KVM updates are triggered by changes to the current vCPU state, which may be L2 state, whereas performing updates only for L1 would requiring detecting changes to L1 state. KVM would need to either track relevant L1 state, or defer runtime CPUID updates until the next nested VM-Exit. The former is ugly and complex, while the latter comes with similar dangers to deferring all CPUID updates, and would only address the nested VM-Enter path. To guard against using stale data, disallow querying dynamic CPUID feature bits, i.e. features that KVM updates at runtime, via a compile-time assertion in guest_cpu_cap_has(). Exempt MWAIT from the rule, as the MISC_ENABLE_NO_MWAIT means that MWAIT is _conditionally_ a dynamic CPUID feature. Note, the rule could be enforced for MWAIT as well, e.g. by querying guest CPUID in kvm_emulate_monitor_mwait, but there's no obvious advtantage to doing so, and allowing MWAIT for guest_cpuid_has() opens up a different can of worms. MONITOR/MWAIT can't be virtualized (for a reasonable definition), and the nature of the MWAIT_NEVER_UD_FAULTS and MISC_ENABLE_NO_MWAIT quirks means checking X86_FEATURE_MWAIT outside of kvm_emulate_monitor_mwait() is wrong for other reasons. Beyond the aforementioned feature bits, the only other dynamic CPUID (sub)leaves are the XSAVE sizes, and similar to MWAIT, consuming those CPUID entries in KVM is all but guaranteed to be a bug. The layout for an actual XSAVE buffer depends on the format (compacted or not) and potentially the features that are actually enabled. E.g. see the logic in fpstate_clear_xstate_component() needed to poke into the guest's effective XSAVE state to clear MPX state on INIT. KVM does consume CPUID.0xD.0.{EAX,EDX} in kvm_check_cpuid() and cpuid_get_supported_xcr0(), but not EBX, which is the only dynamic output register in the leaf. Link: https://lore.kernel.org/r/20241211013302.1347853-6-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2024-12-10 17:33:02 -08:00
vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
/* Copy the GHCB exit information into the VMCB fields */
exit_code = ghcb_get_sw_exit_code(ghcb);
control->exit_code = lower_32_bits(exit_code);
control->exit_code_hi = upper_32_bits(exit_code);
control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
/* Clear the valid entries fields */
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
u64 exit_code;
u64 reason;
/*
* Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
exit_code = kvm_ghcb_get_sw_exit_code(control);
/* Only GHCB Usage code 0 is supported */
if (svm->sev_es.ghcb->ghcb_usage) {
reason = GHCB_ERR_INVALID_USAGE;
goto vmgexit_err;
}
reason = GHCB_ERR_MISSING_INPUT;
if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
!kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
!kvm_ghcb_sw_exit_info_2_is_valid(svm))
goto vmgexit_err;
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
switch (exit_code) {
case SVM_EXIT_READ_DR7:
break;
case SVM_EXIT_WRITE_DR7:
if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_RDTSC:
break;
case SVM_EXIT_RDPMC:
if (!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_CPUID:
if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
if (!kvm_ghcb_xcr0_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_INVD:
break;
case SVM_EXIT_IOIO:
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
} else {
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
}
break;
case SVM_EXIT_MSR:
if (!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
if (control->exit_info_1) {
if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rdx_is_valid(svm))
goto vmgexit_err;
}
break;
case SVM_EXIT_VMMCALL:
if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_cpl_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_RDTSCP:
break;
case SVM_EXIT_WBINVD:
break;
case SVM_EXIT_MONITOR:
if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rcx_is_valid(svm) ||
!kvm_ghcb_rdx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_MWAIT:
if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_MMIO_READ:
case SVM_VMGEXIT_MMIO_WRITE:
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
case SVM_VMGEXIT_AP_CREATION:
if (!sev_snp_guest(vcpu->kvm))
goto vmgexit_err;
if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_NMI_COMPLETE:
KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Message-Id: <e8fbebe8eb161ceaabdad7c01a5859a78b424d5e.1609791600.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-01-04 14:20:01 -06:00
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
case SVM_VMGEXIT_HV_FEATURES:
case SVM_VMGEXIT_TERM_REQUEST:
break;
case SVM_VMGEXIT_PSC:
if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
case SVM_VMGEXIT_GUEST_REQUEST:
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
if (!sev_snp_guest(vcpu->kvm) ||
!PAGE_ALIGNED(control->exit_info_1) ||
!PAGE_ALIGNED(control->exit_info_2) ||
control->exit_info_1 == control->exit_info_2)
goto vmgexit_err;
break;
default:
reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
}
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
return 0;
vmgexit_err:
if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
svm->sev_es.ghcb->ghcb_usage);
} else if (reason == GHCB_ERR_INVALID_EVENT) {
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
exit_code);
} else {
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
svm_vmgexit_bad_input(svm, reason);
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
/* Resume the guest to "return" the error code. */
return 1;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
/* Clear any indication that the vCPU is in a type of AP Reset Hold */
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
if (!svm->sev_es.ghcb)
return;
if (svm->sev_es.ghcb_sa_free) {
/*
* The scratch area lives outside the GHCB, so there is a
* buffer that, depending on the operation performed, may
* need to be synced, then freed.
*/
if (svm->sev_es.ghcb_sa_sync) {
kvm_write_guest(svm->vcpu.kvm,
svm->sev_es.sw_scratch,
svm->sev_es.ghcb_sa,
svm->sev_es.ghcb_sa_len);
svm->sev_es.ghcb_sa_sync = false;
}
kvfree(svm->sev_es.ghcb_sa);
svm->sev_es.ghcb_sa = NULL;
svm->sev_es.ghcb_sa_free = false;
}
trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
sev_es_sync_to_ghcb(svm);
kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
svm->sev_es.ghcb = NULL;
}
int pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
struct kvm *kvm = svm->vcpu.kvm;
unsigned int asid = sev_get_asid(kvm);
/*
* Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
* VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
* AP Destroy event.
*/
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
return -EINVAL;
KVM: SVM: Flush cache only on CPUs running SEV guest On AMD CPUs without ensuring cache consistency, each memory page reclamation in an SEV guest triggers a call to do WBNOINVD/WBINVD on all CPUs, thereby affecting the performance of other programs on the host. Typically, an AMD server may have 128 cores or more, while the SEV guest might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity to bind these 8 vCPUs to specific physical CPUs. Therefore, keeping a record of the physical core numbers each time a vCPU runs can help avoid flushing the cache for all CPUs every time. Take care to allocate the cpumask used to track which CPUs have run a vCPU when copying or moving an "encryption context", as nothing guarantees memory in a mirror VM is a strict subset of the ASID owner, and the destination VM for intrahost migration needs to maintain it's own set of CPUs. E.g. for intrahost migration, if a CPU was used for the source VM but not the destination VM, then it can only have cached memory that was accessible to the source VM. And a CPU that was run in the source is also used by the destination is no different than a CPU that was run in the destination only. Note, KVM is guaranteed to do flush caches prior to sev_vm_destroy(), thanks to kvm_arch_guest_memory_reclaimed for SEV and SEV-ES, and kvm_arch_gmem_invalidate() for SEV-SNP. I.e. it's safe to free the cpumask prior to unregistering encrypted regions and freeing the ASID. Opportunistically clean up sev_vm_destroy()'s comment regarding what is (implicitly, what isn't) skipped for mirror VMs. Cc: Srikanth Aithal <sraithal@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn> Link: https://lore.kernel.org/r/20250522233733.3176144-9-seanjc@google.com Link: https://lore.kernel.org/all/935a82e3-f7ad-47d7-aaaf-f3d2b62ed768@amd.com Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:32 -07:00
/*
* To optimize cache flushes when memory is reclaimed from an SEV VM,
* track physical CPUs that enter the guest for SEV VMs and thus can
* have encrypted, dirty data in the cache, and flush caches only for
* CPUs that have entered the guest.
*/
if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
/*
* Flush guest TLB:
*
* 1) when different VMCB for the same ASID is to be run on the same host CPU.
* 2) or this VMCB was executed on different host CPU in previous VMRUNs.
*/
if (sd->sev_vmcbs[asid] == svm->vmcb &&
svm->vcpu.arch.last_vmentry_cpu == cpu)
return 0;
sd->sev_vmcbs[asid] = svm->vmcb;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
return 0;
}
#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_scratch_beg, ghcb_scratch_end;
u64 scratch_gpa_beg, scratch_gpa_end;
void *scratch_va;
scratch_gpa_beg = svm->sev_es.sw_scratch;
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len;
if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg);
goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
/* Scratch area begins within GHCB */
ghcb_scratch_beg = control->ghcb_gpa +
offsetof(struct ghcb, shared_buffer);
ghcb_scratch_end = control->ghcb_gpa +
offsetof(struct ghcb, reserved_0xff0);
/*
* If the scratch area begins within the GHCB, it must be
* completely contained in the GHCB shared buffer area.
*/
if (scratch_gpa_beg < ghcb_scratch_beg ||
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end);
goto e_scratch;
}
scratch_va = (void *)svm->sev_es.ghcb;
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
} else {
/*
* The guest memory must be read into a kernel buffer, so
* limit the size
*/
if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT);
goto e_scratch;
}
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
return -ENOMEM;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
kvfree(scratch_va);
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
return -EFAULT;
}
/*
* The scratch area is outside the GHCB. The operation will
* dictate whether the buffer needs to be synced before running
* the vCPU next time (i.e. a read was requested so the data
* must be written back to the guest memory).
*/
svm->sev_es.ghcb_sa_sync = sync;
svm->sev_es.ghcb_sa_free = true;
}
svm->sev_es.ghcb_sa = scratch_va;
svm->sev_es.ghcb_sa_len = len;
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
return 0;
e_scratch:
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
return 1;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
unsigned int pos)
{
svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
}
static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
{
return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
}
static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
{
svm->vmcb->control.ghcb_gpa = value;
}
static int snp_rmptable_psmash(kvm_pfn_t pfn)
{
int ret;
pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
/*
* PSMASH_FAIL_INUSE indicates another processor is modifying the
* entry, so retry until that's no longer the case.
*/
do {
ret = psmash(pfn);
} while (ret == PSMASH_FAIL_INUSE);
return ret;
}
static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (vcpu->run->hypercall.ret)
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
else
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
return 1; /* resume guest */
}
static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
{
u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
struct kvm_vcpu *vcpu = &svm->vcpu;
if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
return 1; /* resume guest */
}
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
return 1; /* resume guest */
}
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
/*
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
* vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
*/
vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = 1;
vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
? KVM_MAP_GPA_RANGE_ENCRYPTED
: KVM_MAP_GPA_RANGE_DECRYPTED;
vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
return 0; /* forward request to userspace */
}
struct psc_buffer {
struct psc_hdr hdr;
struct psc_entry entries[];
} __packed;
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
{
svm->sev_es.psc_inflight = 0;
svm->sev_es.psc_idx = 0;
svm->sev_es.psc_2m = false;
/*
* PSC requests always get a "no action" response in SW_EXITINFO1, with
* a PSC-specific return code in SW_EXITINFO2 that provides the "real"
* return code. E.g. if the PSC request was interrupted, the need to
* retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
*/
svm_vmgexit_no_action(svm, psc_ret);
}
static void __snp_complete_one_psc(struct vcpu_svm *svm)
{
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
struct psc_entry *entries = psc->entries;
struct psc_hdr *hdr = &psc->hdr;
__u16 idx;
/*
* Everything in-flight has been processed successfully. Update the
* corresponding entries in the guest's PSC buffer and zero out the
* count of in-flight PSC entries.
*/
for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
svm->sev_es.psc_inflight--, idx++) {
struct psc_entry *entry = &entries[idx];
entry->cur_page = entry->pagesize ? 512 : 1;
}
hdr->cur_entry = idx;
}
static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
if (vcpu->run->hypercall.ret) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
return 1; /* resume guest */
}
__snp_complete_one_psc(svm);
/* Handle the next range (if any). */
return snp_begin_psc(svm, psc);
}
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
{
struct psc_entry *entries = psc->entries;
struct kvm_vcpu *vcpu = &svm->vcpu;
struct psc_hdr *hdr = &psc->hdr;
struct psc_entry entry_start;
u16 idx, idx_start, idx_end;
int npages;
bool huge;
u64 gfn;
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
return 1;
}
next_range:
/* There should be no other PSCs in-flight at this point. */
if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
return 1;
}
/*
* The PSC descriptor buffer can be modified by a misbehaved guest after
* validation, so take care to only use validated copies of values used
* for things like array indexing.
*/
idx_start = hdr->cur_entry;
idx_end = hdr->end_entry;
if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
return 1;
}
/* Find the start of the next range which needs processing. */
for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
entry_start = entries[idx];
gfn = entry_start.gfn;
huge = entry_start.pagesize;
npages = huge ? 512 : 1;
if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
return 1;
}
if (entry_start.cur_page) {
/*
* If this is a partially-completed 2M range, force 4K handling
* for the remaining pages since they're effectively split at
* this point. Subsequent code should ensure this doesn't get
* combined with adjacent PSC entries where 2M handling is still
* possible.
*/
npages -= entry_start.cur_page;
gfn += entry_start.cur_page;
huge = false;
}
if (npages)
break;
}
if (idx > idx_end) {
/* Nothing more to process. */
snp_complete_psc(svm, 0);
return 1;
}
svm->sev_es.psc_2m = huge;
svm->sev_es.psc_idx = idx;
svm->sev_es.psc_inflight = 1;
/*
* Find all subsequent PSC entries that contain adjacent GPA
* ranges/operations and can be combined into a single
* KVM_HC_MAP_GPA_RANGE exit.
*/
while (++idx <= idx_end) {
struct psc_entry entry = entries[idx];
if (entry.operation != entry_start.operation ||
entry.gfn != entry_start.gfn + npages ||
entry.cur_page || !!entry.pagesize != huge)
break;
svm->sev_es.psc_inflight++;
npages += huge ? 512 : 1;
}
switch (entry_start.operation) {
case VMGEXIT_PSC_OP_PRIVATE:
case VMGEXIT_PSC_OP_SHARED:
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
/*
* In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
* assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
* it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
* vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
*/
vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
? KVM_MAP_GPA_RANGE_ENCRYPTED
: KVM_MAP_GPA_RANGE_DECRYPTED;
vcpu->run->hypercall.args[2] |= entry_start.pagesize
? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
: KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
vcpu->arch.complete_userspace_io = snp_complete_one_psc;
return 0; /* forward request to userspace */
default:
/*
* Only shared/private PSC operations are currently supported, so if the
* entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
* then consider the entire range completed and avoid exiting to
* userspace. In theory snp_complete_psc() can always be called directly
* at this point to complete the current range and start the next one,
* but that could lead to unexpected levels of recursion.
*/
__snp_complete_one_psc(svm);
goto next_range;
}
BUG();
}
/*
* Invoked as part of svm_vcpu_reset() processing of an init event.
*/
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_memory_slot *slot;
struct page *page;
kvm_pfn_t pfn;
gfn_t gfn;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
if (!sev_snp_guest(vcpu->kvm))
return;
guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
if (!svm->sev_es.snp_ap_waiting_for_reset)
return;
svm->sev_es.snp_ap_waiting_for_reset = false;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
/* Mark the vCPU as offline and not runnable */
vcpu->arch.pv.pv_unhalted = false;
kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
/* Clear use of the VMSA */
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
/*
* When replacing the VMSA during SEV-SNP AP creation,
* mark the VMCB dirty so that full state is always reloaded.
*/
vmcb_mark_all_dirty(svm->vmcb);
if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
return;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
slot = gfn_to_memslot(vcpu->kvm, gfn);
if (!slot)
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
return;
/*
* The new VMSA will be private memory guest memory, so retrieve the
* PFN from the gmem backend.
*/
if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
return;
/*
* From this point forward, the VMSA will always be a guest-mapped page
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
* that involves cleanups like flushing caches, which would ideally be
* handled during teardown rather than guest boot. Deferring that also
* allows the existing logic for SEV-ES VMSAs to be re-used with
* minimal SNP-specific changes.
*/
svm->sev_es.snp_has_guest_vmsa = true;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
/* Use the new VMSA */
svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
/* Mark the vCPU as runnable */
KVM SVM changes for 6.15 - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. - Add macros and helpers for setting GHCB return/error codes. - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. - Misc cleanups -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmfZoO4ACgkQOlYIJqCj N/2m1BAAkbn6k0ZzJj0GTYqNh8ejWFBa4Mt+RHXrf9deNXLJuQFPRAQi2En3rQwj esISxA0dzsADEYkCXHxDsUfsJ729kQYAIyN5r4h3GftriKlNBmHLZJsXHXnpAZ0F 2Yjwu/r5zw/c5/mOYkcBjJ0gDgoDWNI0UoA656HTqE88E6v7DoSZlZBZZRSCMXRi 8jIQSzmQQkKsWi0c/N5LKm8E+6HFLJB1BnASbXbwXChIoi2pFE/wAv6ntC+V5DzN Y5oqDtf3evOBrpmMbN7t2I4KJ8VW1+041whANymFK1QARBBevCYY1ezCHg2RIHpc cyS8G+wice7IMSnqTNtJvN2IpwBkV2SqRyxwBKS2j1ec1xeoX2JT23tOom1XpPhW diqiSUto2xQIPz3x8fddtAHvY0W11jpXt4MUyOzdefbBLGQBB4EsxbnwxY+i6kKh 0tdw4R1uzvbn1sHW+p2hOvtkgxSLmYFGIrYEUMCxXRxOviHfPWzCBlucEOOceU1D 2o/SgoBWS6xF8KxMxwnVLE9q8/Baiua8Ak2h2cLapHwWGpRaeJGFbz/TwbcDaKVy gW34W8KXc4WNWiFwoD6WRqrSDTRXG3XAtn0vjwvCqD6PBPRleALWsAxq8ztenYIy 2se051XsKGg+e64zsAZFNdzIDrSGIHWfZb9ec398cF/iuKTGWLQ= =iZrd -----END PGP SIGNATURE----- Merge tag 'kvm-x86-svm-6.15' of https://github.com/kvm-x86/linux into HEAD KVM SVM changes for 6.15 - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. - Add macros and helpers for setting GHCB return/error codes. - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. - Misc cleanups
2025-03-19 09:10:44 -04:00
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
/*
* gmem pages aren't currently migratable, but if this ever changes
* then care should be taken to ensure svm->sev_es.vmsa is pinned
* through some other means.
*/
kvm_release_page_clean(page);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
}
static int sev_snp_ap_creation(struct vcpu_svm *svm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_vcpu *target_vcpu;
struct vcpu_svm *target_svm;
unsigned int request;
unsigned int apic_id;
request = lower_32_bits(svm->vmcb->control.exit_info_1);
apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
/* Validate the APIC ID */
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
if (!target_vcpu) {
vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
apic_id);
return -EINVAL;
}
target_svm = to_svm(target_vcpu);
guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
switch (request) {
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
case SVM_VMGEXIT_AP_CREATE:
if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
return -EINVAL;
}
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
svm->vmcb->control.exit_info_2);
return -EINVAL;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
}
/*
* Malicious guest can RMPADJUST a large page into VMSA which
* will hit the SNP erratum where the CPU will incorrectly signal
* an RMP violation #PF if a hugepage collides with the RMP entry
* of VMSA page, reject the AP CREATE request if VMSA address from
* guest is 2M aligned.
*/
if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
vcpu_unimpl(vcpu,
"vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
svm->vmcb->control.exit_info_2);
return -EINVAL;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
}
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
break;
case SVM_VMGEXIT_AP_DESTROY:
target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
break;
default:
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
request);
return -EINVAL;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
}
target_svm->sev_es.snp_ap_waiting_for_reset = true;
/*
* Unless Creation is deferred until INIT, signal the vCPU to update
* its state.
*/
KVM: SVM: Fix SNP AP destroy race with VMRUN An AP destroy request for a target vCPU is typically followed by an RMPADJUST to remove the VMSA attribute from the page currently being used as the VMSA for the target vCPU. This can result in a vCPU that is about to VMRUN to exit with #VMEXIT_INVALID. This usually does not happen as APs are typically sitting in HLT when being destroyed and therefore the vCPU thread is not running at the time. However, if HLT is allowed inside the VM, then the vCPU could be about to VMRUN when the VMSA attribute is removed from the VMSA page, resulting in a #VMEXIT_INVALID when the vCPU actually issues the VMRUN and causing the guest to crash. An RMPADJUST against an in-use (already running) VMSA results in a #NPF for the vCPU issuing the RMPADJUST, so the VMSA attribute cannot be changed until the VMRUN for target vCPU exits. The Qemu command line option '-overcommit cpu-pm=on' is an example of allowing HLT inside the guest. Update the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event to include the KVM_REQUEST_WAIT flag. The kvm_vcpu_kick() function will not wait for requests to be honored, so create kvm_make_request_and_kick() that will add a new event request and honor the KVM_REQUEST_WAIT flag. This will ensure that the target vCPU sees the AP destroy request before returning to the initiating vCPU should the target vCPU be in guest mode. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/fe2c885bf35643dd224e91294edb6777d5df23a4.1743097196.git.thomas.lendacky@amd.com [sean: add a comment explaining the use of smp_send_reschedule()] Co-developed-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-03-27 12:39:56 -05:00
if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
return 0;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
}
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
struct sev_data_snp_guest_request data = {0};
struct kvm *kvm = svm->vcpu.kvm;
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
sev_ret_code fw_err = 0;
int ret;
if (!sev_snp_guest(kvm))
return -EINVAL;
mutex_lock(&sev->guest_req_mutex);
if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
ret = -EIO;
goto out_unlock;
}
data.gctx_paddr = __psp_pa(sev->snp_context);
data.req_paddr = __psp_pa(sev->guest_req_buf);
data.res_paddr = __psp_pa(sev->guest_resp_buf);
/*
* Firmware failures are propagated on to guest, but any other failure
* condition along the way should be reported to userspace. E.g. if
* the PSP is dead and commands are timing out.
*/
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
if (ret && !fw_err)
goto out_unlock;
if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
ret = -EIO;
goto out_unlock;
}
/* No action is requested *from KVM* if there was a firmware error. */
svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
ret = 1; /* resume guest */
out_unlock:
mutex_unlock(&sev->guest_req_mutex);
return ret;
}
static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
struct kvm *kvm = svm->vcpu.kvm;
u8 msg_type;
if (!sev_snp_guest(kvm))
return -EINVAL;
if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
&msg_type, 1))
return -EIO;
/*
* As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
* additional certificate data to be provided alongside the attestation
* report via the guest-provided data pages indicated by RAX/RBX. The
* certificate data is optional and requires additional KVM enablement
* to provide an interface for userspace to provide it, but KVM still
* needs to be able to handle extended guest requests either way. So
* provide a stub implementation that will always return an empty
* certificate table in the guest-provided data pages.
*/
if (msg_type == SNP_MSG_REPORT_REQ) {
struct kvm_vcpu *vcpu = &svm->vcpu;
u64 data_npages;
gpa_t data_gpa;
if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
goto request_invalid;
data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
if (!PAGE_ALIGNED(data_gpa))
goto request_invalid;
/*
* As per GHCB spec (see "SNP Extended Guest Request"), the
* certificate table is terminated by 24-bytes of zeroes.
*/
if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
return -EIO;
}
return snp_handle_guest_req(svm, req_gpa, resp_gpa);
request_invalid:
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
return 1; /* resume guest */
}
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
u64 ghcb_info;
int ret = 1;
ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
control->ghcb_gpa);
switch (ghcb_info) {
case GHCB_MSR_SEV_INFO_REQ:
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
break;
case GHCB_MSR_CPUID_REQ: {
u64 cpuid_fn, cpuid_reg, cpuid_value;
cpuid_fn = get_ghcb_msr_bits(svm,
GHCB_MSR_CPUID_FUNC_MASK,
GHCB_MSR_CPUID_FUNC_POS);
/* Initialize the registers needed by the CPUID intercept */
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
/* Error, keep GHCB MSR value as-is */
break;
}
cpuid_reg = get_ghcb_msr_bits(svm,
GHCB_MSR_CPUID_REG_MASK,
GHCB_MSR_CPUID_REG_POS);
if (cpuid_reg == 0)
cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
else if (cpuid_reg == 1)
cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
else if (cpuid_reg == 2)
cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
else
cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
set_ghcb_msr_bits(svm, cpuid_value,
GHCB_MSR_CPUID_VALUE_MASK,
GHCB_MSR_CPUID_VALUE_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS);
break;
}
case GHCB_MSR_AP_RESET_HOLD_REQ:
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
/*
* Preset the result to a non-SIPI return and then only set
* the result to non-zero when delivering a SIPI.
*/
set_ghcb_msr_bits(svm, 0,
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS);
break;
case GHCB_MSR_HV_FT_REQ:
set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
break;
case GHCB_MSR_PREF_GPA_REQ:
if (!sev_snp_guest(vcpu->kvm))
goto out_terminate;
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
GHCB_MSR_GPA_VALUE_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS);
break;
case GHCB_MSR_REG_GPA_REQ: {
u64 gfn;
if (!sev_snp_guest(vcpu->kvm))
goto out_terminate;
gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
GHCB_MSR_GPA_VALUE_POS);
svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
GHCB_MSR_GPA_VALUE_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS);
break;
}
case GHCB_MSR_PSC_REQ:
if (!sev_snp_guest(vcpu->kvm))
goto out_terminate;
ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
reason_set = get_ghcb_msr_bits(svm,
GHCB_MSR_TERM_REASON_SET_MASK,
GHCB_MSR_TERM_REASON_SET_POS);
reason_code = get_ghcb_msr_bits(svm,
GHCB_MSR_TERM_REASON_MASK,
GHCB_MSR_TERM_REASON_POS);
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
goto out_terminate;
}
default:
/* Error, keep GHCB MSR value as-is */
break;
}
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
control->ghcb_gpa, ret);
return ret;
out_terminate:
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = control->ghcb_gpa;
return 0;
}
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
int ret;
/* Validate the GHCB */
ghcb_gpa = control->ghcb_gpa;
if (ghcb_gpa & GHCB_MSR_INFO_MASK)
return sev_handle_vmgexit_msr_protocol(svm);
if (!ghcb_gpa) {
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
/* Without a GHCB, just return right back to the guest */
return 1;
}
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
/* Unable to map GHCB from guest */
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
/* Without a GHCB, just return right back to the guest */
return 1;
}
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
sev_es_sync_from_ghcb(svm);
/* SEV-SNP guest requires that the GHCB GPA must be registered */
if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
return -EINVAL;
}
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
svm_vmgexit_success(svm, 0);
KVM: SEV: only access GHCB fields once A KVM guest using SEV-ES or SEV-SNP with multiple vCPUs can trigger a double fetch race condition vulnerability and invoke the VMGEXIT handler recursively. sev_handle_vmgexit() maps the GHCB page using kvm_vcpu_map() and then fetches the exit code using ghcb_get_sw_exit_code(). Soon after, sev_es_validate_vmgexit() fetches the exit code again. Since the GHCB page is shared with the guest, the guest is able to quickly swap the values with another vCPU and hence bypass the validation. One vmexit code that can be rejected by sev_es_validate_vmgexit() is SVM_EXIT_VMGEXIT; if sev_handle_vmgexit() observes it in the second fetch, the call to svm_invoke_exit_handler() will invoke sev_handle_vmgexit() again recursively. To avoid the race, always fetch the GHCB data from the places where sev_es_sync_from_ghcb stores it. Exploiting recursions on linux kernel has been proven feasible in the past, but the impact is mitigated by stack guard pages (CONFIG_VMAP_STACK). Still, if an attacker manages to call the handler multiple times, they can theoretically trigger a stack overflow and cause a denial-of-service, or potentially guest-to-host escape in kernel configurations without stack guard pages. Note that winning the race reliably in every iteration is very tricky due to the very tight window of the fetches; depending on the compiler settings, they are often consecutive because of optimization and inlining. Tested by booting an SEV-ES RHEL9 guest. Fixes: CVE-2023-4155 Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reported-by: Andy Nguyen <theflow@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-08-04 12:56:36 -04:00
exit_code = kvm_ghcb_get_sw_exit_code(control);
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
if (ret)
break;
ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_MMIO_WRITE:
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
if (ret)
break;
ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
++vcpu->stat.nmi_window_exits;
svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 1;
break;
KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Message-Id: <e8fbebe8eb161ceaabdad7c01a5859a78b424d5e.1609791600.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-01-04 14:20:01 -06:00
case SVM_VMGEXIT_AP_HLT_LOOP:
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
ret = kvm_emulate_ap_reset_hold(vcpu);
KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Message-Id: <e8fbebe8eb161ceaabdad7c01a5859a78b424d5e.1609791600.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-01-04 14:20:01 -06:00
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
switch (control->exit_info_1) {
case 0:
/* Set AP jump table address */
sev->ap_jump_table = control->exit_info_2;
break;
case 1:
/* Get AP jump table address */
svm_vmgexit_success(svm, sev->ap_jump_table);
break;
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
}
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
ret = 1;
break;
}
case SVM_VMGEXIT_HV_FEATURES:
svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
ret = 1;
break;
case SVM_VMGEXIT_TERM_REQUEST:
pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
control->exit_info_1, control->exit_info_2);
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = control->ghcb_gpa;
break;
case SVM_VMGEXIT_PSC:
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
if (ret)
break;
ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
break;
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
case SVM_VMGEXIT_AP_CREATION:
ret = sev_snp_ap_creation(svm);
if (ret) {
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
}
ret = 1;
break;
KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event Version 2 of GHCB specification added support for the SNP Guest Request Message NAE event. The event allows for an SEV-SNP guest to make requests to the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification. This is used by guests primarily to request attestation reports from firmware. There are other request types are available as well, but the specifics of what guest requests are being made generally does not affect how they are handled by the hypervisor, which only serves as a proxy for the guest requests and firmware responses. Implement handling for these events. When an SNP Guest Request is issued, the guest will provide its own request/response pages, which could in theory be passed along directly to firmware. However, these pages would need special care: - Both pages are from shared guest memory, so they need to be protected from migration/etc. occurring while firmware reads/writes to them. At a minimum, this requires elevating the ref counts and potentially needing an explicit pinning of the memory. This places additional restrictions on what type of memory backends userspace can use for shared guest memory since there would be some reliance on using refcounted pages. - The response page needs to be switched to Firmware-owned state before the firmware can write to it, which can lead to potential host RMP #PFs if the guest is misbehaved and hands the host a guest page that KVM is writing to for other reasons (e.g. virtio buffers). Both of these issues can be avoided completely by using separately-allocated bounce pages for both the request/response pages and passing those to firmware instead. So that's the approach taken here. Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Alexey Kardashevskiy <aik@amd.com> Signed-off-by: Alexey Kardashevskiy <aik@amd.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Reviewed-by: Liam Merwick <liam.merwick@oracle.com> [mdr: ensure FW command failures are indicated to guest, drop extended request handling to be re-written as separate patch, massage commit] Signed-off-by: Michael Roth <michael.roth@amd.com> Message-ID: <20240701223148.3798365-2-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-07-01 17:31:46 -05:00
case SVM_VMGEXIT_GUEST_REQUEST:
ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
break;
case SVM_VMGEXIT_EXT_GUEST_REQUEST:
ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
ret = -EINVAL;
break;
default:
ret = svm_invoke_exit_handler(vcpu, exit_code);
}
return ret;
}
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
int count;
int bytes;
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
int r;
if (svm->vmcb->control.exit_info_2 > INT_MAX)
return -EINVAL;
count = svm->vmcb->control.exit_info_2;
if (unlikely(check_mul_overflow(count, size, &bytes)))
return -EINVAL;
KVM: SVM: Exit to userspace on ENOMEM/EFAULT GHCB errors Exit to userspace if setup_vmgexit_scratch() fails due to OOM or because copying data from guest (userspace) memory failed/faulted. The OOM scenario is clearcut, it's userspace's decision as to whether it should terminate the guest, free memory, etc... As for -EFAULT, arguably, any guest issue is a violation of the guest's contract with userspace, and thus userspace needs to decide how to proceed. E.g. userspace defines what is RAM vs. MMIO and communicates that directly to the guest, KVM is not involved in deciding what is/isn't RAM nor in communicating that information to the guest. If the scratch GPA doesn't resolve to a memslot, then the guest is not honoring the memory configuration as defined by userspace. And if userspace unmaps an hva for whatever reason, then exiting to userspace with -EFAULT is absolutely the right thing to do. KVM's ABI currently sucks and doesn't provide enough information to act on the -EFAULT, but that will hopefully be remedied in the future as there are multiple use cases, e.g. uffd and virtiofs truncation, that shouldn't require any work in KVM beyond returning -EFAULT with a small amount of metadata. KVM could define its ABI such that failure to access the scratch area is reflected into the guest, i.e. establish a contract with userspace, but that's undesirable as it limits KVM's options in the future, e.g. in the potential uffd case any failure on a uaccess needs to kick out to userspace. KVM does have several cases where it reflects these errors into the guest, e.g. kvm_pv_clock_pairing() and Hyper-V emulation, but KVM would preferably "fix" those instead of propagating the falsehood that any memory failure is the guest's fault. Lastly, returning a boolean as an "error" for that a helper that isn't named accordingly never works out well. Fixes: ad5b353240c8 ("KVM: SVM: Do not terminate SEV-ES guests on GHCB validation failure") Cc: Alper Gun <alpergun@google.com> Cc: Peter Gonda <pgonda@google.com> Signed-off-by: Sean Christopherson <seanjc@google.com> Message-Id: <20220225205209.3881130-1-seanjc@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-02-25 20:52:09 +00:00
r = setup_vmgexit_scratch(svm, in, bytes);
if (r)
return r;
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
count, in);
}
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
2023-09-15 15:54:30 -05:00
{
/* Clear intercepts on MSRs that are context switched by hardware. */
svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
2023-09-15 15:54:30 -05:00
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
KVM: SEV: Do not intercept accesses to MSR_IA32_XSS for SEV-ES guests When intercepts are enabled for MSR_IA32_XSS, the host will swap in/out the guest-defined values while context-switching to/from guest mode. However, in the case of SEV-ES, vcpu->arch.guest_state_protected is set, so the guest-defined value is effectively ignored when switching to guest mode with the understanding that the VMSA will handle swapping in/out this register state. However, SVM is still configured to intercept these accesses for SEV-ES guests, so the values in the initial MSR_IA32_XSS are effectively read-only, and a guest will experience undefined behavior if it actually tries to write to this MSR. Fortunately, only CET/shadowstack makes use of this register on SEV-ES-capable systems currently, which isn't yet widely used, but this may become more of an issue in the future. Additionally, enabling intercepts of MSR_IA32_XSS results in #VC exceptions in the guest in certain paths that can lead to unexpected #VC nesting levels. One example is SEV-SNP guests when handling #VC exceptions for CPUID instructions involving leaf 0xD, subleaf 0x1, since they will access MSR_IA32_XSS as part of servicing the CPUID #VC, then generate another #VC when accessing MSR_IA32_XSS, which can lead to guest crashes if an NMI occurs at that point in time. Running perf on a guest while it is issuing such a sequence is one example where these can be problematic. Address this by disabling intercepts of MSR_IA32_XSS for SEV-ES guests if the host/guest configuration allows it. If the host/guest configuration doesn't allow for MSR_IA32_XSS, leave it intercepted so that it can be caught by the existing checks in kvm_{set,get}_msr_common() if the guest still attempts to access it. Fixes: 376c6d285017 ("KVM: SVM: Provide support for SEV-ES vCPU creation/loading") Cc: Alexey Kardashevskiy <aik@amd.com> Suggested-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Message-Id: <20231016132819.1002933-4-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-10-16 08:27:32 -05:00
/*
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
* the host/guest supports its use.
*
* KVM treats the guest as being capable of using XSAVES even if XSAVES
* isn't enabled in guest CPUID as there is no intercept for XSAVES,
* i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
* exposed to the guest and XSAVES is supported in hardware. Condition
* full XSS passthrough on the guest being able to use XSAVES *and*
* XSAVES being exposed to the guest so that KVM can at least honor
* guest CPUID for RDMSR and WRMSR.
KVM: SEV: Do not intercept accesses to MSR_IA32_XSS for SEV-ES guests When intercepts are enabled for MSR_IA32_XSS, the host will swap in/out the guest-defined values while context-switching to/from guest mode. However, in the case of SEV-ES, vcpu->arch.guest_state_protected is set, so the guest-defined value is effectively ignored when switching to guest mode with the understanding that the VMSA will handle swapping in/out this register state. However, SVM is still configured to intercept these accesses for SEV-ES guests, so the values in the initial MSR_IA32_XSS are effectively read-only, and a guest will experience undefined behavior if it actually tries to write to this MSR. Fortunately, only CET/shadowstack makes use of this register on SEV-ES-capable systems currently, which isn't yet widely used, but this may become more of an issue in the future. Additionally, enabling intercepts of MSR_IA32_XSS results in #VC exceptions in the guest in certain paths that can lead to unexpected #VC nesting levels. One example is SEV-SNP guests when handling #VC exceptions for CPUID instructions involving leaf 0xD, subleaf 0x1, since they will access MSR_IA32_XSS as part of servicing the CPUID #VC, then generate another #VC when accessing MSR_IA32_XSS, which can lead to guest crashes if an NMI occurs at that point in time. Running perf on a guest while it is issuing such a sequence is one example where these can be problematic. Address this by disabling intercepts of MSR_IA32_XSS for SEV-ES guests if the host/guest configuration allows it. If the host/guest configuration doesn't allow for MSR_IA32_XSS, leave it intercepted so that it can be caught by the existing checks in kvm_{set,get}_msr_common() if the guest still attempts to access it. Fixes: 376c6d285017 ("KVM: SVM: Provide support for SEV-ES vCPU creation/loading") Cc: Alexey Kardashevskiy <aik@amd.com> Suggested-by: Tom Lendacky <thomas.lendacky@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Message-Id: <20231016132819.1002933-4-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2023-10-16 08:27:32 -05:00
*/
svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
2023-09-15 15:54:30 -05:00
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_cpuid_entry2 *best;
/* For sev guests, the memory encryption bit is not reserved in CR3. */
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
/*
* An SEV-ES guest requires a VMSA area that is a separate from the
* VMCB page. Do not include the encryption mask on the VMSA physical
* address since hardware will access it using the guest key. Note,
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
if (!svm->sev_es.snp_has_guest_vmsa) {
if (svm->sev_es.vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
else
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
}
if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
VMCB_ALLOWED_SEV_FEATURES_VALID;
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
svm_clr_intercept(svm, INTERCEPT_CR4_READ);
svm_clr_intercept(svm, INTERCEPT_CR8_READ);
svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
/* Track EFER/CR register changes */
svm_set_intercept(svm, TRAP_EFER_WRITE);
svm_set_intercept(svm, TRAP_CR0_WRITE);
svm_set_intercept(svm, TRAP_CR4_WRITE);
svm_set_intercept(svm, TRAP_CR8_WRITE);
vmcb->control.intercepts[INTERCEPT_DR] = 0;
if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
} else {
/*
* Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
* allow debugging SEV-ES guests, and enables DebugSwap iff
* NO_NESTED_DATA_BP is supported, so there's no reason to
* intercept #DB when DebugSwap is enabled. For simplicity
* with respect to guest debug, intercept #DB for other VMs
* even if NO_NESTED_DATA_BP is supported, i.e. even if the
* guest can't DoS the CPU with infinite #DB vectoring.
*/
clr_exception_intercept(svm, DB_VECTOR);
}
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
svm_clr_intercept(svm, INTERCEPT_XSETBV);
}
void sev_init_vmcb(struct vcpu_svm *svm)
{
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
/*
* Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
* KVM can't decrypt guest memory to decode the faulting instruction.
*/
clr_exception_intercept(svm, GP_VECTOR);
if (sev_es_guest(svm->vcpu.kvm))
sev_es_init_vmcb(svm);
}
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
/*
* Set the GHCB MSR value as per the GHCB specification when emulating
* vCPU RESET for an SEV-ES guest.
*/
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
KVM: SEV: Support SEV-SNP AP Creation NAE event Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP guests to alter the register state of the APs on their own. This allows the guest a way of simulating INIT-SIPI. A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used so as to avoid updating the VMSA pointer while the vCPU is running. For CREATE The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID. The GPA is saved in the svm struct of the target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to the vCPU and then the vCPU is kicked. For CREATE_ON_INIT: The guest supplies the GPA of the VMSA to be used for the vCPU with the specified APIC ID the next time an INIT is performed. The GPA is saved in the svm struct of the target vCPU. For DESTROY: The guest indicates it wishes to stop the vCPU. The GPA is cleared from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added to vCPU and then the vCPU is kicked. The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked as a result of the event or as a result of an INIT. If a new VMSA is to be installed, the VMSA guest page is set as the VMSA in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is cleared in the vCPU VMCB and the vCPU state is set to KVM_MP_STATE_HALTED to prevent it from being run. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Co-developed-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Message-ID: <20240501085210.2213060-13-michael.roth@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-05-01 03:52:02 -05:00
mutex_init(&svm->sev_es.snp_vmsa_mutex);
}
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
KVM: SVM: Save host DR masks on CPUs with DebugSwap When running SEV-SNP guests on a CPU that supports DebugSwap, always save the host's DR0..DR3 mask MSR values irrespective of whether or not DebugSwap is enabled, to ensure the host values aren't clobbered by the CPU. And for now, also save DR0..DR3, even though doing so isn't necessary (see below). SVM_VMGEXIT_AP_CREATE is deeply flawed in that it allows the *guest* to create a VMSA with guest-controlled SEV_FEATURES. A well behaved guest can inform the hypervisor, i.e. KVM, of its "requested" features, but on CPUs without ALLOWED_SEV_FEATURES support, nothing prevents the guest from lying about which SEV features are being enabled (or not!). If a misbehaving guest enables DebugSwap in a secondary vCPU's VMSA, the CPU will load the DR0..DR3 mask MSRs on #VMEXIT, i.e. will clobber the MSRs with '0' if KVM doesn't save its desired value. Note, DR0..DR3 themselves are "ok", as DR7 is reset on #VMEXIT, and KVM restores all DRs in common x86 code as needed via hw_breakpoint_restore(). I.e. there is no risk of host DR0..DR3 being clobbered (when it matters). However, there is a flaw in the opposite direction; because the guest can lie about enabling DebugSwap, i.e. can *disable* DebugSwap without KVM's knowledge, KVM must not rely on the CPU to restore DRs. Defer fixing that wart, as it's more of a documentation issue than a bug in the code. Note, KVM added support for DebugSwap on commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES"), but that is not an appropriate Fixes, as the underlying flaw exists in hardware, not in KVM. I.e. all kernels that support SEV-SNP need to be patched, not just kernels with KVM's full support for DebugSwap (ignoring that DebugSwap support landed first). Opportunistically fix an incorrect statement in the comment; on CPUs without DebugSwap, the CPU does NOT save or load debug registers, i.e. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Cc: stable@vger.kernel.org Cc: Naveen N Rao <naveen@kernel.org> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Alexey Kardashevskiy <aik@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250227012541.3234589-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-02-26 17:25:32 -08:00
struct kvm *kvm = svm->vcpu.kvm;
/*
* All host state for SEV-ES guests is categorized into three swap types
* based on how it is handled by hardware during a world switch:
*
* A: VMRUN: Host state saved in host save area
* VMEXIT: Host state loaded from host save area
*
* B: VMRUN: Host state _NOT_ saved in host save area
* VMEXIT: Host state loaded from host save area
*
* C: VMRUN: Host state _NOT_ saved in host save area
* VMEXIT: Host state initialized to default(reset) values
*
* Manually save type-B state, i.e. state that is loaded by VMEXIT but
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
* by common SVM code).
*/
hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
hostsa->xss = kvm_host.xss;
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
KVM: SVM: Save host DR masks on CPUs with DebugSwap When running SEV-SNP guests on a CPU that supports DebugSwap, always save the host's DR0..DR3 mask MSR values irrespective of whether or not DebugSwap is enabled, to ensure the host values aren't clobbered by the CPU. And for now, also save DR0..DR3, even though doing so isn't necessary (see below). SVM_VMGEXIT_AP_CREATE is deeply flawed in that it allows the *guest* to create a VMSA with guest-controlled SEV_FEATURES. A well behaved guest can inform the hypervisor, i.e. KVM, of its "requested" features, but on CPUs without ALLOWED_SEV_FEATURES support, nothing prevents the guest from lying about which SEV features are being enabled (or not!). If a misbehaving guest enables DebugSwap in a secondary vCPU's VMSA, the CPU will load the DR0..DR3 mask MSRs on #VMEXIT, i.e. will clobber the MSRs with '0' if KVM doesn't save its desired value. Note, DR0..DR3 themselves are "ok", as DR7 is reset on #VMEXIT, and KVM restores all DRs in common x86 code as needed via hw_breakpoint_restore(). I.e. there is no risk of host DR0..DR3 being clobbered (when it matters). However, there is a flaw in the opposite direction; because the guest can lie about enabling DebugSwap, i.e. can *disable* DebugSwap without KVM's knowledge, KVM must not rely on the CPU to restore DRs. Defer fixing that wart, as it's more of a documentation issue than a bug in the code. Note, KVM added support for DebugSwap on commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES"), but that is not an appropriate Fixes, as the underlying flaw exists in hardware, not in KVM. I.e. all kernels that support SEV-SNP need to be patched, not just kernels with KVM's full support for DebugSwap (ignoring that DebugSwap support landed first). Opportunistically fix an incorrect statement in the comment; on CPUs without DebugSwap, the CPU does NOT save or load debug registers, i.e. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Cc: stable@vger.kernel.org Cc: Naveen N Rao <naveen@kernel.org> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Alexey Kardashevskiy <aik@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250227012541.3234589-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-02-26 17:25:32 -08:00
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
* not save or load debug registers. Sadly, KVM can't prevent SNP
* guests from lying about DebugSwap on secondary vCPUs, i.e. the
* SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
* the guest has actually enabled (or not!) in the VMSA.
*
* If DebugSwap is *possible*, save the masks so that they're restored
* if the guest enables DebugSwap. But for the DRs themselves, do NOT
* rely on the CPU to restore the host values; KVM will restore them as
* needed in common code, via hw_breakpoint_restore(). Note, KVM does
* NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
* don't need to be restored per se, KVM just needs to ensure they are
* loaded with the correct values *if* the CPU writes the MSRs.
*/
KVM: SVM: Save host DR masks on CPUs with DebugSwap When running SEV-SNP guests on a CPU that supports DebugSwap, always save the host's DR0..DR3 mask MSR values irrespective of whether or not DebugSwap is enabled, to ensure the host values aren't clobbered by the CPU. And for now, also save DR0..DR3, even though doing so isn't necessary (see below). SVM_VMGEXIT_AP_CREATE is deeply flawed in that it allows the *guest* to create a VMSA with guest-controlled SEV_FEATURES. A well behaved guest can inform the hypervisor, i.e. KVM, of its "requested" features, but on CPUs without ALLOWED_SEV_FEATURES support, nothing prevents the guest from lying about which SEV features are being enabled (or not!). If a misbehaving guest enables DebugSwap in a secondary vCPU's VMSA, the CPU will load the DR0..DR3 mask MSRs on #VMEXIT, i.e. will clobber the MSRs with '0' if KVM doesn't save its desired value. Note, DR0..DR3 themselves are "ok", as DR7 is reset on #VMEXIT, and KVM restores all DRs in common x86 code as needed via hw_breakpoint_restore(). I.e. there is no risk of host DR0..DR3 being clobbered (when it matters). However, there is a flaw in the opposite direction; because the guest can lie about enabling DebugSwap, i.e. can *disable* DebugSwap without KVM's knowledge, KVM must not rely on the CPU to restore DRs. Defer fixing that wart, as it's more of a documentation issue than a bug in the code. Note, KVM added support for DebugSwap on commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES"), but that is not an appropriate Fixes, as the underlying flaw exists in hardware, not in KVM. I.e. all kernels that support SEV-SNP need to be patched, not just kernels with KVM's full support for DebugSwap (ignoring that DebugSwap support landed first). Opportunistically fix an incorrect statement in the comment; on CPUs without DebugSwap, the CPU does NOT save or load debug registers, i.e. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Cc: stable@vger.kernel.org Cc: Naveen N Rao <naveen@kernel.org> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Alexey Kardashevskiy <aik@amd.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250227012541.3234589-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-02-26 17:25:32 -08:00
if (sev_vcpu_has_debug_swap(svm) ||
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
}
}
KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Message-Id: <e8fbebe8eb161ceaabdad7c01a5859a78b424d5e.1609791600.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-01-04 14:20:01 -06:00
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
struct vcpu_svm *svm = to_svm(vcpu);
/* First SIPI: Use the values as initially set by the VMM */
if (!svm->sev_es.received_first_sipi) {
svm->sev_es.received_first_sipi = true;
KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Message-Id: <e8fbebe8eb161ceaabdad7c01a5859a78b424d5e.1609791600.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-01-04 14:20:01 -06:00
return;
}
/* Subsequent SIPI */
switch (svm->sev_es.ap_reset_hold_type) {
case AP_RESET_HOLD_NAE_EVENT:
/*
* Return from an AP Reset Hold VMGEXIT, where the guest will
* set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
*/
svm_vmgexit_success(svm, 1);
break;
case AP_RESET_HOLD_MSR_PROTO:
/*
* Return from an AP Reset Hold VMGEXIT, where the guest will
* set the CS and RIP. Set GHCB data field to a non-zero value.
*/
set_ghcb_msr_bits(svm, 1,
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS);
break;
default:
break;
}
KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> Message-Id: <e8fbebe8eb161ceaabdad7c01a5859a78b424d5e.1609791600.git.thomas.lendacky@amd.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-01-04 14:20:01 -06:00
}
KVM: SEV: Make AVIC backing, VMSA and VMCB memory allocation SNP safe Implement a workaround for an SNP erratum where the CPU will incorrectly signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the RMP entry of a VMCB, VMSA or AVIC backing page. When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC backing pages as "in-use" via a reserved bit in the corresponding RMP entry after a successful VMRUN. This is done for _all_ VMs, not just SNP-Active VMs. If the hypervisor accesses an in-use page through a writable translation, the CPU will throw an RMP violation #PF. On early SNP hardware, if an in-use page is 2MB-aligned and software accesses any part of the associated 2MB region with a hugepage, the CPU will incorrectly treat the entire 2MB region as in-use and signal a an RMP violation #PF. To avoid this, the recommendation is to not use a 2MB-aligned page for the VMCB, VMSA or AVIC pages. Add a generic allocator that will ensure that the page returned is not 2MB-aligned and is safe to be used when SEV-SNP is enabled. Also implement similar handling for the VMCB/VMSA pages of nested guests. [ mdr: Squash in nested guest handling from Ashish, commit msg fixups. ] Reported-by: Alper Gun <alpergun@google.com> # for nested VMSA case Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Marc Orr <marcorr@google.com> Signed-off-by: Marc Orr <marcorr@google.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paolo Bonzini <pbonzini@redhat.com> Link: https://lore.kernel.org/r/20240126041126.1927228-22-michael.roth@amd.com
2024-01-25 22:11:21 -06:00
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
KVM: SEV: Make AVIC backing, VMSA and VMCB memory allocation SNP safe Implement a workaround for an SNP erratum where the CPU will incorrectly signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the RMP entry of a VMCB, VMSA or AVIC backing page. When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC backing pages as "in-use" via a reserved bit in the corresponding RMP entry after a successful VMRUN. This is done for _all_ VMs, not just SNP-Active VMs. If the hypervisor accesses an in-use page through a writable translation, the CPU will throw an RMP violation #PF. On early SNP hardware, if an in-use page is 2MB-aligned and software accesses any part of the associated 2MB region with a hugepage, the CPU will incorrectly treat the entire 2MB region as in-use and signal a an RMP violation #PF. To avoid this, the recommendation is to not use a 2MB-aligned page for the VMCB, VMSA or AVIC pages. Add a generic allocator that will ensure that the page returned is not 2MB-aligned and is safe to be used when SEV-SNP is enabled. Also implement similar handling for the VMCB/VMSA pages of nested guests. [ mdr: Squash in nested guest handling from Ashish, commit msg fixups. ] Reported-by: Alper Gun <alpergun@google.com> # for nested VMSA case Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Marc Orr <marcorr@google.com> Signed-off-by: Marc Orr <marcorr@google.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paolo Bonzini <pbonzini@redhat.com> Link: https://lore.kernel.org/r/20240126041126.1927228-22-michael.roth@amd.com
2024-01-25 22:11:21 -06:00
{
unsigned long pfn;
struct page *p;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
KVM: SEV: Make AVIC backing, VMSA and VMCB memory allocation SNP safe Implement a workaround for an SNP erratum where the CPU will incorrectly signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the RMP entry of a VMCB, VMSA or AVIC backing page. When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC backing pages as "in-use" via a reserved bit in the corresponding RMP entry after a successful VMRUN. This is done for _all_ VMs, not just SNP-Active VMs. If the hypervisor accesses an in-use page through a writable translation, the CPU will throw an RMP violation #PF. On early SNP hardware, if an in-use page is 2MB-aligned and software accesses any part of the associated 2MB region with a hugepage, the CPU will incorrectly treat the entire 2MB region as in-use and signal a an RMP violation #PF. To avoid this, the recommendation is to not use a 2MB-aligned page for the VMCB, VMSA or AVIC pages. Add a generic allocator that will ensure that the page returned is not 2MB-aligned and is safe to be used when SEV-SNP is enabled. Also implement similar handling for the VMCB/VMSA pages of nested guests. [ mdr: Squash in nested guest handling from Ashish, commit msg fixups. ] Reported-by: Alper Gun <alpergun@google.com> # for nested VMSA case Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Marc Orr <marcorr@google.com> Signed-off-by: Marc Orr <marcorr@google.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paolo Bonzini <pbonzini@redhat.com> Link: https://lore.kernel.org/r/20240126041126.1927228-22-michael.roth@amd.com
2024-01-25 22:11:21 -06:00
/*
* Allocate an SNP-safe page to workaround the SNP erratum where
* the CPU will incorrectly signal an RMP violation #PF if a
* hugepage (2MB or 1GB) collides with the RMP entry of a
* 2MB-aligned VMCB, VMSA, or AVIC backing page.
*
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
*/
p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
KVM: SEV: Make AVIC backing, VMSA and VMCB memory allocation SNP safe Implement a workaround for an SNP erratum where the CPU will incorrectly signal an RMP violation #PF if a hugepage (2MB or 1GB) collides with the RMP entry of a VMCB, VMSA or AVIC backing page. When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC backing pages as "in-use" via a reserved bit in the corresponding RMP entry after a successful VMRUN. This is done for _all_ VMs, not just SNP-Active VMs. If the hypervisor accesses an in-use page through a writable translation, the CPU will throw an RMP violation #PF. On early SNP hardware, if an in-use page is 2MB-aligned and software accesses any part of the associated 2MB region with a hugepage, the CPU will incorrectly treat the entire 2MB region as in-use and signal a an RMP violation #PF. To avoid this, the recommendation is to not use a 2MB-aligned page for the VMCB, VMSA or AVIC pages. Add a generic allocator that will ensure that the page returned is not 2MB-aligned and is safe to be used when SEV-SNP is enabled. Also implement similar handling for the VMCB/VMSA pages of nested guests. [ mdr: Squash in nested guest handling from Ashish, commit msg fixups. ] Reported-by: Alper Gun <alpergun@google.com> # for nested VMSA case Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Co-developed-by: Marc Orr <marcorr@google.com> Signed-off-by: Marc Orr <marcorr@google.com> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com> Signed-off-by: Michael Roth <michael.roth@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paolo Bonzini <pbonzini@redhat.com> Link: https://lore.kernel.org/r/20240126041126.1927228-22-michael.roth@amd.com
2024-01-25 22:11:21 -06:00
if (!p)
return NULL;
split_page(p, 1);
pfn = page_to_pfn(p);
if (IS_ALIGNED(pfn, PTRS_PER_PMD))
__free_page(p++);
else
__free_page(p + 1);
return p;
}
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
{
struct kvm_memory_slot *slot;
struct kvm *kvm = vcpu->kvm;
int order, rmp_level, ret;
struct page *page;
bool assigned;
kvm_pfn_t pfn;
gfn_t gfn;
gfn = gpa >> PAGE_SHIFT;
/*
* The only time RMP faults occur for shared pages is when the guest is
* triggering an RMP fault for an implicit page-state change from
* shared->private. Implicit page-state changes are forwarded to
* userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
* for shared pages should not end up here.
*/
if (!kvm_mem_is_private(kvm, gfn)) {
pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
gpa);
return;
}
slot = gfn_to_memslot(kvm, gfn);
if (!kvm_slot_can_be_private(slot)) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
gpa);
return;
}
ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
if (ret) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
gpa);
return;
}
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
if (ret || !assigned) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
gpa, pfn, ret);
goto out_no_trace;
}
/*
* There are 2 cases where a PSMASH may be needed to resolve an #NPF
* with PFERR_GUEST_RMP_BIT set:
*
* 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
* bit set if the guest issues them with a smaller granularity than
* what is indicated by the page-size bit in the 2MB RMP entry for
* the PFN that backs the GPA.
*
* 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
* smaller than what is indicated by the 2MB RMP entry for the PFN
* that backs the GPA.
*
* In both these cases, the corresponding 2M RMP entry needs to
* be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
* split into 4K RMP entries, then this is likely a spurious case which
* can occur when there are concurrent accesses by the guest to a 2MB
* GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in
* the process of being PMASH'd into 4K entries. These cases should
* resolve automatically on subsequent accesses, so just ignore them
* here.
*/
if (rmp_level == PG_LEVEL_4K)
goto out;
ret = snp_rmptable_psmash(pfn);
if (ret) {
/*
* Look it up again. If it's 4K now then the PSMASH may have
* raced with another process and the issue has already resolved
* itself.
*/
if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
assigned && rmp_level == PG_LEVEL_4K)
goto out;
pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
gpa, pfn, ret);
}
kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
out:
trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
out_no_trace:
kvm_release_page_unused(page);
}
static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
{
kvm_pfn_t pfn = start;
while (pfn < end) {
int ret, rmp_level;
bool assigned;
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
if (ret) {
pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
pfn, start, end, rmp_level, ret);
return false;
}
if (assigned) {
pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
__func__, pfn, start, end, rmp_level);
return false;
}
pfn++;
}
return true;
}
static u8 max_level_for_order(int order)
{
if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
return PG_LEVEL_2M;
return PG_LEVEL_4K;
}
static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
{
kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
/*
* If this is a large folio, and the entire 2M range containing the
* PFN is currently shared, then the entire 2M-aligned range can be
* set to private via a single 2M RMP entry.
*/
if (max_level_for_order(order) > PG_LEVEL_4K &&
is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
return true;
return false;
}
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{
struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
kvm_pfn_t pfn_aligned;
gfn_t gfn_aligned;
int level, rc;
bool assigned;
if (!sev_snp_guest(kvm))
return 0;
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
if (rc) {
pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
gfn, pfn, rc);
return -ENOENT;
}
if (assigned) {
pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
__func__, gfn, pfn, max_order, level);
return 0;
}
if (is_large_rmp_possible(kvm, pfn, max_order)) {
level = PG_LEVEL_2M;
pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
} else {
level = PG_LEVEL_4K;
pfn_aligned = pfn;
gfn_aligned = gfn;
}
rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
if (rc) {
pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
gfn, pfn, level, rc);
return -EINVAL;
}
pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
__func__, gfn, pfn, pfn_aligned, max_order, level);
return 0;
}
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
{
kvm_pfn_t pfn;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return;
pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
for (pfn = start; pfn < end;) {
bool use_2m_update = false;
int rc, rmp_level;
bool assigned;
rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
if (rc || !assigned)
goto next_pfn;
use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
end >= (pfn + PTRS_PER_PMD) &&
rmp_level > PG_LEVEL_4K;
/*
* If an unaligned PFN corresponds to a 2M region assigned as a
* large page in the RMP table, PSMASH the region into individual
* 4K RMP entries before attempting to convert a 4K sub-page.
*/
if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
/*
* This shouldn't fail, but if it does, report it, but
* still try to update RMP entry to shared and pray this
* was a spurious error that can be addressed later.
*/
rc = snp_rmptable_psmash(pfn);
WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
pfn, rc);
}
rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
pfn, rc))
goto next_pfn;
/*
* SEV-ES avoids host/guest cache coherency issues through
KVM: SEV: Prefer WBNOINVD over WBINVD for cache maintenance efficiency AMD CPUs currently execute WBINVD in the host when unregistering SEV guest memory or when deactivating SEV guests. Such cache maintenance is performed to prevent data corruption, wherein the encrypted (C=1) version of a dirty cache line might otherwise only be written back after the memory is written in a different context (ex: C=0), yielding corruption. However, WBINVD is performance-costly, especially because it invalidates processor caches. Strictly-speaking, unless the SEV ASID is being recycled (meaning the SNP firmware requires the use of WBINVD prior to DF_FLUSH), the cache invalidation triggered by WBINVD is unnecessary; only the writeback is needed to prevent data corruption in remaining scenarios. To improve performance in these scenarios, use WBNOINVD when available instead of WBINVD. WBNOINVD still writes back all dirty lines (preventing host data corruption by SEV guests) but does *not* invalidate processor caches. Note that the implementation of wbnoinvd() ensures fall back to WBINVD if WBNOINVD is unavailable. In anticipation of forthcoming optimizations to limit the WBNOINVD only to physical CPUs that have executed SEV guests, place the call to wbnoinvd_on_all_cpus() in a wrapper function sev_writeback_caches(). Signed-off-by: Kevin Loughlin <kevinloughlin@google.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com> Link: https://lore.kernel.org/r/20250201000259.3289143-3-kevinloughlin@google.com [sean: tweak comment regarding CLFUSH] Cc: Francesco Lavra <francescolavra.fl@gmail.com> Link: https://lore.kernel.org/r/20250522233733.3176144-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-05-22 16:37:31 -07:00
* WBNOINVD hooks issued via MMU notifiers during run-time, and
* KVM's VM destroy path at shutdown. Those MMU notifier events
* don't cover gmem since there is no requirement to map pages
* to a HVA in order to use them for a running guest. While the
* shutdown path would still likely cover things for SNP guests,
* userspace may also free gmem pages during run-time via
* hole-punching operations on the guest_memfd, so flush the
* cache entries for these pages before free'ing them back to
* the host.
*/
clflush_cache_range(__va(pfn_to_hpa(pfn)),
use_2m_update ? PMD_SIZE : PAGE_SIZE);
next_pfn:
pfn += use_2m_update ? PTRS_PER_PMD : 1;
cond_resched();
}
}
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{
int level, rc;
bool assigned;
if (!sev_snp_guest(kvm))
return 0;
rc = snp_lookup_rmpentry(pfn, &assigned, &level);
if (rc || !assigned)
return PG_LEVEL_4K;
return level;
}
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_save_area *vmsa;
struct kvm_sev_info *sev;
int error = 0;
int ret;
if (!sev_es_guest(vcpu->kvm))
return NULL;
/*
* If the VMSA has not yet been encrypted, return a pointer to the
* current un-encrypted VMSA.
*/
if (!vcpu->arch.guest_state_protected)
return (struct vmcb_save_area *)svm->sev_es.vmsa;
sev = to_kvm_sev_info(vcpu->kvm);
/* Check if the SEV policy allows debugging */
if (sev_snp_guest(vcpu->kvm)) {
if (!(sev->policy & SNP_POLICY_DEBUG))
return NULL;
} else {
if (sev->policy & SEV_POLICY_NODBG)
return NULL;
}
if (sev_snp_guest(vcpu->kvm)) {
struct sev_data_snp_dbg dbg = {0};
vmsa = snp_alloc_firmware_page(__GFP_ZERO);
if (!vmsa)
return NULL;
dbg.gctx_paddr = __psp_pa(sev->snp_context);
dbg.src_addr = svm->vmcb->control.vmsa_pa;
dbg.dst_addr = __psp_pa(vmsa);
ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
/*
* Return the target page to a hypervisor page no matter what.
* If this fails, the page can't be used, so leak it and don't
* try to use it.
*/
if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
return NULL;
if (ret) {
pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
ret, error, error);
free_page((unsigned long)vmsa);
return NULL;
}
} else {
struct sev_data_dbg dbg = {0};
struct page *vmsa_page;
vmsa_page = alloc_page(GFP_KERNEL);
if (!vmsa_page)
return NULL;
vmsa = page_address(vmsa_page);
dbg.handle = sev->handle;
dbg.src_addr = svm->vmcb->control.vmsa_pa;
dbg.dst_addr = __psp_pa(vmsa);
dbg.len = PAGE_SIZE;
ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
if (ret) {
pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
ret, error, error);
__free_page(vmsa_page);
return NULL;
}
}
return vmsa;
}
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
{
/* If the VMSA has not yet been encrypted, nothing was allocated */
if (!vcpu->arch.guest_state_protected || !vmsa)
return;
free_page((unsigned long)vmsa);
}