/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_MMU_INTERNAL_H
#define __KVM_X86_MMU_INTERNAL_H

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>

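/*
 * KVM_MMU_WARN_ON() expands to WARN_ON_ONCE() when CONFIG_KVM_PROVE_MMU=y,
 * and otherwise compiles away while still compile-checking its argument via
 * BUILD_BUG_ON_INVALID().  WARN_ON_ONCE() is used because a failing runtime
 * assertion is likely to fire on every subsequent fault and spam the log.
 */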
#ifdef CONFIG_KVM_PROVE_MMU
#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
#endif

/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12)
#define __PT_LEVEL_SHIFT(level, bits_per_level) \
        (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
        (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))

#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
        ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))

#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
        ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))

#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
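
/*
 * Worked example (illustrative): with 9 bits per level, as used by 64-bit
 * paging, __PT_LEVEL_SHIFT(2, 9) == 12 + (2 - 1) * 9 == 21, so
 * __PT_INDEX(addr, 2, 9) selects bits 29:21 of the address,
 * __PT_LVL_ADDR_MASK(__PT_BASE_ADDR_MASK, 2, 9) yields the 2MiB-aligned
 * address of the region mapped at that level, and __PT_ENT_PER_PAGE(9)
 * is 512 entries per table.
 */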

/*
 * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
 * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
 * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
 * as the CPU would treat that as a PRESENT PDPTR with reserved bits set.  Use
 * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
 */
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
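
/*
 * KVM installs a dummy root, backed by the zero page, when the guest's root
 * gfn isn't visible to KVM, e.g. because it points at MMIO.  Any translation
 * through the dummy root yields a !PRESENT fault, so the vCPU can be run and
 * the resulting fault handled or reported instead of immediately synthesizing
 * a triple fault.
 */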
static inline hpa_t kvm_mmu_get_dummy_root(void)
{
        return my_zero_pfn(0) << PAGE_SHIFT;
}

static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
{
        return is_zero_pfn(shadow_page >> PAGE_SHIFT);
}

typedef u64 __rcu *tdp_ptep_t;

struct kvm_mmu_page {
        /*
         * Note, "link" through "spt" fit in a single 64 byte cache line on
         * 64-bit kernels, keep it that way unless there's a reason not to.
         */
        struct list_head link;
        struct hlist_node hash_link;

        bool tdp_mmu_page;
        bool unsync;
        union {
                u8 mmu_valid_gen;

                /* Only accessed under slots_lock. */
                bool tdp_mmu_scheduled_root_to_zap;
        };

        /*
         * The shadow page can't be replaced by an equivalent huge page
         * because it is being used to map an executable page in the guest
         * and the NX huge page mitigation is enabled.
         */
        bool nx_huge_page_disallowed;

        /*
         * The following two entries are used to key the shadow page in the
         * hash table.
         */
        union kvm_mmu_page_role role;
        gfn_t gfn;

        u64 *spt;

        /*
         * Stores the result of the guest translation being shadowed by each
         * SPTE.  KVM shadows two types of guest translations: nGPA -> GPA
         * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
         * cases the result of the translation is a GPA and a set of access
         * constraints.
         *
         * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
         * access permissions are stored in the lower bits.  Note, for
         * convenience and uniformity across guests, the access permissions are
         * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
         */
        u64 *shadowed_translation;
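
        /*
         * Illustrative sketch of the encoding (the authoritative helpers live
         * in mmu.c, this is not an additional definition):
         *
         *      entry  = (gfn << PAGE_SHIFT) | access;
         *      gfn    = entry >> PAGE_SHIFT;
         *      access = entry & ACC_ALL;
         *
         * where "access" occupies the low 3 bits in KVM's ACC_* format.
         */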

        /* Currently serving as active root */
        union {
                int root_count;
                refcount_t tdp_mmu_root_count;
        };
        unsigned int unsync_children;
        union {
                struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
                tdp_ptep_t ptep;
        };
        DECLARE_BITMAP(unsync_child_bitmap, 512);

        /*
         * Tracks shadow pages that, if zapped, would allow KVM to create an NX
         * huge page.  A shadow page will have nx_huge_page_disallowed set but
         * not be on the list if a huge page is disallowed for other reasons,
         * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
         * isn't properly aligned, etc...
         */
        struct list_head possible_nx_huge_page_link;
#ifdef CONFIG_X86_32
        /*
         * Used out of the mmu-lock to avoid reading spte values while an
         * update is in progress; see the comments in __get_spte_lockless().
         */
        int clear_spte_count;
#endif

        /* Number of writes since the last time traversal visited this page. */
        atomic_t write_flooding_count;

#ifdef CONFIG_X86_64
        /* Used for freeing the page asynchronously if it is a TDP MMU page. */
        struct rcu_head rcu_head;
#endif
};

extern struct kmem_cache *mmu_page_header_cache;

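/*
 * The "as_id" below is the memslot address space index: '1' when the role
 * is SMM, '0' otherwise.
 */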
static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
{
        return role.smm ? 1 : 0;
}

static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return kvm_mmu_role_as_id(sp->role);
}

static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
        /*
         * When using the EPT page-modification log, the GPAs in the CPU dirty
         * log would come from L2 rather than L1.  Therefore, we need to rely
         * on write protection to record dirty pages, which bypasses PML, since
         * writes now result in a vmexit.  Note, the check on CPU dirty logging
         * being enabled is mandatory as the bits used to denote WP-only SPTEs
         * are reserved for PAE paging (32-bit KVM).
         */
        return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
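
/*
 * Worked example for the rounding helper below: KVM_PAGES_PER_HPAGE(level)
 * is a power of two, so negating it yields an alignment mask, e.g. for a
 * 2MiB level it is 512 and "gfn & -512" rounds the gfn down to a 512-page
 * (2MiB) boundary.
 */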
static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
{
        return gfn & -KVM_PAGES_PER_HPAGE(level);
}

int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
                            gfn_t gfn, bool can_unsync, bool prefetch);

void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                                    struct kvm_memory_slot *slot, u64 gfn,
                                    int min_level);

/* Flush the given page (huge or not) of guest memory. */
static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
{
        kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
                                    KVM_PAGES_PER_HPAGE(level));
}

unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);

extern int nx_huge_pages;
static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
{
        return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
}

struct kvm_page_fault {
        /* arguments to kvm_mmu_do_page_fault. */
        const gpa_t addr;
        const u64 error_code;
        const bool prefetch;

        /* Derived from error_code. */
        const bool exec;
        const bool write;
        const bool present;
        const bool rsvd;
        const bool user;

        /* Derived from mmu and global state. */
        const bool is_tdp;
        const bool is_private;
        const bool nx_huge_page_workaround_enabled;

        /*
         * Whether a >4KB mapping can be created or is forbidden due to NX
         * hugepages.
         */
        bool huge_page_disallowed;

        /*
         * Maximum page size that can be created for this fault; input to
         * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
         */
        u8 max_level;

        /*
         * Page size that can be created based on the max_level and the
         * page size used by the host mapping.
         */
        u8 req_level;

        /*
         * Page size that will be created based on the req_level and
         * huge_page_disallowed.
         */
        u8 goal_level;

        /* Shifted addr, or result of guest page table walk if addr is a gva. */
        gfn_t gfn;

        /* The memslot containing gfn. May be NULL. */
        struct kvm_memory_slot *slot;

        /* Outputs of kvm_faultin_pfn. */
        unsigned long mmu_seq;
        kvm_pfn_t pfn;
        hva_t hva;
        bool map_writable;

        /*
         * Indicates the guest is trying to write a gfn that contains one or
         * more of the PTEs used to translate the write itself, i.e. the access
         * is changing its own translation in the guest page tables.
         */
        bool write_fault_to_shadow_pgtable;
};

int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);

/*
 * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
 * and of course kvm_mmu_do_page_fault().
 *
 * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
 * RET_PF_RETRY: let CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 * RET_PF_FIXED: The faulting entry has been fixed.
 * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
 *
 * Any names added to this enum should be exported to userspace for use in
 * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
 *
 * Note, all values must be greater than or equal to zero so as not to encroach
 * on -errno return values.  Somewhat arbitrarily use '0' for CONTINUE, which
 * will allow for efficient machine code when checking for CONTINUE, e.g.
 * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
 */
enum {
        RET_PF_CONTINUE = 0,
        RET_PF_RETRY,
        RET_PF_EMULATE,
        RET_PF_INVALID,
        RET_PF_FIXED,
        RET_PF_SPURIOUS,
};
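
/*
 * Illustrative usage (a sketch, not a helper defined here): fault-path
 * helpers return one of the values above, so callers can bail on anything
 * other than RET_PF_CONTINUE with a simple non-zero check, e.g.
 *
 *      r = some_fault_step(vcpu, fault);
 *      if (r != RET_PF_CONTINUE)
 *              return r;
 *
 * where some_fault_step() stands in for any helper in the fault path.
 */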

static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
                                                     struct kvm_page_fault *fault)
{
        kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
                                      PAGE_SIZE, fault->write, fault->exec,
                                      fault->is_private);
}

static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                                        u64 err, bool prefetch, int *emulation_type)
{
        struct kvm_page_fault fault = {
                .addr = cr2_or_gpa,
                .error_code = err,
                .exec = err & PFERR_FETCH_MASK,
                .write = err & PFERR_WRITE_MASK,
                .present = err & PFERR_PRESENT_MASK,
                .rsvd = err & PFERR_RSVD_MASK,
                .user = err & PFERR_USER_MASK,
                .prefetch = prefetch,
                .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
                .nx_huge_page_workaround_enabled =
                        is_nx_huge_page_enabled(vcpu->kvm),

                .max_level = KVM_MAX_HUGEPAGE_LEVEL,
                .req_level = PG_LEVEL_4K,
                .goal_level = PG_LEVEL_4K,
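                /*
                 * PFERR_PRIVATE_ACCESS is a KVM-defined, synthetic error code
                 * flag: TDX and SNP report private vs. shared accesses via
                 * different mechanisms, so KVM normalizes that information
                 * into the error code before it reaches this point.
                 */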
                .is_private = err & PFERR_PRIVATE_ACCESS,

                .pfn = KVM_PFN_ERR_FAULT,
                .hva = KVM_HVA_ERR_BAD,
        };
        int r;

        if (vcpu->arch.mmu->root_role.direct) {
                fault.gfn = fault.addr >> PAGE_SHIFT;
                fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
        }

        /*
         * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
         * guest perspective and have already been counted at the time of the
         * original fault.
         */
        if (!prefetch)
                vcpu->stat.pf_taken++;

        if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
                r = kvm_tdp_page_fault(vcpu, &fault);
        else
                r = vcpu->arch.mmu->page_fault(vcpu, &fault);

        /*
         * Not sure what's happening, but punt to userspace and hope that
         * they can fix it by changing memory to shared, or they can
         * provide a better error.
         */
        if (r == RET_PF_EMULATE && fault.is_private) {
                pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
                kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
                return -EFAULT;
        }

        if (fault.write_fault_to_shadow_pgtable && emulation_type)
                *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;

        /*
         * Similar to above, prefetch faults aren't truly spurious, and the
         * async #PF path doesn't do emulation.  Do count faults that are fixed
         * by the async #PF handler though, otherwise they'll never be counted.
         */
        if (r == RET_PF_FIXED)
                vcpu->stat.pf_fixed++;
        else if (prefetch)
                ;
        else if (r == RET_PF_EMULATE)
                vcpu->stat.pf_emulate++;
        else if (r == RET_PF_SPURIOUS)
                vcpu->stat.pf_spurious++;
        return r;
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
                              const struct kvm_memory_slot *slot, gfn_t gfn,
                              int max_level);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);

void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);

#endif /* __KVM_X86_MMU_INTERNAL_H */