/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_MMU_H__
#define __ARM64_KVM_MMU_H__

#include <asm/page.h>
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/cpufeature.h>
/*
 * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express
 * "negative" addresses. This makes it impossible to directly share
 * mappings with the kernel.
 *
 * Instead, give the HYP mode its own VA region at a fixed offset from
 * the kernel by just masking the top bits (which are all ones for a
 * kernel address). We need to find out how many bits to mask.
 *
 * We want to build a set of page tables that cover both parts of the
 * idmap (the trampoline page used to initialize EL2), and our normal
 * runtime VA space, at the same time.
 *
 * Given that the kernel uses VA_BITS for its entire address space,
 * and that half of that space (VA_BITS - 1) is used for the linear
 * mapping, we can also limit the EL2 space to (VA_BITS - 1).
 *
 * The main question is "Within the VA_BITS space, does EL2 use the
 * top or the bottom half of that space to shadow the kernel's linear
 * mapping?". As we need to idmap the trampoline page, this is
 * determined by the range in which this page lives.
 *
 * If the page is in the bottom half, we have to use the top half. If
 * the page is in the top half, we have to use the bottom half:
 *
 * T = __pa_symbol(__hyp_idmap_text_start)
 * if (T & BIT(VA_BITS - 1))
 *	HYP_VA_MIN = 0  //idmap in upper half
 * else
 *	HYP_VA_MIN = 1 << (VA_BITS - 1)
 * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
 *
 * When using VHE, there are no separate hyp mappings and all KVM
 * functionality is already mapped as part of the main kernel
 * mappings, and none of this applies in that case.
 */
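
/*
 * Worked example (illustrative only, not taken from the original file):
 * assuming VA_BITS = 48 and __hyp_idmap_text_start landing in the bottom
 * half of that range (bit 47 clear), the pseudocode above gives:
 *
 *	HYP_VA_MIN = 1 << 47
 *	HYP_VA_MAX = (1 << 47) + (1 << 47) - 1 = (1 << 48) - 1
 *
 * i.e. the idmap keeps the bottom half and HYP shadows the kernel's
 * linear mapping in the top half of the VA_BITS space.
 */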

#ifdef __ASSEMBLY__

#include <asm/alternative.h>

/*
 * Convert a hypervisor VA to a PA
 * reg: hypervisor address to be converted in place
 * tmp: temporary register
 */
.macro hyp_pa reg, tmp
	ldr_l	\tmp, hyp_physvirt_offset
	add	\reg, \reg, \tmp
.endm
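
/*
 * Usage sketch (illustrative only, not part of the original file):
 * converting a hyp VA held in x0, using x1 as the scratch register:
 *
 *	hyp_pa	x0, x1		// x0 now holds the corresponding PA
 */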

/*
 * Convert a hypervisor VA to a kernel image address
 * reg: hypervisor address to be converted in place
 * tmp: temporary register
 *
 * The actual code generation takes place in kvm_get_kimage_voffset, and
 * the instructions below are only there to reserve the space and
 * perform the register allocation (kvm_get_kimage_voffset uses the
 * specific registers encoded in the instructions).
 */
.macro hyp_kimg_va reg, tmp
	/* Convert hyp VA -> PA. */
	hyp_pa	\reg, \tmp

	/* Load kimage_voffset. */
alternative_cb ARM64_ALWAYS_SYSTEM, kvm_get_kimage_voffset
	movz	\tmp, #0
	movk	\tmp, #0, lsl #16
	movk	\tmp, #0, lsl #32
	movk	\tmp, #0, lsl #48
alternative_cb_end

	/* Convert PA -> kimg VA. */
	add	\reg, \reg, \tmp
.endm

#else

#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
#include <asm/kvm_nested.h>

void kvm_update_va_mask(struct alt_instr *alt,
			__le32 *origptr, __le32 *updptr, int nr_inst);
void kvm_compute_layout(void);
void kvm_apply_hyp_relocations(void);

#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)

/*
 * Convert a kernel VA into a HYP VA.
 *
 * Can be called from hyp or non-hyp context.
 *
 * The actual code generation takes place in kvm_update_va_mask(), and
 * the instructions below are only there to reserve the space and
 * perform the register allocation (kvm_update_va_mask() uses the
 * specific registers encoded in the instructions).
 */
static __always_inline unsigned long __kern_hyp_va(unsigned long v)
{
	/*
	 * This #ifndef is an optimisation for when this is called from VHE hyp
	 * context. When called from a VHE non-hyp context, kvm_update_va_mask() will
	 * replace the instructions with `nop`s.
	 */
#ifndef __KVM_VHE_HYPERVISOR__
	asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"		/* mask with va_mask */
				    "ror %0, %0, #1\n"		/* rotate to the first tag bit */
				    "add %0, %0, #0\n"		/* insert the low 12 bits of the tag */
				    "add %0, %0, #0, lsl 12\n"	/* insert the top 12 bits of the tag */
				    "ror %0, %0, #63\n",	/* rotate back */
				    ARM64_ALWAYS_SYSTEM,
				    kvm_update_va_mask)
		     : "+r" (v));
#endif
	return v;
}

#define kern_hyp_va(v)	((typeof(v))(__kern_hyp_va((unsigned long)(v))))
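
/*
 * Usage sketch (illustrative only, not part of the original file): nVHE
 * hyp code converts kernel pointers before dereferencing them at EL2,
 * along the lines of:
 *
 *	struct kvm_vcpu *hyp_vcpu = kern_hyp_va(vcpu);
 */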

extern u32 __hyp_va_bits;

/*
 * We currently support using a VM-specified IPA size. For backward
 * compatibility, the default IPA size is fixed to 40 bits.
 */
#define KVM_PHYS_SHIFT	(40)

#define kvm_phys_shift(mmu)		VTCR_EL2_IPA((mmu)->vtcr)
#define kvm_phys_size(mmu)		(_AC(1, ULL) << kvm_phys_shift(mmu))
#define kvm_phys_mask(mmu)		(kvm_phys_size(mmu) - _AC(1, ULL))
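
/*
 * Usage sketch (illustrative only, not part of the original file): an IPA
 * range check built on the helpers above could look like:
 *
 *	if (guest_ipa + size > kvm_phys_size(&kvm->arch.mmu))
 *		return -EFAULT;
 */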

#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

int kvm_share_hyp(void *from, void *to);
void kvm_unshare_hyp(void *from, void *to);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot);
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr);
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr);
int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
void __init free_hyp_pgds(void);
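
/*
 * Usage sketch (illustrative only, not part of the original file): mapping
 * a kernel object into the hypervisor VA space before handing it to EL2:
 *
 *	err = create_hyp_mappings(obj, obj + 1, PAGE_HYP);
 *	if (err)
 *		return err;
 */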

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block);
void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);

void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_uninit_stage2_mmu(struct kvm *kvm);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable);

int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);

phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int __init kvm_mmu_init(u32 *hyp_va_bits);

static inline void *__kvm_vector_slot2addr(void *base,
					   enum arm64_hyp_spectre_vector slot)
{
	int idx = slot - (slot != HYP_VECTOR_DIRECT);

	return base + (idx * SZ_2K);
}
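
/*
 * Worked example (illustrative only, and assuming the usual enum ordering
 * HYP_VECTOR_DIRECT = 0, HYP_VECTOR_SPECTRE_DIRECT = 1,
 * HYP_VECTOR_INDIRECT = 2, HYP_VECTOR_SPECTRE_INDIRECT = 3): the first two
 * slots both resolve to base + 0, while the indirect variants land at
 * base + SZ_2K and base + 2 * SZ_2K respectively.
 */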

struct kvm;

#define kvm_flush_dcache_to_poc(a,l)	\
	dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l))

static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{
	u64 cache_bits = SCTLR_ELx_M | SCTLR_ELx_C;
	int reg;

	if (vcpu_is_el2(vcpu))
		reg = SCTLR_EL2;
	else
		reg = SCTLR_EL1;

	return (vcpu_read_sys_reg(vcpu, reg) & cache_bits) == cache_bits;
}

static inline void __clean_dcache_guest_page(void *va, size_t size)
{
	/*
	 * With FWB, we ensure that the guest always accesses memory using
	 * cacheable attributes, and we don't have to clean to PoC when
	 * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
	 * PoU is not required either in this case.
	 */
	if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
		return;

	kvm_flush_dcache_to_poc(va, size);
}

static inline size_t __invalidate_icache_max_range(void)
{
	u8 iminline;
	u64 ctr;

	asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
				    "movk %0, #0, lsl #16\n"
				    "movk %0, #0, lsl #32\n"
				    "movk %0, #0, lsl #48\n",
				    ARM64_ALWAYS_SYSTEM,
				    kvm_compute_final_ctr_el0)
		     : "=r" (ctr));

	iminline = SYS_FIELD_GET(CTR_EL0, IminLine, ctr) + 2;
	return MAX_DVM_OPS << iminline;
}
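
/*
 * Worked example (illustrative only, not from the original file):
 * CTR_EL0.IminLine is log2 of the smallest I-cache line size in words, so
 * IminLine = 4 means 64-byte lines, iminline becomes 6, and the limit
 * evaluates to MAX_DVM_OPS * 64 bytes.
 */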

static inline void __invalidate_icache_guest_page(void *va, size_t size)
{
	/*
	 * Blow the whole I-cache if it is aliasing (i.e. VIPT) or the
	 * invalidation range exceeds our arbitrary limit on invalidations by
	 * cache line.
	 */
	if (icache_is_aliasing() || size > __invalidate_icache_max_range())
		icache_inval_all_pou();
	else
		icache_inval_pou((unsigned long)va, (unsigned long)va + size);
}

void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);

static inline unsigned int kvm_get_vmid_bits(void)
{
	int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);

	return get_vmid_bits(reg);
}

/*
 * We are not in the kvm->srcu critical section most of the time, so we take
 * the SRCU read lock here. Since we copy the data from the user page, we
 * can immediately drop the lock again.
 */
static inline int kvm_read_guest_lock(struct kvm *kvm,
				      gpa_t gpa, void *data, unsigned long len)
{
	int srcu_idx = srcu_read_lock(&kvm->srcu);
	int ret = kvm_read_guest(kvm, gpa, data, len);

	srcu_read_unlock(&kvm->srcu, srcu_idx);

	return ret;
}

static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
				       const void *data, unsigned long len)
{
	int srcu_idx = srcu_read_lock(&kvm->srcu);
	int ret = kvm_write_guest(kvm, gpa, data, len);

	srcu_read_unlock(&kvm->srcu, srcu_idx);

	return ret;
}
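
/*
 * Usage sketch (illustrative only, not part of the original file): callers
 * that are not already inside a kvm->srcu read-side section (e.g. vgic ITS
 * table walks) use these wrappers instead of kvm_read_guest() and
 * kvm_write_guest():
 *
 *	ret = kvm_read_guest_lock(kvm, gpa, &entry, sizeof(entry));
 */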

#define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)

/*
 * When this is (directly or indirectly) used on the TLB invalidation
 * path, we rely on a previously issued DSB so that page table updates
 * and VMID reads are correctly ordered.
 */
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
	struct kvm_vmid *vmid = &mmu->vmid;
	u64 vmid_field, baddr;
	u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;

	baddr = mmu->pgd_phys;
	vmid_field = atomic64_read(&vmid->id) << VTTBR_VMID_SHIFT;
	vmid_field &= VTTBR_VMID_MASK(kvm_arm_vmid_bits);
	return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
}
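
/*
 * Worked example (illustrative only, assuming VTTBR_VMID_SHIFT == 48 and
 * ignoring any bit shuffling done by phys_to_ttbr() for 52-bit PAs): with
 * VMID 0x2a and pgd_phys 0x812340000, the result is 0x002a000812340000,
 * with VTTBR_CNP_BIT (bit 0) set if CnP is supported.
 */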

/*
 * Must be called from hyp code running at EL2 with an updated VTTBR
 * and interrupts disabled.
 */
static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
					  struct kvm_arch *arch)
{
	write_sysreg(mmu->vtcr, vtcr_el2);
	write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);

	/*
	 * ARM errata 1165522 and 1530923 require the actual execution of the
	 * above before we can switch to the EL1/EL0 translation regime used by
	 * the guest.
	 */
	asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
}

static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
{
	return container_of(mmu->arch, struct kvm, arch);
}

static inline u64 get_vmid(u64 vttbr)
{
	return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >>
		VTTBR_VMID_SHIFT;
}

static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
{
	return !(mmu->tlb_vttbr & VTTBR_CNP_BIT);
}

static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	/*
	 * Be careful, mmu may not be fully initialised so do not look at
	 * *any* of its fields.
	 */
	return &kvm->arch.mmu != mmu;
}

static inline void kvm_fault_lock(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		write_lock(&kvm->mmu_lock);
	else
		read_lock(&kvm->mmu_lock);
}

static inline void kvm_fault_unlock(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		write_unlock(&kvm->mmu_lock);
	else
		read_unlock(&kvm->mmu_lock);
}
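
/*
 * Usage sketch (illustrative only, not part of the original file; the
 * handler name is hypothetical): the stage-2 fault path brackets its
 * page-table work with these helpers so that protected-mode KVM takes
 * the mmu_lock exclusively:
 *
 *	kvm_fault_lock(kvm);
 *	ret = handle_stage2_fault(vcpu, fault_ipa);
 *	kvm_fault_unlock(kvm);
 */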

/*
 * ARM64 KVM relies on a simple conversion from physaddr to a kernel
 * virtual address (KVA) when it does cache maintenance as the CMO
 * instructions work on virtual addresses. This is incompatible with
 * VM_PFNMAP VMAs which may not have a kernel direct mapping to a
 * virtual address.
 *
 * With S2FWB and CACHE DIC features, KVM need not do cache flushing
 * and CMOs are NOP'd. This has the effect of no longer requiring a
 * KVA for addresses mapped into the S2. The presence of these features
 * is thus necessary to support cacheable S2 mapping of VM_PFNMAP.
 */
static inline bool kvm_supports_cacheable_pfnmap(void)
{
	return cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) &&
	       cpus_have_final_cap(ARM64_HAS_CACHE_DIC);
}
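
/*
 * Usage sketch (illustrative only, not part of the original file; the
 * predicate names are hypothetical): a fault handler would reject a
 * cacheable mapping of a VM_PFNMAP region when the required features
 * are missing:
 *
 *	if (is_vma_cacheable && is_pfnmap && !kvm_supports_cacheable_pfnmap())
 *		return -EINVAL;
 */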

#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
#else
static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */

#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */