// SPDX-License-Identifier: GPL-2.0-only
/*
 * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as
 * detailed at:
 *
 *   https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
 *
 * This code was originally written hastily under an awful lot of stress and so
 * aspects of it are somewhat hacky. Unfortunately, changing anything in here
 * instantly makes me feel ill. Thanks, Jann. Thann.
 *
 * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
 * Copyright (C) 2020 Google LLC
 *
 * "If there's something strange in your neighbourhood, who you gonna call?"
 *
 * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
 */
#include <linux/arm-smccc.h>
#include <linux/bpf.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/task_stack.h>

#include <asm/debug-monitors.h>
#include <asm/insn.h>
#include <asm/spectre.h>
#include <asm/traps.h>
#include <asm/vectors.h>
#include <asm/virt.h>
/*
 * We try to ensure that the mitigation state can never change as the result of
 * onlining a late CPU.
 */
static void update_mitigation_state(enum mitigation_state *oldp,
				    enum mitigation_state new)
{
	enum mitigation_state state;

	do {
		state = READ_ONCE(*oldp);
		if (new <= state)
			break;

		/* Userspace almost certainly can't deal with this. */
		if (WARN_ON(system_capabilities_finalized()))
			break;
	} while (cmpxchg_relaxed(oldp, state, new) != state);
}
/*
 * Spectre v1.
 *
 * The kernel can't protect userspace for this one: it's each person for
 * themselves. Advertise what we're doing and be done with it.
 */
ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
}
/*
 * Spectre v2.
 *
 * This one sucks. A CPU is either:
 *
 * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
 * - Mitigated in hardware and listed in our "safe list".
 * - Mitigated in software by firmware.
 * - Mitigated in software by a CPU-specific dance in the kernel and a
 *   firmware call at EL2.
 * - Vulnerable.
 *
 * It's not unlikely for different CPUs in a big.LITTLE system to fall into
 * different camps.
 */
static enum mitigation_state spectre_v2_state;

static bool __read_mostly __nospectre_v2;
static int __init parse_spectre_v2_param(char *str)
{
	__nospectre_v2 = true;
	return 0;
}
early_param("nospectre_v2", parse_spectre_v2_param);

static bool spectre_v2_mitigations_off(void)
{
	bool ret = __nospectre_v2 || cpu_mitigations_off();

	if (ret)
		pr_info_once("spectre-v2 mitigation disabled by command line option\n");

	return ret;
}
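/*
 * Build the Spectre-BHB suffix that cpu_show_spectre_v2() appends to the
 * Spectre-v2 string reported through sysfs.
 */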
static const char *get_bhb_affected_string(enum mitigation_state bhb_state)
{
	switch (bhb_state) {
	case SPECTRE_UNAFFECTED:
		return "";
	default:
	case SPECTRE_VULNERABLE:
		return ", but not BHB";
	case SPECTRE_MITIGATED:
		return ", BHB";
	}
}

static bool _unprivileged_ebpf_enabled(void)
{
#ifdef CONFIG_BPF_SYSCALL
	return !sysctl_unprivileged_bpf_disabled;
#else
	return false;
#endif
}
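/* Reported via /sys/devices/system/cpu/vulnerabilities/spectre_v2. */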
ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	enum mitigation_state bhb_state = arm64_get_spectre_bhb_state();
	const char *bhb_str = get_bhb_affected_string(bhb_state);
	const char *v2_str = "Branch predictor hardening";

	switch (spectre_v2_state) {
	case SPECTRE_UNAFFECTED:
		if (bhb_state == SPECTRE_UNAFFECTED)
			return sprintf(buf, "Not affected\n");

		/*
		 * Platforms affected by Spectre-BHB can't report
		 * "Not affected" for Spectre-v2.
		 */
		v2_str = "CSV2";
		fallthrough;
	case SPECTRE_MITIGATED:
		if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled())
			return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n");

		return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str);
	case SPECTRE_VULNERABLE:
		fallthrough;
	default:
		return sprintf(buf, "Vulnerable\n");
	}
}
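/*
 * A CPU is considered unaffected by Spectre-v2 in hardware if it either
 * advertises CSV2 in ID_AA64PFR0_EL1 or appears on the safe list below.
 */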
static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
{
	u64 pfr0;
	static const struct midr_range spectre_v2_safe_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
		MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
		MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
		{ /* sentinel */ }
	};

	/* If the CPU has CSV2 set, we're safe */
	pfr0 = read_cpuid(ID_AA64PFR0_EL1);
	if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT))
		return SPECTRE_UNAFFECTED;

	/* Alternatively, we have a list of unaffected CPUs */
	if (is_midr_in_range_list(spectre_v2_safe_list))
		return SPECTRE_UNAFFECTED;

	return SPECTRE_VULNERABLE;
}
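/*
 * Ask firmware, via the SMCCC ARCH_FEATURES call, whether it implements
 * ARCH_WORKAROUND_1 for this CPU.
 */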
static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
{
	int ret;
	struct arm_smccc_res res;

	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
			     ARM_SMCCC_ARCH_WORKAROUND_1, &res);

	ret = res.a0;
	switch (ret) {
	case SMCCC_RET_SUCCESS:
		return SPECTRE_MITIGATED;
	case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
		return SPECTRE_UNAFFECTED;
	default:
		fallthrough;
	case SMCCC_RET_NOT_SUPPORTED:
		return SPECTRE_VULNERABLE;
	}
}
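/*
 * cpucaps "matches" callback: the CPU is treated as affected unless either
 * the hardware or the firmware probe above reports it unaffected.
 */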
bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
{
	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());

	if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED)
		return false;

	if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED)
		return false;

	return true;
}

enum mitigation_state arm64_get_spectre_v2_state(void)
{
	return spectre_v2_state;
}

DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static void install_bp_hardening_cb(bp_hardening_cb_t fn)
{
	__this_cpu_write(bp_hardening_data.fn, fn);

	/*
	 * Vinz Clortho takes the hyp_vecs start/end "keys" at
	 * the door when we're a guest. Skip the hyp-vectors work.
	 */
	if (!is_hyp_mode_available())
		return;

	__this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT);
}
/* Called during entry so must be noinstr */
static noinstr void call_smc_arch_workaround_1(void)
{
	arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
/* Called during entry so must be noinstr */
static noinstr void call_hvc_arch_workaround_1(void)
{
	arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
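/*
 * Falkor-specific Spectre-v2 workaround: stash the link register in a
 * temporary, execute sixteen branch-and-link instructions that each target
 * the following instruction so the return-address predictor is overwritten
 * with harmless entries, then restore x30.
 */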
/* Called during entry so must be noinstr */
static noinstr void qcom_link_stack_sanitisation(void)
{
	u64 tmp;

	asm volatile("mov	%0, x30		\n"
		     ".rept	16		\n"
		     "bl	. + 4		\n"
		     ".endr			\n"
		     "mov	x30, %0		\n"
		     : "=&r" (tmp));
}
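/*
 * Return the CPU-specific Spectre-v2 callback (currently only the Falkor
 * link-stack sequence), or NULL if the firmware call alone is sufficient.
 */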
static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void)
{
	u32 midr = read_cpuid_id();
	if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) &&
	    ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1))
		return NULL;

	return qcom_link_stack_sanitisation;
}
static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
{
	bp_hardening_cb_t cb;
	enum mitigation_state state;

	state = spectre_v2_get_cpu_fw_mitigation_state();
	if (state != SPECTRE_MITIGATED)
		return state;

	if (spectre_v2_mitigations_off())
		return SPECTRE_VULNERABLE;

	switch (arm_smccc_1_1_get_conduit()) {
	case SMCCC_CONDUIT_HVC:
		cb = call_hvc_arch_workaround_1;
		break;

	case SMCCC_CONDUIT_SMC:
		cb = call_smc_arch_workaround_1;
		break;

	default:
		return SPECTRE_VULNERABLE;
	}

	/*
	 * Prefer a CPU-specific workaround if it exists. Note that we
	 * still rely on firmware for the mitigation at EL2.
	 */
	cb = spectre_v2_get_sw_mitigation_cb() ?: cb;
	install_bp_hardening_cb(cb);
	return SPECTRE_MITIGATED;
}

void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
	enum mitigation_state state;

	WARN_ON(preemptible());

	state = spectre_v2_get_cpu_hw_mitigation_state();
	if (state == SPECTRE_VULNERABLE)
		state = spectre_v2_enable_fw_mitigation();

	update_mitigation_state(&spectre_v2_state, state);
}
/*
 * Spectre-v3a.
 *
 * Phew, there's not an awful lot to do here! We just instruct EL2 to use
 * an indirect trampoline for the hyp vectors so that guests can't read
 * VBAR_EL2 to defeat randomisation of the hypervisor VA layout.
 */
bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope)
{
	static const struct midr_range spectre_v3a_unsafe_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
		{},
	};

	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
	return is_midr_in_range_list(spectre_v3a_unsafe_list);
}

void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
	struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);

	if (this_cpu_has_cap(ARM64_SPECTRE_V3A))
		data->slot += HYP_VECTOR_INDIRECT;
}
/*
 * Spectre v4.
 *
 * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
 * either:
 *
 * - Mitigated in hardware and listed in our "safe list".
 * - Mitigated in hardware via PSTATE.SSBS.
 * - Mitigated in software by firmware (sometimes referred to as SSBD).
 *
 * Wait, that doesn't sound so bad, does it? Keep reading...
 *
 * A major source of headaches is that the software mitigation is enabled both
 * on a per-task basis, but can also be forced on for the kernel, necessitating
 * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
 * allow EL0 to toggle SSBS directly, which can end up with the prctl() state
 * being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
 * so you can have systems that have both firmware and SSBS mitigations. This
 * means we actually have to reject late onlining of CPUs with mitigations if
 * all of the currently onlined CPUs are safelisted, as the mitigation tends to
 * be opt-in for userspace. Yes, really, the cure is worse than the disease.
 *
 * The only good part is that if the firmware mitigation is present, then it is
 * present for all CPUs, meaning we don't have to worry about late onlining of a
 * vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
 *
 * Give me a VAX-11/780 any day of the week...
 */
static enum mitigation_state spectre_v4_state;

/* This is the per-cpu state tracking whether we need to talk to firmware */
DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);

enum spectre_v4_policy {
	SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
	SPECTRE_V4_POLICY_MITIGATION_ENABLED,
	SPECTRE_V4_POLICY_MITIGATION_DISABLED,
};

static enum spectre_v4_policy __read_mostly __spectre_v4_policy;

static const struct spectre_v4_param {
	const char *str;
	enum spectre_v4_policy policy;
} spectre_v4_params[] = {
	{ "force-on",	SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
	{ "force-off",	SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
	{ "kernel",	SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
};
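/*
 * Parse the "ssbd=" command-line option: "ssbd=force-on", "ssbd=force-off",
 * or "ssbd=kernel" for the dynamic, per-task behaviour.
 */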
static int __init parse_spectre_v4_param(char *str)
{
	int i;

	if (!str || !str[0])
		return -EINVAL;

	for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) {
		const struct spectre_v4_param *param = &spectre_v4_params[i];

		if (strncmp(str, param->str, strlen(param->str)))
			continue;

		__spectre_v4_policy = param->policy;
		return 0;
	}

	return -EINVAL;
}
early_param("ssbd", parse_spectre_v4_param);
/*
 * Because this was all written in a rush by people working in different silos,
 * we've ended up with multiple command line options to control the same thing.
 * Wrap these up in some helpers, which prefer disabling the mitigation if faced
 * with contradictory parameters. The mitigation is always either "off",
 * "dynamic" or "on".
 */
static bool spectre_v4_mitigations_off(void)
{
	bool ret = cpu_mitigations_off() ||
		   __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;

	if (ret)
		pr_info_once("spectre-v4 mitigation disabled by command-line option\n");

	return ret;
}

/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
static bool spectre_v4_mitigations_dynamic(void)
{
	return !spectre_v4_mitigations_off() &&
	       __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
}

static bool spectre_v4_mitigations_on(void)
{
	return !spectre_v4_mitigations_off() &&
	       __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
}
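/* Reported via /sys/devices/system/cpu/vulnerabilities/spec_store_bypass. */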
ssize_t cpu_show_spec_store_bypass(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	switch (spectre_v4_state) {
	case SPECTRE_UNAFFECTED:
		return sprintf(buf, "Not affected\n");
	case SPECTRE_MITIGATED:
		return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
	case SPECTRE_VULNERABLE:
		fallthrough;
	default:
		return sprintf(buf, "Vulnerable\n");
	}
}

enum mitigation_state arm64_get_spectre_v4_state(void)
{
	return spectre_v4_state;
}
static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
{
	static const struct midr_range spectre_v4_safe_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
		MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
		{ /* sentinel */ },
	};

	if (is_midr_in_range_list(spectre_v4_safe_list))
		return SPECTRE_UNAFFECTED;

	/* CPU features are detected first */
	if (this_cpu_has_cap(ARM64_SSBS))
		return SPECTRE_MITIGATED;

	return SPECTRE_VULNERABLE;
}

static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
{
	int ret;
	struct arm_smccc_res res;

	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
			     ARM_SMCCC_ARCH_WORKAROUND_2, &res);

	ret = res.a0;
	switch (ret) {
	case SMCCC_RET_SUCCESS:
		return SPECTRE_MITIGATED;
	case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
		fallthrough;
	case SMCCC_RET_NOT_REQUIRED:
		return SPECTRE_UNAFFECTED;
	default:
		fallthrough;
	case SMCCC_RET_NOT_SUPPORTED:
		return SPECTRE_VULNERABLE;
	}
}

bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
{
	enum mitigation_state state;

	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());

	state = spectre_v4_get_cpu_hw_mitigation_state();
	if (state == SPECTRE_VULNERABLE)
		state = spectre_v4_get_cpu_fw_mitigation_state();

	return state != SPECTRE_UNAFFECTED;
}
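/*
 * Handle the EL1 UNDEF raised by "MSR SSBS, #imm" on CPUs that implement
 * PSTATE.SSBS but not the MSR (immediate) encoding: update the saved PSTATE
 * directly and skip the faulting instruction.
 */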
bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr)
{
	const u32 instr_mask = ~(1U << PSTATE_Imm_shift);
	const u32 instr_val = 0xd500401f | PSTATE_SSBS;

	if ((instr & instr_mask) != instr_val)
		return false;

	if (instr & BIT(PSTATE_Imm_shift))
		regs->pstate |= PSR_SSBS_BIT;
	else
		regs->pstate &= ~PSR_SSBS_BIT;

	arm64_skip_faulting_instruction(regs, 4);
	return true;
}
static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
{
	enum mitigation_state state;

	/*
	 * If the system is mitigated but this CPU doesn't have SSBS, then
	 * we must be on the safelist and there's nothing more to do.
	 */
	state = spectre_v4_get_cpu_hw_mitigation_state();
	if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
		return state;

	if (spectre_v4_mitigations_off()) {
		sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
		set_pstate_ssbs(1);
		return SPECTRE_VULNERABLE;
	}

	/* SCTLR_EL1.DSSBS was initialised to 0 during boot */
	set_pstate_ssbs(0);

	/*
	 * SSBS is self-synchronizing and is intended to affect subsequent
	 * speculative instructions, but some CPUs can speculate with a stale
	 * value of SSBS.
	 *
	 * Mitigate this with an unconditional speculation barrier, as CPUs
	 * could mis-speculate branches and bypass a conditional barrier.
	 */
	if (IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386))
		spec_bar();

	return SPECTRE_MITIGATED;
}
/*
 * Patch a branch over the Spectre-v4 mitigation code with a NOP so that
 * we fallthrough and check whether firmware needs to be called on this CPU.
 */
void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
						  __le32 *origptr,
						  __le32 *updptr, int nr_inst)
{
	BUG_ON(nr_inst != 1); /* Branch -> NOP */

	if (spectre_v4_mitigations_off())
		return;

	if (cpus_have_cap(ARM64_SSBS))
		return;

	if (spectre_v4_mitigations_dynamic())
		*updptr = cpu_to_le32(aarch64_insn_gen_nop());
}

/*
 * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
 * to call into firmware to adjust the mitigation state.
 */
void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt,
					      __le32 *origptr,
					      __le32 *updptr, int nr_inst)
{
	u32 insn;

	BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */

	switch (arm_smccc_1_1_get_conduit()) {
	case SMCCC_CONDUIT_HVC:
		insn = aarch64_insn_get_hvc_value();
		break;
	case SMCCC_CONDUIT_SMC:
		insn = aarch64_insn_get_smc_value();
		break;
	default:
		return;
	}

	*updptr = cpu_to_le32(insn);
}
static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
{
	enum mitigation_state state;

	state = spectre_v4_get_cpu_fw_mitigation_state();
	if (state != SPECTRE_MITIGATED)
		return state;

	if (spectre_v4_mitigations_off()) {
		arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
		return SPECTRE_VULNERABLE;
	}

	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);

	if (spectre_v4_mitigations_dynamic())
		__this_cpu_write(arm64_ssbd_callback_required, 1);

	return SPECTRE_MITIGATED;
}
void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
	enum mitigation_state state;

	WARN_ON(preemptible());

	state = spectre_v4_enable_hw_mitigation();
	if (state == SPECTRE_VULNERABLE)
		state = spectre_v4_enable_fw_mitigation();

	update_mitigation_state(&spectre_v4_state, state);
}
static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
{
	u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;

	if (state)
		regs->pstate |= bit;
	else
		regs->pstate &= ~bit;
}

void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
{
	struct pt_regs *regs = task_pt_regs(tsk);
	bool ssbs = false, kthread = tsk->flags & PF_KTHREAD;

	if (spectre_v4_mitigations_off())
		ssbs = true;
	else if (spectre_v4_mitigations_dynamic() && !kthread)
		ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);

	__update_pstate_ssbs(regs, ssbs);
}
/*
 * The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
 * This is interesting because the "speculation disabled" behaviour can be
 * configured so that it is preserved across exec(), which means that the
 * prctl() may be necessary even when PSTATE.SSBS can be toggled directly
 * from userspace.
 */
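/*
 * For example, a task opts in to the mitigation with:
 *
 *	prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
 *	      PR_SPEC_DISABLE, 0, 0);
 *
 * and can query the current state with PR_GET_SPECULATION_CTRL.
 */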
static void ssbd_prctl_enable_mitigation(struct task_struct *task)
{
	task_clear_spec_ssb_noexec(task);
	task_set_spec_ssb_disable(task);
	set_tsk_thread_flag(task, TIF_SSBD);
}

static void ssbd_prctl_disable_mitigation(struct task_struct *task)
{
	task_clear_spec_ssb_noexec(task);
	task_clear_spec_ssb_disable(task);
	clear_tsk_thread_flag(task, TIF_SSBD);
}
static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
{
	switch (ctrl) {
	case PR_SPEC_ENABLE:
		/* Enable speculation: disable mitigation */
		/*
		 * Force disabled speculation prevents it from being
		 * re-enabled.
		 */
		if (task_spec_ssb_force_disable(task))
			return -EPERM;

		/*
		 * If the mitigation is forced on, then speculation is forced
		 * off and we again prevent it from being re-enabled.
		 */
		if (spectre_v4_mitigations_on())
			return -EPERM;

		ssbd_prctl_disable_mitigation(task);
		break;
	case PR_SPEC_FORCE_DISABLE:
		/* Force disable speculation: force enable mitigation */
		/*
		 * If the mitigation is forced off, then speculation is forced
		 * on and we prevent it from being disabled.
		 */
		if (spectre_v4_mitigations_off())
			return -EPERM;

		task_set_spec_ssb_force_disable(task);
		fallthrough;
	case PR_SPEC_DISABLE:
		/* Disable speculation: enable mitigation */
		/* Same as PR_SPEC_FORCE_DISABLE */
		if (spectre_v4_mitigations_off())
			return -EPERM;

		ssbd_prctl_enable_mitigation(task);
		break;
	case PR_SPEC_DISABLE_NOEXEC:
		/* Disable speculation until execve(): enable mitigation */
		/*
		 * If the mitigation state is forced one way or the other, then
		 * we must fail now before we try to toggle it on execve().
		 */
		if (task_spec_ssb_force_disable(task) ||
		    spectre_v4_mitigations_off() ||
		    spectre_v4_mitigations_on()) {
			return -EPERM;
		}

		ssbd_prctl_enable_mitigation(task);
		task_set_spec_ssb_noexec(task);
		break;
	default:
		return -ERANGE;
	}

	spectre_v4_enable_task_mitigation(task);
	return 0;
}

int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
			     unsigned long ctrl)
{
	switch (which) {
	case PR_SPEC_STORE_BYPASS:
		return ssbd_prctl_set(task, ctrl);
	default:
		return -ENODEV;
	}
}
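/*
 * Map the system-wide mitigation state and the task's flags onto the
 * PR_SPEC_* values returned by prctl(PR_GET_SPECULATION_CTRL, ...).
 */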
static int ssbd_prctl_get(struct task_struct *task)
{
	switch (spectre_v4_state) {
	case SPECTRE_UNAFFECTED:
		return PR_SPEC_NOT_AFFECTED;
	case SPECTRE_MITIGATED:
		if (spectre_v4_mitigations_on())
			return PR_SPEC_NOT_AFFECTED;

		if (spectre_v4_mitigations_dynamic())
			break;

		/* Mitigations are disabled, so we're vulnerable. */
		fallthrough;
	case SPECTRE_VULNERABLE:
		fallthrough;
	default:
		return PR_SPEC_ENABLE;
	}

	/* Check the mitigation state for this task */
	if (task_spec_ssb_force_disable(task))
		return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;

	if (task_spec_ssb_noexec(task))
		return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;

	if (task_spec_ssb_disable(task))
		return PR_SPEC_PRCTL | PR_SPEC_DISABLE;

	return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
}

int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
	switch (which) {
	case PR_SPEC_STORE_BYPASS:
		return ssbd_prctl_get(task);
	default:
		return -ENODEV;
	}
}
/*
 * Spectre BHB.
 *
 * A CPU is either:
 * - Mitigated by a branchy loop a CPU specific number of times, and listed
 *   in our "loop mitigated list".
 * - Mitigated in software by the firmware Spectre v2 call.
 * - Has the ClearBHB instruction to perform the mitigation.
 * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no
 *   software mitigation in the vectors is needed.
 * - Has CSV2.3, so is unaffected.
 */
static enum mitigation_state spectre_bhb_state;

enum mitigation_state arm64_get_spectre_bhb_state(void)
{
	return spectre_bhb_state;
}
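/*
 * Bits in system_bhb_mitigations recording which of the Spectre-BHB
 * mitigation options are in use somewhere in the system.
 */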
enum bhb_mitigation_bits {
|
|
|
|
BHB_LOOP,
|
|
|
|
BHB_FW,
|
|
|
|
BHB_HW,
|
2021-12-10 14:32:56 +00:00
|
|
|
BHB_INSN,
|
};
static unsigned long system_bhb_mitigations;

/*
 * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any
 * SCOPE_SYSTEM call will give the right answer.
 */
static bool is_spectre_bhb_safe(int scope)
{
	static const struct midr_range spectre_bhb_safe_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A510),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A520),
		MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
		{},
	};
	static bool all_safe = true;

	if (scope != SCOPE_LOCAL_CPU)
		return all_safe;

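	/*
	 * all_safe latches to false once any CPU outside the safe list has
	 * been seen, so later SCOPE_SYSTEM queries report the least-safe CPU.
	 */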
	if (is_midr_in_range_list(spectre_bhb_safe_list))
		return true;

	all_safe = false;

	return false;
}

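/*
 * Upper bound on the number of branches the loop workaround must execute to
 * overwrite the branch history on this CPU; 0 means no loop count is known
 * for it (this reading is inferred from the 'k' lists below).
 */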
static u8 spectre_bhb_loop_affected(void)
{
	u8 k = 0;

	static const struct midr_range spectre_bhb_k132_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
		{},
	};
	static const struct midr_range spectre_bhb_k38_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
		{},
	};
	static const struct midr_range spectre_bhb_k32_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
		{},
	};
	static const struct midr_range spectre_bhb_k24_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
		MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
		MIDR_ALL_VERSIONS(MIDR_HISI_HIP09),
		{},
	};
	static const struct midr_range spectre_bhb_k11_list[] = {
		MIDR_ALL_VERSIONS(MIDR_AMPERE1),
		{},
	};
	static const struct midr_range spectre_bhb_k8_list[] = {
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
		MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
		{},
	};

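	/* The k-lists are checked in descending order of loop count. */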
	if (is_midr_in_range_list(spectre_bhb_k132_list))
		k = 132;
	else if (is_midr_in_range_list(spectre_bhb_k38_list))
		k = 38;
2025-03-25 14:22:07 -07:00
|
|
|
else if (is_midr_in_range_list(spectre_bhb_k32_list))
|
2025-01-07 12:05:59 -08:00
|
|
|
k = 32;
|
2025-03-25 14:22:07 -07:00
|
|
|
else if (is_midr_in_range_list(spectre_bhb_k24_list))
|
2025-01-07 12:05:59 -08:00
|
|
|
k = 24;
|
2025-03-25 14:22:07 -07:00
|
|
|
else if (is_midr_in_range_list(spectre_bhb_k11_list))
|
2025-01-07 12:05:59 -08:00
|
|
|
k = 11;
|
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"ARM:
- Nested virtualization support for VGICv3, giving the nested
hypervisor control of the VGIC hardware when running an L2 VM
- Removal of 'late' nested virtualization feature register masking,
making the supported feature set directly visible to userspace
- Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage
of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers
- Paravirtual interface for discovering the set of CPU
implementations where a VM may run, addressing a longstanding issue
of guest CPU errata awareness in big-little systems and
cross-implementation VM migration
- Userspace control of the registers responsible for identifying a
particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1),
allowing VMs to be migrated cross-implementation
- pKVM updates, including support for tracking stage-2 page table
allocations in the protected hypervisor in the 'SecPageTable' stat
- Fixes to vPMU, ensuring that userspace updates to the vPMU after
KVM_RUN are reflected into the backing perf events
LoongArch:
- Remove unnecessary header include path
- Assume constant PGD during VM context switch
- Add perf events support for guest VM
RISC-V:
- Disable the kernel perf counter during configure
- KVM selftests improvements for PMU
- Fix warning at the time of KVM module removal
x86:
- Add support for aging of SPTEs without holding mmu_lock.
Not taking mmu_lock allows multiple aging actions to run in
parallel, and more importantly avoids stalling vCPUs. This includes
an implementation of per-rmap-entry locking; aging the gfn is done
with only a per-rmap single-bin spinlock taken, whereas locking an
rmap for write requires taking both the per-rmap spinlock and the
mmu_lock.
Note that this decreases slightly the accuracy of accessed-page
information, because changes to the SPTE outside aging might not
use atomic operations even if they could race against a clear of
the Accessed bit.
This is deliberate because KVM and mm/ tolerate false
positives/negatives for accessed information, and testing has shown
that reducing the latency of aging is far more beneficial to
overall system performance than providing "perfect" young/old
information.
- Defer runtime CPUID updates until KVM emulates a CPUID instruction,
to coalesce updates when multiple pieces of vCPU state are
changing, e.g. as part of a nested transition
- Fix a variety of nested emulation bugs, and add VMX support for
synthesizing nested VM-Exit on interception (instead of injecting
#UD into L2)
- Drop "support" for async page faults for protected guests that do
not set SEND_ALWAYS (i.e. that only want async page faults at CPL3)
- Bring a bit of sanity to x86's VM teardown code, which has
accumulated a lot of cruft over the years. Particularly, destroy
vCPUs before the MMU, despite the latter being a VM-wide operation
- Add common secure TSC infrastructure for use within SNP and in the
future TDX
- Block KVM_CAP_SYNC_REGS if guest state is protected. It does not
make sense to use the capability if the relevant registers are not
available for reading or writing
- Don't take kvm->lock when iterating over vCPUs in the suspend
notifier to fix a largely theoretical deadlock
- Use the vCPU's actual Xen PV clock information when starting the
Xen timer, as the cached state in arch.hv_clock can be stale/bogus
- Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across
different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as
KVM's suspend notifier only accounts for kvmclock, and there's no
evidence that the flag is actually supported by Xen guests
- Clean up the per-vCPU "cache" of its reference pvclock, and instead
only track the vCPU's TSC scaling (multiplier+shift) metadata (which
is moderately expensive to compute, and rarely changes for modern
setups)
- Don't write to the Xen hypercall page on MSR writes that are
initiated by the host (userspace or KVM) to fix a class of bugs
where KVM can write to guest memory at unexpected times, e.g.
during vCPU creation if userspace has set the Xen hypercall MSR
index to collide with an MSR that KVM emulates
- Restrict the Xen hypercall MSR index to the unofficial synthetic
range to reduce the set of possible collisions with MSRs that are
emulated by KVM (collisions can still happen as KVM emulates
Hyper-V MSRs, which also reside in the synthetic range)
- Clean up and optimize KVM's handling of Xen MSR writes and
xen_hvm_config
- Update Xen TSC leaves during CPUID emulation instead of modifying
the CPUID entries when updating PV clocks; there is no guarantee PV
clocks will be updated between TSC frequency changes and CPUID
emulation, and guest reads of the TSC leaves should be rare, i.e.
are not a hot path
x86 (Intel):
- Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and
thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1
- Pass XFD_ERR as the payload when injecting #NM, as a preparatory
step for upcoming FRED virtualization support
- Decouple the EPT entry RWX protection bit macros from the EPT
Violation bits, both as a general cleanup and in anticipation of
adding support for emulating Mode-Based Execution Control (MBEC)
- Reject KVM_RUN if userspace manages to gain control and stuff
invalid guest state while KVM is in the middle of emulating nested
VM-Enter
- Add a macro to handle KVM's sanity checks on entry/exit VMCS
control pairs in anticipation of adding sanity checks for secondary
exit controls (the primary field is out of bits)
x86 (AMD):
- Ensure the PSP driver is initialized when both the PSP and KVM
modules are built-in (the initcall framework doesn't handle
dependencies)
- Use long-term pins when registering encrypted memory regions, so
that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and
don't lead to excessive fragmentation
- Add macros and helpers for setting GHCB return/error codes
- Add support for Idle HLT interception, which elides interception if
the vCPU has a pending, unmasked virtual IRQ when HLT is executed
- Fix a bug in INVPCID emulation where KVM fails to check for a
non-canonical address
- Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is
invalid, e.g. because the vCPU was "destroyed" via SNP's AP
Creation hypercall
- Reject SNP AP Creation if the requested SEV features for the vCPU
don't match the VM's configured set of features
Selftests:
- Fix again the Intel PMU counters test; add a data load and do
CLFLUSH{OPT} on the data instead of executing code. The theory is
that modern Intel CPUs have learned new code prefetching tricks
that bypass the PMU counters
- Fix a flaw in the Intel PMU counters test where it asserts that an
event is counting correctly without actually knowing what the event
counts on the underlying hardware
- Fix a variety of flaws, bugs, and false failures/passes
in dirty_log_test, and improve its coverage by collecting all dirty
entries on each iteration
- Fix a few minor bugs related to handling of stats FDs
- Add infrastructure to make vCPU and VM stats FDs available to tests
by default (open the FDs during VM/vCPU creation)
- Relax an assertion on the number of HLT exits in the xAPIC IPI test
when running on a CPU that supports AMD's Idle HLT (which elides
interception of HLT if a virtual IRQ is pending and unmasked)"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits)
RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed
RISC-V: KVM: Teardown riscv specific bits after kvm_exit
LoongArch: KVM: Register perf callbacks for guest
LoongArch: KVM: Implement arch-specific functions for guest perf
LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel()
LoongArch: KVM: Remove PGD saving during VM context switch
LoongArch: KVM: Remove unnecessary header include path
KVM: arm64: Tear down vGIC on failed vCPU creation
KVM: arm64: PMU: Reload when resetting
KVM: arm64: PMU: Reload when user modifies registers
KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs
KVM: arm64: PMU: Assume PMU presence in pmu-emul.c
KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR}
KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu
KVM: arm64: Factor out pKVM hyp vcpu creation to separate function
KVM: arm64: Initialize HCRX_EL2 traps in pKVM
KVM: arm64: Factor out setting HCRX_EL2 traps into separate function
KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected
KVM: x86: Add infrastructure for secure TSC
KVM: x86: Push down setting vcpu.arch.user_set_tsc
...
2025-03-25 14:22:07 -07:00
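The per-rmap-entry locking summarised in the pull request above can be illustrated with a small, self-contained sketch: one bit of the rmap head word acts as a spinlock, so the aging path only needs that bit, while a writer would additionally hold the VM-wide mmu_lock. This is a hypothetical C11 illustration (the rmap_head/rmap_lock/rmap_unlock names are invented here), not KVM's implementation:

#include <stdatomic.h>
#include <stdint.h>

/*
 * Hypothetical sketch of a per-entry bit-spinlock: bit 0 of the head word
 * is the lock, the remaining bits hold the payload. Aging takes only this
 * bit; writers additionally take the VM-wide mmu_lock (not shown).
 */
#define RMAP_LOCK_BIT	((uintptr_t)1)

struct rmap_head {
	_Atomic uintptr_t val;
};

static uintptr_t rmap_lock(struct rmap_head *head)
{
	uintptr_t old;

	/* Spin until the lock bit is observed clear and we manage to set it. */
	do {
		old = atomic_load_explicit(&head->val, memory_order_relaxed);
		old &= ~RMAP_LOCK_BIT;
	} while (!atomic_compare_exchange_weak_explicit(&head->val, &old,
							old | RMAP_LOCK_BIT,
							memory_order_acquire,
							memory_order_relaxed));

	return old;	/* payload with the lock bit masked off */
}

static void rmap_unlock(struct rmap_head *head, uintptr_t new_val)
{
	/* Publish the (possibly updated) payload and clear the lock bit. */
	atomic_store_explicit(&head->val, new_val & ~RMAP_LOCK_BIT,
			      memory_order_release);
}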
|
|
|
else if (is_midr_in_range_list(spectre_bhb_k8_list))
|
2025-01-07 12:05:59 -08:00
|
|
|
k = 8;
|
arm64: Mitigate spectre style branch history side channels
Speculation attacks against some high-performance processors can
make use of branch history to influence future speculation.
When taking an exception from user-space, a sequence of branches
or a firmware call overwrites or invalidates the branch history.
The sequence of branches is added to the vectors, and should appear
before the first indirect branch. For systems using KPTI the sequence
is added to the kpti trampoline where it has a free register as the exit
from the trampoline is via a 'ret'. For systems not using KPTI, the same
register tricks are used to free up a register in the vectors.
For the firmware call, arch-workaround-3 clobbers 4 registers, so
there is no choice but to save them to the EL1 stack. This only happens
for entry from EL0, so if we take an exception due to the stack access,
it will not become re-entrant.
For KVM, the existing branch-predictor-hardening vectors are used.
When a spectre version of these vectors is in use, the firmware call
is sufficient to mitigate against Spectre-BHB. For the non-spectre
versions, the sequence of branches is added to the indirect vector.
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
2021-11-10 14:48:00 +00:00
|
|
|
|
|
|
|
return k;
|
|
|
|
}
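The value k returned above is the number of taken branches needed to displace the branch history on the affected core, per the commit message above. What follows is a conceptual, AArch64-only sketch of that loop workaround; the real mitigation is an assembly macro patched into the exception vectors, not a C helper, and the function name here is invented:

/*
 * Conceptual sketch only: run k taken branches so that any
 * attacker-controlled branch history is overwritten before the first
 * indirect branch in the exception vector.
 */
static inline void spectre_bhb_clear_history_sketch(unsigned int k)
{
	while (k--) {
		/* each taken branch displaces one branch-history entry */
		asm volatile("b 1f\n1:" ::: "memory");
	}
	/* the real sequence ends with a speculation barrier */
	asm volatile("dsb nsh\n\tisb" ::: "memory");
}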
|
|
|
|
|
|
|
|
static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct arm_smccc_res res;
|
|
|
|
|
|
|
|
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
|
|
|
|
ARM_SMCCC_ARCH_WORKAROUND_3, &res);
|
|
|
|
|
|
|
|
ret = res.a0;
|
|
|
|
switch (ret) {
|
|
|
|
case SMCCC_RET_SUCCESS:
|
|
|
|
return SPECTRE_MITIGATED;
|
|
|
|
case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
|
|
|
|
return SPECTRE_UNAFFECTED;
|
|
|
|
default:
|
|
|
|
fallthrough;
|
|
|
|
case SMCCC_RET_NOT_SUPPORTED:
|
|
|
|
return SPECTRE_VULNERABLE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-01-07 12:05:59 -08:00
|
|
|
static bool has_spectre_bhb_fw_mitigation(void)
|
2021-11-10 14:48:00 +00:00
|
|
|
{
|
|
|
|
enum mitigation_state fw_state;
|
|
|
|
bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE;
|
|
|
|
|
|
|
|
fw_state = spectre_bhb_get_cpu_fw_mitigation_state();
|
2025-01-07 12:05:59 -08:00
|
|
|
return has_smccc && fw_state == SPECTRE_MITIGATED;
|
2021-11-10 14:48:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool supports_ecbhb(int scope)
|
|
|
|
{
|
|
|
|
u64 mmfr1;
|
|
|
|
|
|
|
|
if (scope == SCOPE_LOCAL_CPU)
|
|
|
|
mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1);
|
|
|
|
else
|
|
|
|
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
|
|
|
|
|
|
|
return cpuid_feature_extract_unsigned_field(mmfr1,
|
2022-09-05 23:54:07 +01:00
|
|
|
ID_AA64MMFR1_EL1_ECBHB_SHIFT);
|
2021-11-10 14:48:00 +00:00
|
|
|
}
|
|
|
|
|
2025-01-07 12:05:59 -08:00
|
|
|
static u8 max_bhb_k;
|
|
|
|
|
2021-11-10 14:48:00 +00:00
|
|
|
bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry,
|
|
|
|
int scope)
|
|
|
|
{
|
|
|
|
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
|
|
|
|
|
|
|
|
if (supports_csv2p3(scope))
|
|
|
|
return false;
|
|
|
|
|
2025-01-07 12:05:59 -08:00
|
|
|
if (is_spectre_bhb_safe(scope))
|
|
|
|
return false;
|
2021-11-10 14:48:00 +00:00
|
|
|
|
2025-01-07 12:05:59 -08:00
|
|
|
/*
|
|
|
|
* At this point the core isn't known to be "safe" so we're going to
|
|
|
|
* assume it's vulnerable. We still need to update `max_bhb_k`,
|
|
|
|
* but only if we aren't mitigating with clearbhb.
|
|
|
|
*/
|
|
|
|
if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU))
|
|
|
|
max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected());
|
2021-11-10 14:48:00 +00:00
|
|
|
|
2025-01-07 12:05:59 -08:00
|
|
|
return true;
|
2021-11-10 14:48:00 +00:00
|
|
|
}
|
|
|
|
|
2025-04-29 13:55:17 +01:00
|
|
|
u8 get_spectre_bhb_loop_value(void)
|
|
|
|
{
|
|
|
|
return max_bhb_k;
|
|
|
|
}
|
|
|
|
|
2021-11-10 14:48:00 +00:00
|
|
|
static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
|
|
|
|
{
|
|
|
|
const char *v = arm64_get_bp_hardening_vector(slot);
|
|
|
|
|
|
|
|
__this_cpu_write(this_cpu_vector, v);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When KPTI is in use, the vectors are switched when exiting to
|
|
|
|
* user-space.
|
|
|
|
*/
|
arm64: Avoid cpus_have_const_cap() for ARM64_UNMAP_KERNEL_AT_EL0
In arm64_kernel_unmapped_at_el0() we use cpus_have_const_cap() to check
for ARM64_UNMAP_KERNEL_AT_EL0, but this is only necessary so that
arm64_get_bp_hardening_vector() and this_cpu_set_vectors() can run prior
to alternatives being patched. Otherwise this is not necessary and
alternative_has_cap_*() would be preferable.
For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.
Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.
The ARM64_UNMAP_KERNEL_AT_EL0 cpucap is a system-wide feature that is
detected and patched before any translation tables are created for
userspace. In the window between detecting the ARM64_UNMAP_KERNEL_AT_EL0
cpucap and patching alternatives, most users of
arm64_kernel_unmapped_at_el0() do not need to know that the cpucap has
been detected:
* As KVM is initialized after cpucaps are finalized, no use of
arm64_kernel_unmapped_at_el0() in the KVM code is reachable during
this window.
* The arm64_mm_context_get() function in arch/arm64/mm/context.c is only
called after the SMMU driver is brought up after alternatives have
been patched. Thus this can safely use cpus_have_final_cap() or
alternative_has_cap_*().
Similarly the asids_update_limit() function is called after
alternatives have been patched as an arch_initcall, and this can
safely use cpus_have_final_cap() or alternative_has_cap_*().
Similarly we do not expect an ASID rollover to occur between cpucaps
being detected and patching alternatives. Thus
set_reserved_asid_bits() can safely use cpus_have_final_cap() or
alternative_has_cap_*().
* The __tlbi_user() and __tlbi_user_level() macros are not used during
this window, and only need to invalidate additional entries once
userspace translation tables have been active on a CPU. Thus these can
safely use alternative_has_cap_*().
* The xen_kernel_unmapped_at_usr() function is not used during this
window as it is only used in a late_initcall. Thus this can safely use
cpus_have_final_cap() or alternative_has_cap_*().
* The arm64_get_meltdown_state() function is not used during this
window. It is only used by cpu_show_meltdown() and KVM code, both
of which are only used after cpucaps have been finalized. Thus this
can safely use cpus_have_final_cap() or alternative_has_cap_*().
* The tls_thread_switch() uses arm64_kernel_unmapped_at_el0() as an
optimization to avoid zeroing tpidrro_el0 when KPTI is enabled
and this will be trampled by the KPTI trampoline. It doesn't matter if
this continues to zero the register during the window between
detecting the cpucap and patching alternatives, so this can safely use
alternative_has_cap_*().
* The sdei_arch_get_entry_point() and do_sdei_event() functions aren't
reachable at this time as the SDEI driver is registered later by
acpi_init() -> acpi_ghes_init() -> sdei_init(), where acpi_init is a
subsys_initcall. Thus these can safely use cpus_have_final_cap() or
alternative_has_cap_*().
* The uses under drivers/ aren't reachable at this time as the drivers
are registered later:
- TRBE is registered via module_init()
- SMMUv3 is registered via module_driver()
- SPE is registered via module_init()
* The arm64_get_bp_hardening_vector() and this_cpu_set_vectors()
functions need to run on boot CPUs prior to patching alternatives.
As these are only called during the onlining of a CPU, it's fine to
perform a system_cpucaps bitmap test using cpus_have_cap().
This patch modifies this_cpu_set_vectors() to use cpus_have_cap(), and
replaces all other uses of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. The ARM64_UNMAP_KERNEL_AT_EL0 cpucap is added to
cpucap_is_possible() so that code can be elided entirely when this is
not possible.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2023-10-16 11:24:53 +01:00
|
|
|
if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0))
|
2021-11-10 14:48:00 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
write_sysreg(v, vbar_el1);
|
|
|
|
isb();
|
|
|
|
}
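A minimal before/after sketch of the call-site migration described in the changelog above. The _old/_new suffixes are invented so both forms can sit side by side; the "after" form is roughly what arm64_kernel_unmapped_at_el0() becomes once the cpucap is tested via an alternative-patched branch, while boot-time callers such as this_cpu_set_vectors() use cpus_have_cap() and always test the system_cpucaps bitmap:

#include <asm/alternative-macros.h>
#include <asm/cpufeature.h>

/* Before: bitmap test of system_cpucaps until cpucaps are finalized,
 * an alternative branch afterwards. */
static inline bool arm64_kernel_unmapped_at_el0_old(void)
{
	return cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0);
}

/* After: always an alternative-patched branch, never a bitmap test.
 * Only valid for callers that run after alternatives are applied. */
static inline bool arm64_kernel_unmapped_at_el0_new(void)
{
	return alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0);
}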
|
|
|
|
|
2021-12-09 15:13:24 +00:00
|
|
|
bool __read_mostly __nospectre_bhb;
|
2022-08-26 19:40:50 +08:00
|
|
|
static int __init parse_spectre_bhb_param(char *str)
|
|
|
|
{
|
|
|
|
__nospectre_bhb = true;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
early_param("nospectre_bhb", parse_spectre_bhb_param);
|
|
|
|
|
2021-11-10 14:48:00 +00:00
|
|
|
void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
|
|
|
|
{
|
|
|
|
bp_hardening_cb_t cpu_cb;
|
2025-01-07 12:05:59 -08:00
|
|
|
enum mitigation_state state = SPECTRE_VULNERABLE;
|
2021-11-10 14:48:00 +00:00
|
|
|
struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
|
|
|
|
|
|
|
|
if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) {
|
|
|
|
/* No point mitigating Spectre-BHB alone. */
|
|
|
|
} else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
|
|
|
|
pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
|
2022-08-26 19:40:50 +08:00
|
|
|
} else if (cpu_mitigations_off() || __nospectre_bhb) {
|
2021-11-10 14:48:00 +00:00
|
|
|
pr_info_once("spectre-bhb mitigation disabled by command line option\n");
|
|
|
|
} else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
|
|
|
|
state = SPECTRE_MITIGATED;
|
|
|
|
set_bit(BHB_HW, &system_bhb_mitigations);
|
2021-12-10 14:32:56 +00:00
|
|
|
} else if (supports_clearbhb(SCOPE_LOCAL_CPU)) {
|
|
|
|
/*
|
|
|
|
* Ensure KVM uses the indirect vector which will have ClearBHB
|
|
|
|
* added.
|
|
|
|
*/
|
|
|
|
if (!data->slot)
|
|
|
|
data->slot = HYP_VECTOR_INDIRECT;
|
|
|
|
|
|
|
|
this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN);
|
|
|
|
state = SPECTRE_MITIGATED;
|
|
|
|
set_bit(BHB_INSN, &system_bhb_mitigations);
|
2025-01-07 12:05:59 -08:00
|
|
|
} else if (spectre_bhb_loop_affected()) {
|
2021-11-10 14:48:00 +00:00
|
|
|
/*
|
|
|
|
* Ensure KVM uses the indirect vector which will have the
|
|
|
|
* branchy-loop added. A57/A72-r0 will already have selected
|
|
|
|
* the spectre-indirect vector, which is sufficient for BHB
|
|
|
|
* too.
|
|
|
|
*/
|
|
|
|
if (!data->slot)
|
|
|
|
data->slot = HYP_VECTOR_INDIRECT;
|
|
|
|
|
|
|
|
this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP);
|
|
|
|
state = SPECTRE_MITIGATED;
|
|
|
|
set_bit(BHB_LOOP, &system_bhb_mitigations);
|
2025-01-07 12:05:59 -08:00
|
|
|
} else if (has_spectre_bhb_fw_mitigation()) {
|
|
|
|
/*
|
|
|
|
* Ensure KVM uses one of the spectre bp_hardening
|
|
|
|
* vectors. The indirect vector doesn't include the EL3
|
|
|
|
* call, so needs upgrading to
|
|
|
|
* HYP_VECTOR_SPECTRE_INDIRECT.
|
|
|
|
*/
|
|
|
|
if (!data->slot || data->slot == HYP_VECTOR_INDIRECT)
|
|
|
|
data->slot += 1;
|
|
|
|
|
|
|
|
this_cpu_set_vectors(EL1_VECTOR_BHB_FW);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The WA3 call in the vectors supersedes the WA1 call
|
|
|
|
* made during context-switch. Uninstall any firmware
|
|
|
|
* bp_hardening callback.
|
|
|
|
*/
|
|
|
|
cpu_cb = spectre_v2_get_sw_mitigation_cb();
|
|
|
|
if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb)
|
|
|
|
__this_cpu_write(bp_hardening_data.fn, NULL);
|
|
|
|
|
|
|
|
state = SPECTRE_MITIGATED;
|
|
|
|
set_bit(BHB_FW, &system_bhb_mitigations);
|
2021-11-10 14:48:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
update_mitigation_state(&spectre_bhb_state, state);
|
|
|
|
}
|
|
|
|
|
2024-08-19 14:15:53 +01:00
|
|
|
bool is_spectre_bhb_fw_mitigated(void)
|
|
|
|
{
|
|
|
|
return test_bit(BHB_FW, &system_bhb_mitigations);
|
|
|
|
}
|
|
|
|
|
2021-11-18 13:59:46 +00:00
|
|
|
/* Patched to NOP when enabled */
|
|
|
|
void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt,
|
|
|
|
__le32 *origptr,
|
|
|
|
__le32 *updptr, int nr_inst)
|
|
|
|
{
|
|
|
|
BUG_ON(nr_inst != 1);
|
2021-11-10 14:48:00 +00:00
|
|
|
|
|
|
|
if (test_bit(BHB_LOOP, &system_bhb_mitigations))
|
|
|
|
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
|
2021-11-18 13:59:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Patched to NOP when enabled */
|
|
|
|
void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt,
|
|
|
|
__le32 *origptr,
|
|
|
|
__le32 *updptr, int nr_inst)
|
|
|
|
{
|
|
|
|
BUG_ON(nr_inst != 1);
|
2021-11-10 14:48:00 +00:00
|
|
|
|
|
|
|
if (test_bit(BHB_FW, &system_bhb_mitigations))
|
|
|
|
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Patched to correct the immediate */
|
|
|
|
void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt,
|
|
|
|
__le32 *origptr, __le32 *updptr, int nr_inst)
|
|
|
|
{
|
|
|
|
u8 rd;
|
|
|
|
u32 insn;
|
|
|
|
|
|
|
|
BUG_ON(nr_inst != 1); /* MOV -> MOV */
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY))
|
|
|
|
return;
|
|
|
|
|
|
|
|
insn = le32_to_cpu(*origptr);
|
|
|
|
rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn);
|
2025-01-07 12:05:59 -08:00
|
|
|
insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0,
|
2021-11-10 14:48:00 +00:00
|
|
|
AARCH64_INSN_VARIANT_64BIT,
|
|
|
|
AARCH64_INSN_MOVEWIDE_ZERO);
|
|
|
|
*updptr++ = cpu_to_le32(insn);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Patched to mov WA3 when supported */
|
|
|
|
void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt,
|
|
|
|
__le32 *origptr, __le32 *updptr, int nr_inst)
|
|
|
|
{
|
|
|
|
u8 rd;
|
|
|
|
u32 insn;
|
|
|
|
|
|
|
|
BUG_ON(nr_inst != 1); /* MOV -> MOV */
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) ||
|
|
|
|
!test_bit(BHB_FW, &system_bhb_mitigations))
|
|
|
|
return;
|
|
|
|
|
|
|
|
insn = le32_to_cpu(*origptr);
|
|
|
|
rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn);
|
|
|
|
|
|
|
|
insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR,
|
|
|
|
AARCH64_INSN_VARIANT_32BIT,
|
|
|
|
AARCH64_INSN_REG_ZR, rd,
|
|
|
|
ARM_SMCCC_ARCH_WORKAROUND_3);
|
|
|
|
if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT))
|
|
|
|
return;
|
|
|
|
|
|
|
|
*updptr++ = cpu_to_le32(insn);
|
2021-11-18 13:59:46 +00:00
|
|
|
}
|
2021-12-10 14:32:56 +00:00
|
|
|
|
|
|
|
/* Patched to NOP when not supported */
|
|
|
|
void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt,
|
|
|
|
__le32 *origptr, __le32 *updptr, int nr_inst)
|
|
|
|
{
|
|
|
|
BUG_ON(nr_inst != 2);
|
|
|
|
|
|
|
|
if (test_bit(BHB_INSN, &system_bhb_mitigations))
|
|
|
|
return;
|
|
|
|
|
|
|
|
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
|
|
|
|
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
|
|
|
|
}
|
2022-03-03 16:53:56 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_BPF_SYSCALL
|
|
|
|
#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n"
|
|
|
|
void unpriv_ebpf_notify(int new_state)
|
|
|
|
{
|
|
|
|
if (spectre_v2_state == SPECTRE_VULNERABLE ||
|
|
|
|
spectre_bhb_state != SPECTRE_MITIGATED)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!new_state)
|
|
|
|
pr_err("WARNING: %s", EBPF_WARN);
|
|
|
|
}
|
|
|
|
#endif
|