linux/arch/arm64/kernel/proton-pack.c

1201 lines
31 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as
* detailed at:
*
* https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
*
* This code was originally written hastily under an awful lot of stress and so
* aspects of it are somewhat hacky. Unfortunately, changing anything in here
* instantly makes me feel ill. Thanks, Jann. Thann.
*
* Copyright (C) 2018 ARM Ltd, All Rights Reserved.
* Copyright (C) 2020 Google LLC
*
* "If there's something strange in your neighbourhood, who you gonna call?"
*
* Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
*/
#include <linux/arm-smccc.h>
#include <linux/bpf.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/task_stack.h>
#include <asm/debug-monitors.h>
#include <asm/insn.h>
#include <asm/spectre.h>
#include <asm/traps.h>
#include <asm/vectors.h>
#include <asm/virt.h>
/*
* We try to ensure that the mitigation state can never change as the result of
* onlining a late CPU.
*/
static void update_mitigation_state(enum mitigation_state *oldp,
enum mitigation_state new)
{
enum mitigation_state state;
do {
state = READ_ONCE(*oldp);
if (new <= state)
break;
/* Userspace almost certainly can't deal with this. */
if (WARN_ON(system_capabilities_finalized()))
break;
} while (cmpxchg_relaxed(oldp, state, new) != state);
}
/*
* Spectre v1.
*
* The kernel can't protect userspace for this one: it's each person for
* themselves. Advertise what we're doing and be done with it.
*/
ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
}
/*
* Spectre v2.
*
* This one sucks. A CPU is either:
*
* - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
* - Mitigated in hardware and listed in our "safe list".
* - Mitigated in software by firmware.
* - Mitigated in software by a CPU-specific dance in the kernel and a
* firmware call at EL2.
* - Vulnerable.
*
* It's not unlikely for different CPUs in a big.LITTLE system to fall into
* different camps.
*/
static enum mitigation_state spectre_v2_state;
static bool __read_mostly __nospectre_v2;
static int __init parse_spectre_v2_param(char *str)
{
__nospectre_v2 = true;
return 0;
}
early_param("nospectre_v2", parse_spectre_v2_param);
static bool spectre_v2_mitigations_off(void)
{
bool ret = __nospectre_v2 || cpu_mitigations_off();
if (ret)
pr_info_once("spectre-v2 mitigation disabled by command line option\n");
return ret;
}
static const char *get_bhb_affected_string(enum mitigation_state bhb_state)
{
switch (bhb_state) {
case SPECTRE_UNAFFECTED:
return "";
default:
case SPECTRE_VULNERABLE:
return ", but not BHB";
case SPECTRE_MITIGATED:
return ", BHB";
}
}
static bool _unprivileged_ebpf_enabled(void)
{
#ifdef CONFIG_BPF_SYSCALL
return !sysctl_unprivileged_bpf_disabled;
#else
return false;
#endif
}
ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
char *buf)
{
enum mitigation_state bhb_state = arm64_get_spectre_bhb_state();
const char *bhb_str = get_bhb_affected_string(bhb_state);
const char *v2_str = "Branch predictor hardening";
switch (spectre_v2_state) {
case SPECTRE_UNAFFECTED:
if (bhb_state == SPECTRE_UNAFFECTED)
return sprintf(buf, "Not affected\n");
/*
* Platforms affected by Spectre-BHB can't report
* "Not affected" for Spectre-v2.
*/
v2_str = "CSV2";
fallthrough;
case SPECTRE_MITIGATED:
if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled())
return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n");
return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str);
case SPECTRE_VULNERABLE:
fallthrough;
default:
return sprintf(buf, "Vulnerable\n");
}
}
static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
{
u64 pfr0;
static const struct midr_range spectre_v2_safe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
{ /* sentinel */ }
};
/* If the CPU has CSV2 set, we're safe */
pfr0 = read_cpuid(ID_AA64PFR0_EL1);
if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT))
return SPECTRE_UNAFFECTED;
/* Alternatively, we have a list of unaffected CPUs */
if (is_midr_in_range_list(spectre_v2_safe_list))
return SPECTRE_UNAFFECTED;
return SPECTRE_VULNERABLE;
}
static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
{
int ret;
struct arm_smccc_res res;
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_1, &res);
ret = res.a0;
switch (ret) {
case SMCCC_RET_SUCCESS:
return SPECTRE_MITIGATED;
case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
return SPECTRE_UNAFFECTED;
default:
fallthrough;
case SMCCC_RET_NOT_SUPPORTED:
return SPECTRE_VULNERABLE;
}
}
bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
{
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED)
return false;
if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED)
return false;
return true;
}
enum mitigation_state arm64_get_spectre_v2_state(void)
{
return spectre_v2_state;
}
DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
static void install_bp_hardening_cb(bp_hardening_cb_t fn)
{
__this_cpu_write(bp_hardening_data.fn, fn);
/*
* Vinz Clortho takes the hyp_vecs start/end "keys" at
* the door when we're a guest. Skip the hyp-vectors work.
*/
if (!is_hyp_mode_available())
return;
__this_cpu_write(bp_hardening_data.slot, HYP_VECTOR_SPECTRE_DIRECT);
}
arm64: prevent instrumentation of bp hardening callbacks We may call arm64_apply_bp_hardening() early during entry (e.g. in el0_ia()) before it is safe to run instrumented code. Unfortunately this may result in running instrumented code in two cases: * The hardening callbacks called by arm64_apply_bp_hardening() are not marked as `noinstr`, and have been observed to be instrumented when compiled with either GCC or LLVM. * Since arm64_apply_bp_hardening() itself is only marked as `inline` rather than `__always_inline`, it is possible that the compiler decides to place it out-of-line, whereupon it may be instrumented. For example, with defconfig built with clang 13.0.0, call_hvc_arch_workaround_1() is compiled as: | <call_hvc_arch_workaround_1>: | d503233f paciasp | f81f0ffe str x30, [sp, #-16]! | 320183e0 mov w0, #0x80008000 | d503201f nop | d4000002 hvc #0x0 | f84107fe ldr x30, [sp], #16 | d50323bf autiasp | d65f03c0 ret ... but when CONFIG_FTRACE=y and CONFIG_KCOV=y this is compiled as: | <call_hvc_arch_workaround_1>: | d503245f bti c | d503201f nop | d503201f nop | d503233f paciasp | a9bf7bfd stp x29, x30, [sp, #-16]! | 910003fd mov x29, sp | 94000000 bl 0 <__sanitizer_cov_trace_pc> | 320183e0 mov w0, #0x80008000 | d503201f nop | d4000002 hvc #0x0 | a8c17bfd ldp x29, x30, [sp], #16 | d50323bf autiasp | d65f03c0 ret ... with a patchable function entry registered with ftrace, and a direct call to __sanitizer_cov_trace_pc(). Neither of these are safe early during entry sequences. This patch avoids the unsafe instrumentation by marking arm64_apply_bp_hardening() as `__always_inline` and by marking the hardening functions as `noinstr`. This avoids the potential for instrumentation, and causes clang to consistently generate the function as with the defconfig sample. Note: in the defconfig compilation, when CONFIG_SVE=y, x30 is spilled to the stack without being placed in a frame record, which will result in a missing entry if call_hvc_arch_workaround_1() is backtraced. Similar is true of qcom_link_stack_sanitisation(), where inline asm spills the LR to a GPR prior to corrupting it. This is not a significant issue presently as we will only backtrace here if an exception is taken, and in such cases we may omit entries for other reasons today. The relevant hardening functions were introduced in commits: ec82b567a74fbdff ("arm64: Implement branch predictor hardening for Falkor") b092201e00206141 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") ... and these were subsequently moved in commit: d4647f0a2ad71110 ("arm64: Rewrite Spectre-v2 mitigation code") The arm64_apply_bp_hardening() function was introduced in commit: 0f15adbb2861ce6f ("arm64: Add skeleton to harden the branch predictor against aliasing attacks") ... and was subsequently moved and reworked in commit: 6279017e807708a0 ("KVM: arm64: Move BP hardening helpers into spectre.h") Fixes: ec82b567a74fbdff ("arm64: Implement branch predictor hardening for Falkor") Fixes: b092201e00206141 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") Fixes: d4647f0a2ad71110 ("arm64: Rewrite Spectre-v2 mitigation code") Fixes: 0f15adbb2861ce6f ("arm64: Add skeleton to harden the branch predictor against aliasing attacks") Fixes: 6279017e807708a0 ("KVM: arm64: Move BP hardening helpers into spectre.h") Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Marc Zyngier <maz@kernel.org> Cc: Mark Brown <broonie@kernel.org> Cc: Will Deacon <will@kernel.org> Acked-by: Marc Zyngier <maz@kernel.org> Reviewed-by: Mark Brown <broonie@kernel.org> Link: https://lore.kernel.org/r/20220224181028.512873-1-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2022-02-24 18:10:28 +00:00
/* Called during entry so must be noinstr */
static noinstr void call_smc_arch_workaround_1(void)
{
arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
arm64: prevent instrumentation of bp hardening callbacks We may call arm64_apply_bp_hardening() early during entry (e.g. in el0_ia()) before it is safe to run instrumented code. Unfortunately this may result in running instrumented code in two cases: * The hardening callbacks called by arm64_apply_bp_hardening() are not marked as `noinstr`, and have been observed to be instrumented when compiled with either GCC or LLVM. * Since arm64_apply_bp_hardening() itself is only marked as `inline` rather than `__always_inline`, it is possible that the compiler decides to place it out-of-line, whereupon it may be instrumented. For example, with defconfig built with clang 13.0.0, call_hvc_arch_workaround_1() is compiled as: | <call_hvc_arch_workaround_1>: | d503233f paciasp | f81f0ffe str x30, [sp, #-16]! | 320183e0 mov w0, #0x80008000 | d503201f nop | d4000002 hvc #0x0 | f84107fe ldr x30, [sp], #16 | d50323bf autiasp | d65f03c0 ret ... but when CONFIG_FTRACE=y and CONFIG_KCOV=y this is compiled as: | <call_hvc_arch_workaround_1>: | d503245f bti c | d503201f nop | d503201f nop | d503233f paciasp | a9bf7bfd stp x29, x30, [sp, #-16]! | 910003fd mov x29, sp | 94000000 bl 0 <__sanitizer_cov_trace_pc> | 320183e0 mov w0, #0x80008000 | d503201f nop | d4000002 hvc #0x0 | a8c17bfd ldp x29, x30, [sp], #16 | d50323bf autiasp | d65f03c0 ret ... with a patchable function entry registered with ftrace, and a direct call to __sanitizer_cov_trace_pc(). Neither of these are safe early during entry sequences. This patch avoids the unsafe instrumentation by marking arm64_apply_bp_hardening() as `__always_inline` and by marking the hardening functions as `noinstr`. This avoids the potential for instrumentation, and causes clang to consistently generate the function as with the defconfig sample. Note: in the defconfig compilation, when CONFIG_SVE=y, x30 is spilled to the stack without being placed in a frame record, which will result in a missing entry if call_hvc_arch_workaround_1() is backtraced. Similar is true of qcom_link_stack_sanitisation(), where inline asm spills the LR to a GPR prior to corrupting it. This is not a significant issue presently as we will only backtrace here if an exception is taken, and in such cases we may omit entries for other reasons today. The relevant hardening functions were introduced in commits: ec82b567a74fbdff ("arm64: Implement branch predictor hardening for Falkor") b092201e00206141 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") ... and these were subsequently moved in commit: d4647f0a2ad71110 ("arm64: Rewrite Spectre-v2 mitigation code") The arm64_apply_bp_hardening() function was introduced in commit: 0f15adbb2861ce6f ("arm64: Add skeleton to harden the branch predictor against aliasing attacks") ... and was subsequently moved and reworked in commit: 6279017e807708a0 ("KVM: arm64: Move BP hardening helpers into spectre.h") Fixes: ec82b567a74fbdff ("arm64: Implement branch predictor hardening for Falkor") Fixes: b092201e00206141 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") Fixes: d4647f0a2ad71110 ("arm64: Rewrite Spectre-v2 mitigation code") Fixes: 0f15adbb2861ce6f ("arm64: Add skeleton to harden the branch predictor against aliasing attacks") Fixes: 6279017e807708a0 ("KVM: arm64: Move BP hardening helpers into spectre.h") Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Marc Zyngier <maz@kernel.org> Cc: Mark Brown <broonie@kernel.org> Cc: Will Deacon <will@kernel.org> Acked-by: Marc Zyngier <maz@kernel.org> Reviewed-by: Mark Brown <broonie@kernel.org> Link: https://lore.kernel.org/r/20220224181028.512873-1-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2022-02-24 18:10:28 +00:00
/* Called during entry so must be noinstr */
static noinstr void call_hvc_arch_workaround_1(void)
{
arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}
arm64: prevent instrumentation of bp hardening callbacks We may call arm64_apply_bp_hardening() early during entry (e.g. in el0_ia()) before it is safe to run instrumented code. Unfortunately this may result in running instrumented code in two cases: * The hardening callbacks called by arm64_apply_bp_hardening() are not marked as `noinstr`, and have been observed to be instrumented when compiled with either GCC or LLVM. * Since arm64_apply_bp_hardening() itself is only marked as `inline` rather than `__always_inline`, it is possible that the compiler decides to place it out-of-line, whereupon it may be instrumented. For example, with defconfig built with clang 13.0.0, call_hvc_arch_workaround_1() is compiled as: | <call_hvc_arch_workaround_1>: | d503233f paciasp | f81f0ffe str x30, [sp, #-16]! | 320183e0 mov w0, #0x80008000 | d503201f nop | d4000002 hvc #0x0 | f84107fe ldr x30, [sp], #16 | d50323bf autiasp | d65f03c0 ret ... but when CONFIG_FTRACE=y and CONFIG_KCOV=y this is compiled as: | <call_hvc_arch_workaround_1>: | d503245f bti c | d503201f nop | d503201f nop | d503233f paciasp | a9bf7bfd stp x29, x30, [sp, #-16]! | 910003fd mov x29, sp | 94000000 bl 0 <__sanitizer_cov_trace_pc> | 320183e0 mov w0, #0x80008000 | d503201f nop | d4000002 hvc #0x0 | a8c17bfd ldp x29, x30, [sp], #16 | d50323bf autiasp | d65f03c0 ret ... with a patchable function entry registered with ftrace, and a direct call to __sanitizer_cov_trace_pc(). Neither of these are safe early during entry sequences. This patch avoids the unsafe instrumentation by marking arm64_apply_bp_hardening() as `__always_inline` and by marking the hardening functions as `noinstr`. This avoids the potential for instrumentation, and causes clang to consistently generate the function as with the defconfig sample. Note: in the defconfig compilation, when CONFIG_SVE=y, x30 is spilled to the stack without being placed in a frame record, which will result in a missing entry if call_hvc_arch_workaround_1() is backtraced. Similar is true of qcom_link_stack_sanitisation(), where inline asm spills the LR to a GPR prior to corrupting it. This is not a significant issue presently as we will only backtrace here if an exception is taken, and in such cases we may omit entries for other reasons today. The relevant hardening functions were introduced in commits: ec82b567a74fbdff ("arm64: Implement branch predictor hardening for Falkor") b092201e00206141 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") ... and these were subsequently moved in commit: d4647f0a2ad71110 ("arm64: Rewrite Spectre-v2 mitigation code") The arm64_apply_bp_hardening() function was introduced in commit: 0f15adbb2861ce6f ("arm64: Add skeleton to harden the branch predictor against aliasing attacks") ... and was subsequently moved and reworked in commit: 6279017e807708a0 ("KVM: arm64: Move BP hardening helpers into spectre.h") Fixes: ec82b567a74fbdff ("arm64: Implement branch predictor hardening for Falkor") Fixes: b092201e00206141 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") Fixes: d4647f0a2ad71110 ("arm64: Rewrite Spectre-v2 mitigation code") Fixes: 0f15adbb2861ce6f ("arm64: Add skeleton to harden the branch predictor against aliasing attacks") Fixes: 6279017e807708a0 ("KVM: arm64: Move BP hardening helpers into spectre.h") Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Marc Zyngier <maz@kernel.org> Cc: Mark Brown <broonie@kernel.org> Cc: Will Deacon <will@kernel.org> Acked-by: Marc Zyngier <maz@kernel.org> Reviewed-by: Mark Brown <broonie@kernel.org> Link: https://lore.kernel.org/r/20220224181028.512873-1-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2022-02-24 18:10:28 +00:00
/* Called during entry so must be noinstr */
static noinstr void qcom_link_stack_sanitisation(void)
{
u64 tmp;
asm volatile("mov %0, x30 \n"
".rept 16 \n"
"bl . + 4 \n"
".endr \n"
"mov x30, %0 \n"
: "=&r" (tmp));
}
static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void)
{
u32 midr = read_cpuid_id();
if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) &&
((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1))
return NULL;
return qcom_link_stack_sanitisation;
}
static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
{
bp_hardening_cb_t cb;
enum mitigation_state state;
state = spectre_v2_get_cpu_fw_mitigation_state();
if (state != SPECTRE_MITIGATED)
return state;
if (spectre_v2_mitigations_off())
return SPECTRE_VULNERABLE;
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
cb = call_hvc_arch_workaround_1;
break;
case SMCCC_CONDUIT_SMC:
cb = call_smc_arch_workaround_1;
break;
default:
return SPECTRE_VULNERABLE;
}
/*
* Prefer a CPU-specific workaround if it exists. Note that we
* still rely on firmware for the mitigation at EL2.
*/
cb = spectre_v2_get_sw_mitigation_cb() ?: cb;
install_bp_hardening_cb(cb);
return SPECTRE_MITIGATED;
}
void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
enum mitigation_state state;
WARN_ON(preemptible());
state = spectre_v2_get_cpu_hw_mitigation_state();
if (state == SPECTRE_VULNERABLE)
state = spectre_v2_enable_fw_mitigation();
update_mitigation_state(&spectre_v2_state, state);
}
/*
* Spectre-v3a.
*
* Phew, there's not an awful lot to do here! We just instruct EL2 to use
* an indirect trampoline for the hyp vectors so that guests can't read
* VBAR_EL2 to defeat randomisation of the hypervisor VA layout.
*/
bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope)
{
static const struct midr_range spectre_v3a_unsafe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
{},
};
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
return is_midr_in_range_list(spectre_v3a_unsafe_list);
}
void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
if (this_cpu_has_cap(ARM64_SPECTRE_V3A))
data->slot += HYP_VECTOR_INDIRECT;
}
/*
* Spectre v4.
*
* If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
* either:
*
* - Mitigated in hardware and listed in our "safe list".
* - Mitigated in hardware via PSTATE.SSBS.
* - Mitigated in software by firmware (sometimes referred to as SSBD).
*
* Wait, that doesn't sound so bad, does it? Keep reading...
*
* A major source of headaches is that the software mitigation is enabled both
* on a per-task basis, but can also be forced on for the kernel, necessitating
* both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
* allow EL0 to toggle SSBS directly, which can end up with the prctl() state
* being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
* so you can have systems that have both firmware and SSBS mitigations. This
* means we actually have to reject late onlining of CPUs with mitigations if
* all of the currently onlined CPUs are safelisted, as the mitigation tends to
* be opt-in for userspace. Yes, really, the cure is worse than the disease.
*
* The only good part is that if the firmware mitigation is present, then it is
* present for all CPUs, meaning we don't have to worry about late onlining of a
* vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
*
* Give me a VAX-11/780 any day of the week...
*/
static enum mitigation_state spectre_v4_state;
/* This is the per-cpu state tracking whether we need to talk to firmware */
DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
enum spectre_v4_policy {
SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
SPECTRE_V4_POLICY_MITIGATION_ENABLED,
SPECTRE_V4_POLICY_MITIGATION_DISABLED,
};
static enum spectre_v4_policy __read_mostly __spectre_v4_policy;
static const struct spectre_v4_param {
const char *str;
enum spectre_v4_policy policy;
} spectre_v4_params[] = {
{ "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
{ "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
{ "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
};
static int __init parse_spectre_v4_param(char *str)
{
int i;
if (!str || !str[0])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) {
const struct spectre_v4_param *param = &spectre_v4_params[i];
if (strncmp(str, param->str, strlen(param->str)))
continue;
__spectre_v4_policy = param->policy;
return 0;
}
return -EINVAL;
}
early_param("ssbd", parse_spectre_v4_param);
/*
* Because this was all written in a rush by people working in different silos,
* we've ended up with multiple command line options to control the same thing.
* Wrap these up in some helpers, which prefer disabling the mitigation if faced
* with contradictory parameters. The mitigation is always either "off",
* "dynamic" or "on".
*/
static bool spectre_v4_mitigations_off(void)
{
bool ret = cpu_mitigations_off() ||
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
if (ret)
pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
return ret;
}
/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
static bool spectre_v4_mitigations_dynamic(void)
{
return !spectre_v4_mitigations_off() &&
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
}
static bool spectre_v4_mitigations_on(void)
{
return !spectre_v4_mitigations_off() &&
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
}
ssize_t cpu_show_spec_store_bypass(struct device *dev,
struct device_attribute *attr, char *buf)
{
switch (spectre_v4_state) {
case SPECTRE_UNAFFECTED:
return sprintf(buf, "Not affected\n");
case SPECTRE_MITIGATED:
return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
case SPECTRE_VULNERABLE:
fallthrough;
default:
return sprintf(buf, "Vulnerable\n");
}
}
enum mitigation_state arm64_get_spectre_v4_state(void)
{
return spectre_v4_state;
}
static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
{
static const struct midr_range spectre_v4_safe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
{ /* sentinel */ },
};
if (is_midr_in_range_list(spectre_v4_safe_list))
return SPECTRE_UNAFFECTED;
/* CPU features are detected first */
if (this_cpu_has_cap(ARM64_SSBS))
return SPECTRE_MITIGATED;
return SPECTRE_VULNERABLE;
}
static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
{
int ret;
struct arm_smccc_res res;
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_2, &res);
ret = res.a0;
switch (ret) {
case SMCCC_RET_SUCCESS:
return SPECTRE_MITIGATED;
case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
fallthrough;
case SMCCC_RET_NOT_REQUIRED:
return SPECTRE_UNAFFECTED;
default:
fallthrough;
case SMCCC_RET_NOT_SUPPORTED:
return SPECTRE_VULNERABLE;
}
}
bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
{
enum mitigation_state state;
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
state = spectre_v4_get_cpu_hw_mitigation_state();
if (state == SPECTRE_VULNERABLE)
state = spectre_v4_get_cpu_fw_mitigation_state();
return state != SPECTRE_UNAFFECTED;
}
arm64: factor out EL1 SSBS emulation hook Currently call_undef_hook() is used to handle UNDEFINED exceptions from EL0 and EL1. As support for deprecated instructions may be enabled independently, the handlers for individual instructions are organised as a linked list of struct undef_hook which can be manipulated dynamically. As this can be manipulated dynamically, the list is protected with a raw_spinlock which must be acquired when handling UNDEFINED exceptions or when manipulating the list of handlers. This locking is unfortunate as it serialises handling of UNDEFINED exceptions, and requires RCU to be enabled for lockdep, requiring the use of RCU_NONIDLE() in resume path of cpu_suspend() since commit: a2c42bbabbe260b7 ("arm64: spectre: Prevent lockdep splat on v4 mitigation enable path") The list of UNDEFINED handlers largely consist of handlers for exceptions taken from EL0, and the only handler for exceptions taken from EL1 handles `MSR SSBS, #imm` on CPUs which feature PSTATE.SSBS but lack the corresponding MSR (Immediate) instruction. Other than this we never expect to take an UNDEFINED exception from EL1 in normal operation. This patch reworks do_el0_undef() to invoke the EL1 SSBS handler directly, relegating call_undef_hook() to only handle EL0 UNDEFs. This removes redundant work to iterate the list for EL1 UNDEFs, and removes the need for locking, permitting EL1 UNDEFs to be handled in parallel without contention. The RCU_NONIDLE() call in cpu_suspend() will be removed in a subsequent patch, as there are other potential issues with the use of instrumentable code and RCU in the CPU suspend code. I've tested this by forcing the detection of SSBS on a CPU that doesn't have it, and verifying that the try_emulate_el1_ssbs() callback is invoked. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Joey Gouly <joey.gouly@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20221019144123.612388-4-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2022-10-19 15:41:17 +01:00
bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr)
{
arm64: factor out EL1 SSBS emulation hook Currently call_undef_hook() is used to handle UNDEFINED exceptions from EL0 and EL1. As support for deprecated instructions may be enabled independently, the handlers for individual instructions are organised as a linked list of struct undef_hook which can be manipulated dynamically. As this can be manipulated dynamically, the list is protected with a raw_spinlock which must be acquired when handling UNDEFINED exceptions or when manipulating the list of handlers. This locking is unfortunate as it serialises handling of UNDEFINED exceptions, and requires RCU to be enabled for lockdep, requiring the use of RCU_NONIDLE() in resume path of cpu_suspend() since commit: a2c42bbabbe260b7 ("arm64: spectre: Prevent lockdep splat on v4 mitigation enable path") The list of UNDEFINED handlers largely consist of handlers for exceptions taken from EL0, and the only handler for exceptions taken from EL1 handles `MSR SSBS, #imm` on CPUs which feature PSTATE.SSBS but lack the corresponding MSR (Immediate) instruction. Other than this we never expect to take an UNDEFINED exception from EL1 in normal operation. This patch reworks do_el0_undef() to invoke the EL1 SSBS handler directly, relegating call_undef_hook() to only handle EL0 UNDEFs. This removes redundant work to iterate the list for EL1 UNDEFs, and removes the need for locking, permitting EL1 UNDEFs to be handled in parallel without contention. The RCU_NONIDLE() call in cpu_suspend() will be removed in a subsequent patch, as there are other potential issues with the use of instrumentable code and RCU in the CPU suspend code. I've tested this by forcing the detection of SSBS on a CPU that doesn't have it, and verifying that the try_emulate_el1_ssbs() callback is invoked. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Joey Gouly <joey.gouly@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20221019144123.612388-4-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2022-10-19 15:41:17 +01:00
const u32 instr_mask = ~(1U << PSTATE_Imm_shift);
const u32 instr_val = 0xd500401f | PSTATE_SSBS;
if ((instr & instr_mask) != instr_val)
return false;
if (instr & BIT(PSTATE_Imm_shift))
regs->pstate |= PSR_SSBS_BIT;
else
regs->pstate &= ~PSR_SSBS_BIT;
arm64_skip_faulting_instruction(regs, 4);
arm64: factor out EL1 SSBS emulation hook Currently call_undef_hook() is used to handle UNDEFINED exceptions from EL0 and EL1. As support for deprecated instructions may be enabled independently, the handlers for individual instructions are organised as a linked list of struct undef_hook which can be manipulated dynamically. As this can be manipulated dynamically, the list is protected with a raw_spinlock which must be acquired when handling UNDEFINED exceptions or when manipulating the list of handlers. This locking is unfortunate as it serialises handling of UNDEFINED exceptions, and requires RCU to be enabled for lockdep, requiring the use of RCU_NONIDLE() in resume path of cpu_suspend() since commit: a2c42bbabbe260b7 ("arm64: spectre: Prevent lockdep splat on v4 mitigation enable path") The list of UNDEFINED handlers largely consist of handlers for exceptions taken from EL0, and the only handler for exceptions taken from EL1 handles `MSR SSBS, #imm` on CPUs which feature PSTATE.SSBS but lack the corresponding MSR (Immediate) instruction. Other than this we never expect to take an UNDEFINED exception from EL1 in normal operation. This patch reworks do_el0_undef() to invoke the EL1 SSBS handler directly, relegating call_undef_hook() to only handle EL0 UNDEFs. This removes redundant work to iterate the list for EL1 UNDEFs, and removes the need for locking, permitting EL1 UNDEFs to be handled in parallel without contention. The RCU_NONIDLE() call in cpu_suspend() will be removed in a subsequent patch, as there are other potential issues with the use of instrumentable code and RCU in the CPU suspend code. I've tested this by forcing the detection of SSBS on a CPU that doesn't have it, and verifying that the try_emulate_el1_ssbs() callback is invoked. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: James Morse <james.morse@arm.com> Cc: Joey Gouly <joey.gouly@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20221019144123.612388-4-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2022-10-19 15:41:17 +01:00
return true;
}
static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
{
enum mitigation_state state;
/*
* If the system is mitigated but this CPU doesn't have SSBS, then
* we must be on the safelist and there's nothing more to do.
*/
state = spectre_v4_get_cpu_hw_mitigation_state();
if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
return state;
if (spectre_v4_mitigations_off()) {
sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
set_pstate_ssbs(1);
return SPECTRE_VULNERABLE;
}
/* SCTLR_EL1.DSSBS was initialised to 0 during boot */
set_pstate_ssbs(0);
/*
* SSBS is self-synchronizing and is intended to affect subsequent
* speculative instructions, but some CPUs can speculate with a stale
* value of SSBS.
*
* Mitigate this with an unconditional speculation barrier, as CPUs
* could mis-speculate branches and bypass a conditional barrier.
*/
if (IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386))
spec_bar();
return SPECTRE_MITIGATED;
}
/*
* Patch a branch over the Spectre-v4 mitigation code with a NOP so that
* we fallthrough and check whether firmware needs to be called on this CPU.
*/
void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
__le32 *origptr,
__le32 *updptr, int nr_inst)
{
BUG_ON(nr_inst != 1); /* Branch -> NOP */
if (spectre_v4_mitigations_off())
return;
if (cpus_have_cap(ARM64_SSBS))
return;
if (spectre_v4_mitigations_dynamic())
*updptr = cpu_to_le32(aarch64_insn_gen_nop());
}
/*
* Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
* to call into firmware to adjust the mitigation state.
*/
void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt,
__le32 *origptr,
__le32 *updptr, int nr_inst)
{
u32 insn;
BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
insn = aarch64_insn_get_hvc_value();
break;
case SMCCC_CONDUIT_SMC:
insn = aarch64_insn_get_smc_value();
break;
default:
return;
}
*updptr = cpu_to_le32(insn);
}
static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
{
enum mitigation_state state;
state = spectre_v4_get_cpu_fw_mitigation_state();
if (state != SPECTRE_MITIGATED)
return state;
if (spectre_v4_mitigations_off()) {
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
return SPECTRE_VULNERABLE;
}
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);
if (spectre_v4_mitigations_dynamic())
__this_cpu_write(arm64_ssbd_callback_required, 1);
return SPECTRE_MITIGATED;
}
void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
enum mitigation_state state;
WARN_ON(preemptible());
state = spectre_v4_enable_hw_mitigation();
if (state == SPECTRE_VULNERABLE)
state = spectre_v4_enable_fw_mitigation();
update_mitigation_state(&spectre_v4_state, state);
}
static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
{
u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
if (state)
regs->pstate |= bit;
else
regs->pstate &= ~bit;
}
void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
{
struct pt_regs *regs = task_pt_regs(tsk);
bool ssbs = false, kthread = tsk->flags & PF_KTHREAD;
if (spectre_v4_mitigations_off())
ssbs = true;
else if (spectre_v4_mitigations_dynamic() && !kthread)
ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);
__update_pstate_ssbs(regs, ssbs);
}
/*
* The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
* This is interesting because the "speculation disabled" behaviour can be
* configured so that it is preserved across exec(), which means that the
* prctl() may be necessary even when PSTATE.SSBS can be toggled directly
* from userspace.
*/
static void ssbd_prctl_enable_mitigation(struct task_struct *task)
{
task_clear_spec_ssb_noexec(task);
task_set_spec_ssb_disable(task);
set_tsk_thread_flag(task, TIF_SSBD);
}
static void ssbd_prctl_disable_mitigation(struct task_struct *task)
{
task_clear_spec_ssb_noexec(task);
task_clear_spec_ssb_disable(task);
clear_tsk_thread_flag(task, TIF_SSBD);
}
static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
case PR_SPEC_ENABLE:
/* Enable speculation: disable mitigation */
/*
* Force disabled speculation prevents it from being
* re-enabled.
*/
if (task_spec_ssb_force_disable(task))
return -EPERM;
/*
* If the mitigation is forced on, then speculation is forced
* off and we again prevent it from being re-enabled.
*/
if (spectre_v4_mitigations_on())
return -EPERM;
ssbd_prctl_disable_mitigation(task);
break;
case PR_SPEC_FORCE_DISABLE:
/* Force disable speculation: force enable mitigation */
/*
* If the mitigation is forced off, then speculation is forced
* on and we prevent it from being disabled.
*/
if (spectre_v4_mitigations_off())
return -EPERM;
task_set_spec_ssb_force_disable(task);
fallthrough;
case PR_SPEC_DISABLE:
/* Disable speculation: enable mitigation */
/* Same as PR_SPEC_FORCE_DISABLE */
if (spectre_v4_mitigations_off())
return -EPERM;
ssbd_prctl_enable_mitigation(task);
break;
case PR_SPEC_DISABLE_NOEXEC:
/* Disable speculation until execve(): enable mitigation */
/*
* If the mitigation state is forced one way or the other, then
* we must fail now before we try to toggle it on execve().
*/
if (task_spec_ssb_force_disable(task) ||
spectre_v4_mitigations_off() ||
spectre_v4_mitigations_on()) {
return -EPERM;
}
ssbd_prctl_enable_mitigation(task);
task_set_spec_ssb_noexec(task);
break;
default:
return -ERANGE;
}
spectre_v4_enable_task_mitigation(task);
return 0;
}
int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
unsigned long ctrl)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssbd_prctl_set(task, ctrl);
default:
return -ENODEV;
}
}
static int ssbd_prctl_get(struct task_struct *task)
{
switch (spectre_v4_state) {
case SPECTRE_UNAFFECTED:
return PR_SPEC_NOT_AFFECTED;
case SPECTRE_MITIGATED:
if (spectre_v4_mitigations_on())
return PR_SPEC_NOT_AFFECTED;
if (spectre_v4_mitigations_dynamic())
break;
/* Mitigations are disabled, so we're vulnerable. */
fallthrough;
case SPECTRE_VULNERABLE:
fallthrough;
default:
return PR_SPEC_ENABLE;
}
/* Check the mitigation state for this task */
if (task_spec_ssb_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ssb_noexec(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssbd_prctl_get(task);
default:
return -ENODEV;
}
}
/*
* Spectre BHB.
*
* A CPU is either:
* - Mitigated by a branchy loop a CPU specific number of times, and listed
* in our "loop mitigated list".
* - Mitigated in software by the firmware Spectre v2 call.
* - Has the ClearBHB instruction to perform the mitigation.
* - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no
* software mitigation in the vectors is needed.
* - Has CSV2.3, so is unaffected.
*/
static enum mitigation_state spectre_bhb_state;
enum mitigation_state arm64_get_spectre_bhb_state(void)
{
return spectre_bhb_state;
}
enum bhb_mitigation_bits {
BHB_LOOP,
BHB_FW,
BHB_HW,
BHB_INSN,
};
static unsigned long system_bhb_mitigations;
/*
* This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any
* SCOPE_SYSTEM call will give the right answer.
*/
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
static bool is_spectre_bhb_safe(int scope)
{
static const struct midr_range spectre_bhb_safe_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A510),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A520),
MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
{},
};
static bool all_safe = true;
if (scope != SCOPE_LOCAL_CPU)
return all_safe;
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
if (is_midr_in_range_list(spectre_bhb_safe_list))
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
return true;
all_safe = false;
return false;
}
static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
static const struct midr_range spectre_bhb_k132_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
arm64: errata: Add missing sentinels to Spectre-BHB MIDR arrays Commit a5951389e58d ("arm64: errata: Add newer ARM cores to the spectre_bhb_loop_affected() lists") added some additional CPUs to the Spectre-BHB workaround, including some new arrays for designs that require new 'k' values for the workaround to be effective. Unfortunately, the new arrays omitted the sentinel entry and so is_midr_in_range_list() will walk off the end when it doesn't find a match. With UBSAN enabled, this leads to a crash during boot when is_midr_in_range_list() is inlined (which was more common prior to c8c2647e69be ("arm64: Make  _midr_in_range_list() an exported function")): | Internal error: aarch64 BRK: 00000000f2000001 [#1] PREEMPT SMP | pstate: 804000c5 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : spectre_bhb_loop_affected+0x28/0x30 | lr : is_spectre_bhb_affected+0x170/0x190 | [...] | Call trace: | spectre_bhb_loop_affected+0x28/0x30 | update_cpu_capabilities+0xc0/0x184 | init_cpu_features+0x188/0x1a4 | cpuinfo_store_boot_cpu+0x4c/0x60 | smp_prepare_boot_cpu+0x38/0x54 | start_kernel+0x8c/0x478 | __primary_switched+0xc8/0xd4 | Code: 6b09011f 54000061 52801080 d65f03c0 (d4200020) | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: aarch64 BRK: Fatal exception Add the missing sentinel entries. Cc: Lee Jones <lee@kernel.org> Cc: James Morse <james.morse@arm.com> Cc: Doug Anderson <dianders@chromium.org> Cc: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> Cc: <stable@vger.kernel.org> Reported-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Fixes: a5951389e58d ("arm64: errata: Add newer ARM cores to the spectre_bhb_loop_affected() lists") Signed-off-by: Will Deacon <will@kernel.org> Reviewed-by: Lee Jones <lee@kernel.org> Reviewed-by: Douglas Anderson <dianders@chromium.org> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Link: https://lore.kernel.org/r/20250501104747.28431-1-will@kernel.org Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-05-01 11:47:47 +01:00
{},
};
static const struct midr_range spectre_bhb_k38_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
arm64: errata: Add missing sentinels to Spectre-BHB MIDR arrays Commit a5951389e58d ("arm64: errata: Add newer ARM cores to the spectre_bhb_loop_affected() lists") added some additional CPUs to the Spectre-BHB workaround, including some new arrays for designs that require new 'k' values for the workaround to be effective. Unfortunately, the new arrays omitted the sentinel entry and so is_midr_in_range_list() will walk off the end when it doesn't find a match. With UBSAN enabled, this leads to a crash during boot when is_midr_in_range_list() is inlined (which was more common prior to c8c2647e69be ("arm64: Make  _midr_in_range_list() an exported function")): | Internal error: aarch64 BRK: 00000000f2000001 [#1] PREEMPT SMP | pstate: 804000c5 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : spectre_bhb_loop_affected+0x28/0x30 | lr : is_spectre_bhb_affected+0x170/0x190 | [...] | Call trace: | spectre_bhb_loop_affected+0x28/0x30 | update_cpu_capabilities+0xc0/0x184 | init_cpu_features+0x188/0x1a4 | cpuinfo_store_boot_cpu+0x4c/0x60 | smp_prepare_boot_cpu+0x38/0x54 | start_kernel+0x8c/0x478 | __primary_switched+0xc8/0xd4 | Code: 6b09011f 54000061 52801080 d65f03c0 (d4200020) | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: aarch64 BRK: Fatal exception Add the missing sentinel entries. Cc: Lee Jones <lee@kernel.org> Cc: James Morse <james.morse@arm.com> Cc: Doug Anderson <dianders@chromium.org> Cc: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> Cc: <stable@vger.kernel.org> Reported-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Fixes: a5951389e58d ("arm64: errata: Add newer ARM cores to the spectre_bhb_loop_affected() lists") Signed-off-by: Will Deacon <will@kernel.org> Reviewed-by: Lee Jones <lee@kernel.org> Reviewed-by: Douglas Anderson <dianders@chromium.org> Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Link: https://lore.kernel.org/r/20250501104747.28431-1-will@kernel.org Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-05-01 11:47:47 +01:00
{},
};
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
{},
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
MIDR_ALL_VERSIONS(MIDR_HISI_HIP09),
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
{},
};
static const struct midr_range spectre_bhb_k11_list[] = {
MIDR_ALL_VERSIONS(MIDR_AMPERE1),
{},
};
static const struct midr_range spectre_bhb_k8_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
{},
};
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
if (is_midr_in_range_list(spectre_bhb_k132_list))
k = 132;
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
else if (is_midr_in_range_list(spectre_bhb_k38_list))
k = 38;
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
else if (is_midr_in_range_list(spectre_bhb_k32_list))
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
k = 32;
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
else if (is_midr_in_range_list(spectre_bhb_k24_list))
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
k = 24;
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
else if (is_midr_in_range_list(spectre_bhb_k11_list))
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
k = 11;
ARM: * Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM * Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace * Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers * Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration * Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation * pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat * Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: * Remove unnecessary header include path * Assume constant PGD during VM context switch * Add perf events support for guest VM RISC-V: * Disable the kernel perf counter during configure * KVM selftests improvements for PMU * Fix warning at the time of KVM module removal x86: * Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. * Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition. * Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2). * Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) * Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation. * Add common secure TSC infrastructure for use within SNP and in the future TDX * Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing. * Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock. * Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus. * Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests. * Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups). * Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates. * Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range). * Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config. * Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path. x86 (Intel): * Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1. * Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support. * Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC). * Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter. * Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits). x86 (AMD): * Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies). * Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation. * Add macros and helpers for setting GHCB return/error codes. * Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed. * Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address. * Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall. * Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features. Selftests: * Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters. * Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware. * Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration. * Fix a few minor bugs related to handling of stats FDs. * Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation). * Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked). -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmfcTkEUHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMnQAf/cPx72hJOdNy4Qrm8M33YLXVRVV00 yEZ8eN8TWdOclr0ltE/w/ELGh/qS4CU8pjURAk0A6lPioU+mdcTn3dPEqMDMVYom uOQ2lusEHw0UuSnGZSEjvZJsE/Ro2NSAsHIB6PWRqig1ZBPJzyu0frce34pMpeQH diwriJL9lKPAhBWXnUQ9BKoi1R0P5OLW9ahX4SOWk7cAFg4DLlDE66Nqf6nKqViw DwEucTiUEg5+a3d93gihdD4JNl+fb3vI2erxrMxjFjkacl0qgqRu3ei3DG0MfdHU wNcFSG5B1n0OECKxr80lr1Ip1KTVNNij0Ks+w6Gc6lSg9c4PptnNkfLK3A== =nnCN -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull kvm updates from Paolo Bonzini: "ARM: - Nested virtualization support for VGICv3, giving the nested hypervisor control of the VGIC hardware when running an L2 VM - Removal of 'late' nested virtualization feature register masking, making the supported feature set directly visible to userspace - Support for emulating FEAT_PMUv3 on Apple silicon, taking advantage of an IMPLEMENTATION DEFINED trap that covers all PMUv3 registers - Paravirtual interface for discovering the set of CPU implementations where a VM may run, addressing a longstanding issue of guest CPU errata awareness in big-little systems and cross-implementation VM migration - Userspace control of the registers responsible for identifying a particular CPU implementation (MIDR_EL1, REVIDR_EL1, AIDR_EL1), allowing VMs to be migrated cross-implementation - pKVM updates, including support for tracking stage-2 page table allocations in the protected hypervisor in the 'SecPageTable' stat - Fixes to vPMU, ensuring that userspace updates to the vPMU after KVM_RUN are reflected into the backing perf events LoongArch: - Remove unnecessary header include path - Assume constant PGD during VM context switch - Add perf events support for guest VM RISC-V: - Disable the kernel perf counter during configure - KVM selftests improvements for PMU - Fix warning at the time of KVM module removal x86: - Add support for aging of SPTEs without holding mmu_lock. Not taking mmu_lock allows multiple aging actions to run in parallel, and more importantly avoids stalling vCPUs. This includes an implementation of per-rmap-entry locking; aging the gfn is done with only a per-rmap single-bin spinlock taken, whereas locking an rmap for write requires taking both the per-rmap spinlock and the mmu_lock. Note that this decreases slightly the accuracy of accessed-page information, because changes to the SPTE outside aging might not use atomic operations even if they could race against a clear of the Accessed bit. This is deliberate because KVM and mm/ tolerate false positives/negatives for accessed information, and testing has shown that reducing the latency of aging is far more beneficial to overall system performance than providing "perfect" young/old information. - Defer runtime CPUID updates until KVM emulates a CPUID instruction, to coalesce updates when multiple pieces of vCPU state are changing, e.g. as part of a nested transition - Fix a variety of nested emulation bugs, and add VMX support for synthesizing nested VM-Exit on interception (instead of injecting #UD into L2) - Drop "support" for async page faults for protected guests that do not set SEND_ALWAYS (i.e. that only want async page faults at CPL3) - Bring a bit of sanity to x86's VM teardown code, which has accumulated a lot of cruft over the years. Particularly, destroy vCPUs before the MMU, despite the latter being a VM-wide operation - Add common secure TSC infrastructure for use within SNP and in the future TDX - Block KVM_CAP_SYNC_REGS if guest state is protected. It does not make sense to use the capability if the relevant registers are not available for reading or writing - Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock - Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus - Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks; restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests - Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multipler+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups) - Don't write to the Xen hypercall page on MSR writes that are initiated by the host (userspace or KVM) to fix a class of bugs where KVM can write to guest memory at unexpected times, e.g. during vCPU creation if userspace has set the Xen hypercall MSR index to collide with an MSR that KVM emulates - Restrict the Xen hypercall MSR index to the unofficial synthetic range to reduce the set of possible collisions with MSRs that are emulated by KVM (collisions can still happen as KVM emulates Hyper-V MSRs, which also reside in the synthetic range) - Clean up and optimize KVM's handling of Xen MSR writes and xen_hvm_config - Update Xen TSC leaves during CPUID emulation instead of modifying the CPUID entries when updating PV clocks; there is no guarantee PV clocks will be updated between TSC frequency changes and CPUID emulation, and guest reads of the TSC leaves should be rare, i.e. are not a hot path x86 (Intel): - Fix a bug where KVM unnecessarily reads XFD_ERR from hardware and thus modifies the vCPU's XFD_ERR on a #NM due to CR0.TS=1 - Pass XFD_ERR as the payload when injecting #NM, as a preparatory step for upcoming FRED virtualization support - Decouple the EPT entry RWX protection bit macros from the EPT Violation bits, both as a general cleanup and in anticipation of adding support for emulating Mode-Based Execution Control (MBEC) - Reject KVM_RUN if userspace manages to gain control and stuff invalid guest state while KVM is in the middle of emulating nested VM-Enter - Add a macro to handle KVM's sanity checks on entry/exit VMCS control pairs in anticipation of adding sanity checks for secondary exit controls (the primary field is out of bits) x86 (AMD): - Ensure the PSP driver is initialized when both the PSP and KVM modules are built-in (the initcall framework doesn't handle dependencies) - Use long-term pins when registering encrypted memory regions, so that the pages are migrated out of MIGRATE_CMA/ZONE_MOVABLE and don't lead to excessive fragmentation - Add macros and helpers for setting GHCB return/error codes - Add support for Idle HLT interception, which elides interception if the vCPU has a pending, unmasked virtual IRQ when HLT is executed - Fix a bug in INVPCID emulation where KVM fails to check for a non-canonical address - Don't attempt VMRUN for SEV-ES+ guests if the vCPU's VMSA is invalid, e.g. because the vCPU was "destroyed" via SNP's AP Creation hypercall - Reject SNP AP Creation if the requested SEV features for the vCPU don't match the VM's configured set of features Selftests: - Fix again the Intel PMU counters test; add a data load and do CLFLUSH{OPT} on the data instead of executing code. The theory is that modern Intel CPUs have learned new code prefetching tricks that bypass the PMU counters - Fix a flaw in the Intel PMU counters test where it asserts that an event is counting correctly without actually knowing what the event counts on the underlying hardware - Fix a variety of flaws, bugs, and false failures/passes dirty_log_test, and improve its coverage by collecting all dirty entries on each iteration - Fix a few minor bugs related to handling of stats FDs - Add infrastructure to make vCPU and VM stats FDs available to tests by default (open the FDs during VM/vCPU creation) - Relax an assertion on the number of HLT exits in the xAPIC IPI test when running on a CPU that supports AMD's Idle HLT (which elides interception of HLT if a virtual IRQ is pending and unmasked)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (216 commits) RISC-V: KVM: Optimize comments in kvm_riscv_vcpu_isa_disable_allowed RISC-V: KVM: Teardown riscv specific bits after kvm_exit LoongArch: KVM: Register perf callbacks for guest LoongArch: KVM: Implement arch-specific functions for guest perf LoongArch: KVM: Add stub for kvm_arch_vcpu_preempted_in_kernel() LoongArch: KVM: Remove PGD saving during VM context switch LoongArch: KVM: Remove unnecessary header include path KVM: arm64: Tear down vGIC on failed vCPU creation KVM: arm64: PMU: Reload when resetting KVM: arm64: PMU: Reload when user modifies registers KVM: arm64: PMU: Fix SET_ONE_REG for vPMC regs KVM: arm64: PMU: Assume PMU presence in pmu-emul.c KVM: arm64: PMU: Set raw values from user to PM{C,I}NTEN{SET,CLR}, PMOVS{SET,CLR} KVM: arm64: Create each pKVM hyp vcpu after its corresponding host vcpu KVM: arm64: Factor out pKVM hyp vcpu creation to separate function KVM: arm64: Initialize HCRX_EL2 traps in pKVM KVM: arm64: Factor out setting HCRX_EL2 traps into separate function KVM: x86: block KVM_CAP_SYNC_REGS if guest state is protected KVM: x86: Add infrastructure for secure TSC KVM: x86: Push down setting vcpu.arch.user_set_tsc ...
2025-03-25 14:22:07 -07:00
else if (is_midr_in_range_list(spectre_bhb_k8_list))
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
k = 8;
return k;
}
static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void)
{
int ret;
struct arm_smccc_res res;
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_WORKAROUND_3, &res);
ret = res.a0;
switch (ret) {
case SMCCC_RET_SUCCESS:
return SPECTRE_MITIGATED;
case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
return SPECTRE_UNAFFECTED;
default:
fallthrough;
case SMCCC_RET_NOT_SUPPORTED:
return SPECTRE_VULNERABLE;
}
}
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
static bool has_spectre_bhb_fw_mitigation(void)
{
enum mitigation_state fw_state;
bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE;
fw_state = spectre_bhb_get_cpu_fw_mitigation_state();
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
return has_smccc && fw_state == SPECTRE_MITIGATED;
}
static bool supports_ecbhb(int scope)
{
u64 mmfr1;
if (scope == SCOPE_LOCAL_CPU)
mmfr1 = read_sysreg_s(SYS_ID_AA64MMFR1_EL1);
else
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
return cpuid_feature_extract_unsigned_field(mmfr1,
ID_AA64MMFR1_EL1_ECBHB_SHIFT);
}
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
static u8 max_bhb_k;
bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry,
int scope)
{
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
if (supports_csv2p3(scope))
return false;
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
if (is_spectre_bhb_safe(scope))
return false;
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
/*
* At this point the core isn't known to be "safe" so we're going to
* assume it's vulnerable. We still need to update `max_bhb_k` though,
* but only if we aren't mitigating with clearbhb though.
*/
if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU))
max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected());
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
return true;
}
u8 get_spectre_bhb_loop_value(void)
{
return max_bhb_k;
}
static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
{
const char *v = arm64_get_bp_hardening_vector(slot);
__this_cpu_write(this_cpu_vector, v);
/*
* When KPTI is in use, the vectors are switched when exiting to
* user-space.
*/
arm64: Avoid cpus_have_const_cap() for ARM64_UNMAP_KERNEL_AT_EL0 In arm64_kernel_unmapped_at_el0() we use cpus_have_const_cap() to check for ARM64_UNMAP_KERNEL_AT_EL0, but this is only necessary so that arm64_get_bp_hardening_vector() and this_cpu_set_vectors() can run prior to alternatives being patched. Otherwise this is not necessary and alternative_has_cap_*() would be preferable. For historical reasons, cpus_have_const_cap() is more complicated than it needs to be. Before cpucaps are finalized, it will perform a bitmap test of the system_cpucaps bitmap, and once cpucaps are finalized it will use an alternative branch. This used to be necessary to handle some race conditions in the window between cpucap detection and the subsequent patching of alternatives and static branches, where different branches could be out-of-sync with one another (or w.r.t. alternative sequences). Now that we use alternative branches instead of static branches, these are all patched atomically w.r.t. one another, and there are only a handful of cases that need special care in the window between cpucap detection and alternative patching. Due to the above, it would be nice to remove cpus_have_const_cap(), and migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(), or cpus_have_cap() depending on when their requirements. This will remove redundant instructions and improve code generation, and will make it easier to determine how each callsite will behave before, during, and after alternative patching. The ARM64_UNMAP_KERNEL_AT_EL0 cpucap is a system-wide feature that is detected and patched before any translation tables are created for userspace. In the window between detecting the ARM64_UNMAP_KERNEL_AT_EL0 cpucap and patching alternatives, most users of arm64_kernel_unmapped_at_el0() do not need to know that the cpucap has been detected: * As KVM is initialized after cpucaps are finalized, no usaef of arm64_kernel_unmapped_at_el0() in the KVM code is reachable during this window. * The arm64_mm_context_get() function in arch/arm64/mm/context.c is only called after the SMMU driver is brought up after alternatives have been patched. Thus this can safely use cpus_have_final_cap() or alternative_has_cap_*(). Similarly the asids_update_limit() function is called after alternatives have been patched as an arch_initcall, and this can safely use cpus_have_final_cap() or alternative_has_cap_*(). Similarly we do not expect an ASID rollover to occur between cpucaps being detected and patching alternatives. Thus set_reserved_asid_bits() can safely use cpus_have_final_cap() or alternative_has_cap_*(). * The __tlbi_user() and __tlbi_user_level() macros are not used during this window, and only need to invalidate additional entries once userspace translation tables have been active on a CPU. Thus these can safely use alternative_has_cap_*(). * The xen_kernel_unmapped_at_usr() function is not used during this window as it is only used in a late_initcall. Thus this can safely use cpus_have_final_cap() or alternative_has_cap_*(). * The arm64_get_meltdown_state() function is not used during this window. It only used by arm64_get_meltdown_state() and KVM code, both of which are only used after cpucaps have been finalized. Thus this can safely use cpus_have_final_cap() or alternative_has_cap_*(). * The tls_thread_switch() uses arm64_kernel_unmapped_at_el0() as an optimization to avoid zeroing tpidrro_el0 when KPTI is enabled and this will be trampled by the KPTI trampoline. It doesn't matter if this continues to zero the register during the window between detecting the cpucap and patching alternatives, so this can safely use alternative_has_cap_*(). * The sdei_arch_get_entry_point() and do_sdei_event() functions aren't reachable at this time as the SDEI driver is registered later by acpi_init() -> acpi_ghes_init() -> sdei_init(), where acpi_init is a subsys_initcall. Thus these can safely use cpus_have_final_cap() or alternative_has_cap_*(). * The uses under drivers/ aren't reachable at this time as the drivers are registered later: - TRBE is registered via module_init() - SMMUv3 is registred via module_driver() - SPE is registred via module_init() * The arm64_get_bp_hardening_vector() and this_cpu_set_vectors() functions need to run on boot CPUs prior to patching alternatives. As these are only called during the onlining of a CPU, it's fine to perform a system_cpucaps bitmap test using cpus_have_cap(). This patch modifies this_cpu_set_vectors() to use cpus_have_cap(), and replaced all other use of cpus_have_const_cap() with alternative_has_cap_unlikely(), which will avoid generating code to test the system_cpucaps bitmap and should be better for all subsequent calls at runtime. The ARM64_UNMAP_KERNEL_AT_EL0 cpucap is added to cpucap_is_possible() so that code can be elided entirely when this is not possible. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: James Morse <james.morse@arm.com> Cc: Suzuki K Poulose <suzuki.poulose@arm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2023-10-16 11:24:53 +01:00
if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0))
return;
write_sysreg(v, vbar_el1);
isb();
}
bool __read_mostly __nospectre_bhb;
static int __init parse_spectre_bhb_param(char *str)
{
__nospectre_bhb = true;
return 0;
}
early_param("nospectre_bhb", parse_spectre_bhb_param);
void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
{
bp_hardening_cb_t cpu_cb;
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
enum mitigation_state state = SPECTRE_VULNERABLE;
struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU))
return;
if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) {
/* No point mitigating Spectre-BHB alone. */
} else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
} else if (cpu_mitigations_off() || __nospectre_bhb) {
pr_info_once("spectre-bhb mitigation disabled by command line option\n");
} else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
state = SPECTRE_MITIGATED;
set_bit(BHB_HW, &system_bhb_mitigations);
} else if (supports_clearbhb(SCOPE_LOCAL_CPU)) {
/*
* Ensure KVM uses the indirect vector which will have ClearBHB
* added.
*/
if (!data->slot)
data->slot = HYP_VECTOR_INDIRECT;
this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN);
state = SPECTRE_MITIGATED;
set_bit(BHB_INSN, &system_bhb_mitigations);
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
} else if (spectre_bhb_loop_affected()) {
/*
* Ensure KVM uses the indirect vector which will have the
* branchy-loop added. A57/A72-r0 will already have selected
* the spectre-indirect vector, which is sufficient for BHB
* too.
*/
if (!data->slot)
data->slot = HYP_VECTOR_INDIRECT;
this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP);
state = SPECTRE_MITIGATED;
set_bit(BHB_LOOP, &system_bhb_mitigations);
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
} else if (has_spectre_bhb_fw_mitigation()) {
/*
* Ensure KVM uses one of the spectre bp_hardening
* vectors. The indirect vector doesn't include the EL3
* call, so needs upgrading to
* HYP_VECTOR_SPECTRE_INDIRECT.
*/
if (!data->slot || data->slot == HYP_VECTOR_INDIRECT)
data->slot += 1;
this_cpu_set_vectors(EL1_VECTOR_BHB_FW);
/*
* The WA3 call in the vectors supersedes the WA1 call
* made during context-switch. Uninstall any firmware
* bp_hardening callback.
*/
cpu_cb = spectre_v2_get_sw_mitigation_cb();
if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb)
__this_cpu_write(bp_hardening_data.fn, NULL);
state = SPECTRE_MITIGATED;
set_bit(BHB_FW, &system_bhb_mitigations);
}
update_mitigation_state(&spectre_bhb_state, state);
}
bool is_spectre_bhb_fw_mitigated(void)
{
return test_bit(BHB_FW, &system_bhb_mitigations);
}
/* Patched to NOP when enabled */
void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt,
__le32 *origptr,
__le32 *updptr, int nr_inst)
{
BUG_ON(nr_inst != 1);
if (test_bit(BHB_LOOP, &system_bhb_mitigations))
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
}
/* Patched to NOP when enabled */
void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt,
__le32 *origptr,
__le32 *updptr, int nr_inst)
{
BUG_ON(nr_inst != 1);
if (test_bit(BHB_FW, &system_bhb_mitigations))
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
}
/* Patched to correct the immediate */
void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
{
u8 rd;
u32 insn;
BUG_ON(nr_inst != 1); /* MOV -> MOV */
if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY))
return;
insn = le32_to_cpu(*origptr);
rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn);
arm64: errata: Assume that unknown CPUs _are_ vulnerable to Spectre BHB The code for detecting CPUs that are vulnerable to Spectre BHB was based on a hardcoded list of CPU IDs that were known to be affected. Unfortunately, the list mostly only contained the IDs of standard ARM cores. The IDs for many cores that are minor variants of the standard ARM cores (like many Qualcomm Kyro CPUs) weren't listed. This led the code to assume that those variants were not affected. Flip the code on its head and instead assume that a core is vulnerable if it doesn't have CSV2_3 but is unrecognized as being safe. This involves creating a "Spectre BHB safe" list. As of right now, the only CPU IDs added to the "Spectre BHB safe" list are ARM Cortex A35, A53, A55, A510, and A520. This list was created by looking for cores that weren't listed in ARM's list [1] as per review feedback on v2 of this patch [2]. Additionally Brahma A53 is added as per mailing list feedback [3]. NOTE: this patch will not actually _mitigate_ anyone, it will simply cause them to report themselves as vulnerable. If any cores in the system are reported as vulnerable but not mitigated then the whole system will be reported as vulnerable though the system will attempt to mitigate with the information it has about the known cores. [1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB [2] https://lore.kernel.org/r/20241219175128.GA25477@willie-the-truck [3] https://lore.kernel.org/r/18dbd7d1-a46c-4112-a425-320c99f67a8d@broadcom.com Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels") Cc: stable@vger.kernel.org Reviewed-by: Julius Werner <jwerner@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Link: https://lore.kernel.org/r/20250107120555.v4.2.I2040fa004dafe196243f67ebcc647cbedbb516e6@changeid Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2025-01-07 12:05:59 -08:00
insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0,
AARCH64_INSN_VARIANT_64BIT,
AARCH64_INSN_MOVEWIDE_ZERO);
*updptr++ = cpu_to_le32(insn);
}
/* Patched to mov WA3 when supported */
void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
{
u8 rd;
u32 insn;
BUG_ON(nr_inst != 1); /* MOV -> MOV */
if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) ||
!test_bit(BHB_FW, &system_bhb_mitigations))
return;
insn = le32_to_cpu(*origptr);
rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn);
insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR,
AARCH64_INSN_VARIANT_32BIT,
AARCH64_INSN_REG_ZR, rd,
ARM_SMCCC_ARCH_WORKAROUND_3);
if (WARN_ON_ONCE(insn == AARCH64_BREAK_FAULT))
return;
*updptr++ = cpu_to_le32(insn);
}
/* Patched to NOP when not supported */
void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
{
BUG_ON(nr_inst != 2);
if (test_bit(BHB_INSN, &system_bhb_mitigations))
return;
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
*updptr++ = cpu_to_le32(aarch64_insn_gen_nop());
}
#ifdef CONFIG_BPF_SYSCALL
#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n"
void unpriv_ebpf_notify(int new_state)
{
if (spectre_v2_state == SPECTRE_VULNERABLE ||
spectre_bhb_state != SPECTRE_MITIGATED)
return;
if (!new_state)
pr_err("WARNING: %s", EBPF_WARN);
}
#endif