linux/arch/arm64/kvm/vgic/vgic-init.c

752 lines
19 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2015, 2016 ARM Ltd.
*/
#include <linux/uaccess.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include "vgic.h"
/*
* Initialization rules: there are multiple stages to the vgic
* initialization, both for the distributor and the CPU interfaces. The basic
* idea is that even though the VGIC is not functional or not requested from
* user space, the critical path of the run loop can still call VGIC functions
* that just won't do anything, without them having to check additional
* initialization flags to ensure they don't look at uninitialized data
* structures.
*
* Distributor:
*
* - kvm_vgic_early_init(): initialization of static data that doesn't
* depend on any sizing information or emulation type. No allocation
* is allowed there.
*
* - vgic_init(): allocation and initialization of the generic data
* structures that depend on sizing information (number of CPUs,
* number of interrupts). Also initializes the vcpu specific data
* structures. Can be executed lazily for GICv2.
*
* CPU Interface:
*
* - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend
* on any sizing information. Private interrupts are allocated if not
* already allocated at vgic-creation time.
*/
/* EARLY INIT */
/**
* kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
* @kvm: The VM whose VGIC districutor should be initialized
*
* Only do initialization of static structures that don't require any
* allocation or sizing information from userspace. vgic_init() called
* kvm_vgic_dist_init() which takes care of the rest.
*/
void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}
/* CREATION */
static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
/**
* kvm_vgic_create: triggered by the instantiation of the VGIC device by
* user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
* or through the generic KVM_CREATE_DEVICE API ioctl.
* irqchip_in_kernel() tells you if this function succeeded or not.
* @kvm: kvm struct pointer
* @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
*/
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
struct kvm_vcpu *vcpu;
unsigned long i;
int ret;
/*
* This function is also called by the KVM_CREATE_IRQCHIP handler,
* which had no chance yet to check the availability of the GICv2
* emulation. So check this here again. KVM_CREATE_DEVICE does
* the proper checks already.
*/
if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
!kvm_vgic_global_state.can_emulate_gicv2)
return -ENODEV;
KVM: arm64: vgic-init: Plug vCPU vs. VGIC creation race syzkaller has found another ugly race in the VGIC, this time dealing with VGIC creation. Since kvm_vgic_create() doesn't sufficiently protect against in-flight vCPU creations, it is possible to get a vCPU into the kernel w/ an in-kernel VGIC but no allocation of private IRQs: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000d20 Mem abort info: ESR = 0x0000000096000046 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000 CM = 0, WnR = 1, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000103e4f000 [0000000000000d20] pgd=0800000102e1c403, p4d=0800000102e1c403, pud=0800000101146403, pmd=0000000000000000 Internal error: Oops: 0000000096000046 [#1] PREEMPT SMP CPU: 9 UID: 0 PID: 246 Comm: test Not tainted 6.14.0-rc6-00097-g0c90821f5db8 #16 Hardware name: linux,dummy-virt (DT) pstate: 814020c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : _raw_spin_lock_irqsave+0x34/0x8c lr : kvm_vgic_set_owner+0x54/0xa4 sp : ffff80008086ba20 x29: ffff80008086ba20 x28: ffff0000c19b5640 x27: 0000000000000000 x26: 0000000000000000 x25: ffff0000c4879bd0 x24: 000000000000001e x23: 0000000000000000 x22: 0000000000000000 x21: ffff0000c487af80 x20: ffff0000c487af18 x19: 0000000000000000 x18: 0000001afadd5a8b x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000001 x14: ffff0000c19b56c0 x13: 0030c9adf9d9889e x12: ffffc263710e1908 x11: 0000001afb0d74f2 x10: e0966b840b373664 x9 : ec806bf7d6a57cd5 x8 : ffff80008086b980 x7 : 0000000000000001 x6 : 0000000000000001 x5 : 0000000080800054 x4 : 4ec4ec4ec4ec4ec5 x3 : 0000000000000000 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 0000000000000d20 Call trace: _raw_spin_lock_irqsave+0x34/0x8c (P) kvm_vgic_set_owner+0x54/0xa4 kvm_timer_enable+0xf4/0x274 kvm_arch_vcpu_run_pid_change+0xe0/0x380 kvm_vcpu_ioctl+0x93c/0x9e0 __arm64_sys_ioctl+0xb4/0xec invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x30/0xd0 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x198/0x19c Code: b9000841 d503201f 52800001 52800022 (88e17c02) ---[ end trace 0000000000000000 ]--- Plug the race by explicitly checking for an in-progress vCPU creation and failing kvm_vgic_create() when that's the case. Add some comments to document all the things kvm_vgic_create() is trying to guard against too. Reported-by: Alexander Potapenko <glider@google.com> Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Tested-by: Alexander Potapenko <glider@google.com> Link: https://lore.kernel.org/r/20250523194722.4066715-6-oliver.upton@linux.dev Signed-off-by: Marc Zyngier <maz@kernel.org>
2025-05-23 12:47:22 -07:00
/*
* Ensure mutual exclusion with vCPU creation and any vCPU ioctls by:
*
* - Holding kvm->lock to prevent KVM_CREATE_VCPU from reaching
* kvm_arch_vcpu_precreate() and ensuring created_vcpus is stable.
* This alone is insufficient, as kvm_vm_ioctl_create_vcpu() drops
* the kvm->lock before completing the vCPU creation.
*/
lockdep_assert_held(&kvm->lock);
KVM: arm64: vgic-init: Plug vCPU vs. VGIC creation race syzkaller has found another ugly race in the VGIC, this time dealing with VGIC creation. Since kvm_vgic_create() doesn't sufficiently protect against in-flight vCPU creations, it is possible to get a vCPU into the kernel w/ an in-kernel VGIC but no allocation of private IRQs: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000d20 Mem abort info: ESR = 0x0000000096000046 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000 CM = 0, WnR = 1, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000103e4f000 [0000000000000d20] pgd=0800000102e1c403, p4d=0800000102e1c403, pud=0800000101146403, pmd=0000000000000000 Internal error: Oops: 0000000096000046 [#1] PREEMPT SMP CPU: 9 UID: 0 PID: 246 Comm: test Not tainted 6.14.0-rc6-00097-g0c90821f5db8 #16 Hardware name: linux,dummy-virt (DT) pstate: 814020c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : _raw_spin_lock_irqsave+0x34/0x8c lr : kvm_vgic_set_owner+0x54/0xa4 sp : ffff80008086ba20 x29: ffff80008086ba20 x28: ffff0000c19b5640 x27: 0000000000000000 x26: 0000000000000000 x25: ffff0000c4879bd0 x24: 000000000000001e x23: 0000000000000000 x22: 0000000000000000 x21: ffff0000c487af80 x20: ffff0000c487af18 x19: 0000000000000000 x18: 0000001afadd5a8b x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000001 x14: ffff0000c19b56c0 x13: 0030c9adf9d9889e x12: ffffc263710e1908 x11: 0000001afb0d74f2 x10: e0966b840b373664 x9 : ec806bf7d6a57cd5 x8 : ffff80008086b980 x7 : 0000000000000001 x6 : 0000000000000001 x5 : 0000000080800054 x4 : 4ec4ec4ec4ec4ec5 x3 : 0000000000000000 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 0000000000000d20 Call trace: _raw_spin_lock_irqsave+0x34/0x8c (P) kvm_vgic_set_owner+0x54/0xa4 kvm_timer_enable+0xf4/0x274 kvm_arch_vcpu_run_pid_change+0xe0/0x380 kvm_vcpu_ioctl+0x93c/0x9e0 __arm64_sys_ioctl+0xb4/0xec invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x30/0xd0 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x198/0x19c Code: b9000841 d503201f 52800001 52800022 (88e17c02) ---[ end trace 0000000000000000 ]--- Plug the race by explicitly checking for an in-progress vCPU creation and failing kvm_vgic_create() when that's the case. Add some comments to document all the things kvm_vgic_create() is trying to guard against too. Reported-by: Alexander Potapenko <glider@google.com> Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Tested-by: Alexander Potapenko <glider@google.com> Link: https://lore.kernel.org/r/20250523194722.4066715-6-oliver.upton@linux.dev Signed-off-by: Marc Zyngier <maz@kernel.org>
2025-05-23 12:47:22 -07:00
/*
* - Acquiring the vCPU mutex for every *online* vCPU to prevent
* concurrent vCPU ioctls for vCPUs already visible to userspace.
*/
ret = -EBUSY;
KVM: arm64: use kvm_trylock_all_vcpus when locking all vCPUs Use kvm_trylock_all_vcpus instead of a custom implementation when locking all vCPUs of a VM, to avoid triggering a lockdep warning, in the case in which the VM is configured to have more than MAX_LOCK_DEPTH vCPUs. This fixes the following false lockdep warning: [ 328.171264] BUG: MAX_LOCK_DEPTH too low! [ 328.175227] turning off the locking correctness validator. [ 328.180726] Please attach the output of /proc/lock_stat to the bug report [ 328.187531] depth: 48 max: 48! [ 328.190678] 48 locks held by qemu-kvm/11664: [ 328.194957] #0: ffff800086de5ba0 (&kvm->lock){+.+.}-{3:3}, at: kvm_ioctl_create_device+0x174/0x5b0 [ 328.204048] #1: ffff0800e78800b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.212521] #2: ffff07ffeee51e98 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.220991] #3: ffff0800dc7d80b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.229463] #4: ffff07ffe0c980b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.237934] #5: ffff0800a3883c78 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.246405] #6: ffff07fffbe480b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 Suggested-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> Acked-by: Marc Zyngier <maz@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Message-ID: <20250512180407.659015-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2025-05-12 14:04:06 -04:00
if (kvm_trylock_all_vcpus(kvm))
return ret;
KVM: arm64: vgic-init: Plug vCPU vs. VGIC creation race syzkaller has found another ugly race in the VGIC, this time dealing with VGIC creation. Since kvm_vgic_create() doesn't sufficiently protect against in-flight vCPU creations, it is possible to get a vCPU into the kernel w/ an in-kernel VGIC but no allocation of private IRQs: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000d20 Mem abort info: ESR = 0x0000000096000046 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000 CM = 0, WnR = 1, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000103e4f000 [0000000000000d20] pgd=0800000102e1c403, p4d=0800000102e1c403, pud=0800000101146403, pmd=0000000000000000 Internal error: Oops: 0000000096000046 [#1] PREEMPT SMP CPU: 9 UID: 0 PID: 246 Comm: test Not tainted 6.14.0-rc6-00097-g0c90821f5db8 #16 Hardware name: linux,dummy-virt (DT) pstate: 814020c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : _raw_spin_lock_irqsave+0x34/0x8c lr : kvm_vgic_set_owner+0x54/0xa4 sp : ffff80008086ba20 x29: ffff80008086ba20 x28: ffff0000c19b5640 x27: 0000000000000000 x26: 0000000000000000 x25: ffff0000c4879bd0 x24: 000000000000001e x23: 0000000000000000 x22: 0000000000000000 x21: ffff0000c487af80 x20: ffff0000c487af18 x19: 0000000000000000 x18: 0000001afadd5a8b x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000001 x14: ffff0000c19b56c0 x13: 0030c9adf9d9889e x12: ffffc263710e1908 x11: 0000001afb0d74f2 x10: e0966b840b373664 x9 : ec806bf7d6a57cd5 x8 : ffff80008086b980 x7 : 0000000000000001 x6 : 0000000000000001 x5 : 0000000080800054 x4 : 4ec4ec4ec4ec4ec5 x3 : 0000000000000000 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 0000000000000d20 Call trace: _raw_spin_lock_irqsave+0x34/0x8c (P) kvm_vgic_set_owner+0x54/0xa4 kvm_timer_enable+0xf4/0x274 kvm_arch_vcpu_run_pid_change+0xe0/0x380 kvm_vcpu_ioctl+0x93c/0x9e0 __arm64_sys_ioctl+0xb4/0xec invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x30/0xd0 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x198/0x19c Code: b9000841 d503201f 52800001 52800022 (88e17c02) ---[ end trace 0000000000000000 ]--- Plug the race by explicitly checking for an in-progress vCPU creation and failing kvm_vgic_create() when that's the case. Add some comments to document all the things kvm_vgic_create() is trying to guard against too. Reported-by: Alexander Potapenko <glider@google.com> Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Tested-by: Alexander Potapenko <glider@google.com> Link: https://lore.kernel.org/r/20250523194722.4066715-6-oliver.upton@linux.dev Signed-off-by: Marc Zyngier <maz@kernel.org>
2025-05-23 12:47:22 -07:00
/*
* - Taking the config_lock which protects VGIC data structures such
* as the per-vCPU arrays of private IRQs (SGIs, PPIs).
*/
mutex_lock(&kvm->arch.config_lock);
KVM: arm64: vgic-init: Plug vCPU vs. VGIC creation race syzkaller has found another ugly race in the VGIC, this time dealing with VGIC creation. Since kvm_vgic_create() doesn't sufficiently protect against in-flight vCPU creations, it is possible to get a vCPU into the kernel w/ an in-kernel VGIC but no allocation of private IRQs: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000d20 Mem abort info: ESR = 0x0000000096000046 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000 CM = 0, WnR = 1, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000103e4f000 [0000000000000d20] pgd=0800000102e1c403, p4d=0800000102e1c403, pud=0800000101146403, pmd=0000000000000000 Internal error: Oops: 0000000096000046 [#1] PREEMPT SMP CPU: 9 UID: 0 PID: 246 Comm: test Not tainted 6.14.0-rc6-00097-g0c90821f5db8 #16 Hardware name: linux,dummy-virt (DT) pstate: 814020c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : _raw_spin_lock_irqsave+0x34/0x8c lr : kvm_vgic_set_owner+0x54/0xa4 sp : ffff80008086ba20 x29: ffff80008086ba20 x28: ffff0000c19b5640 x27: 0000000000000000 x26: 0000000000000000 x25: ffff0000c4879bd0 x24: 000000000000001e x23: 0000000000000000 x22: 0000000000000000 x21: ffff0000c487af80 x20: ffff0000c487af18 x19: 0000000000000000 x18: 0000001afadd5a8b x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000001 x14: ffff0000c19b56c0 x13: 0030c9adf9d9889e x12: ffffc263710e1908 x11: 0000001afb0d74f2 x10: e0966b840b373664 x9 : ec806bf7d6a57cd5 x8 : ffff80008086b980 x7 : 0000000000000001 x6 : 0000000000000001 x5 : 0000000080800054 x4 : 4ec4ec4ec4ec4ec5 x3 : 0000000000000000 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 0000000000000d20 Call trace: _raw_spin_lock_irqsave+0x34/0x8c (P) kvm_vgic_set_owner+0x54/0xa4 kvm_timer_enable+0xf4/0x274 kvm_arch_vcpu_run_pid_change+0xe0/0x380 kvm_vcpu_ioctl+0x93c/0x9e0 __arm64_sys_ioctl+0xb4/0xec invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x30/0xd0 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x198/0x19c Code: b9000841 d503201f 52800001 52800022 (88e17c02) ---[ end trace 0000000000000000 ]--- Plug the race by explicitly checking for an in-progress vCPU creation and failing kvm_vgic_create() when that's the case. Add some comments to document all the things kvm_vgic_create() is trying to guard against too. Reported-by: Alexander Potapenko <glider@google.com> Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Tested-by: Alexander Potapenko <glider@google.com> Link: https://lore.kernel.org/r/20250523194722.4066715-6-oliver.upton@linux.dev Signed-off-by: Marc Zyngier <maz@kernel.org>
2025-05-23 12:47:22 -07:00
/*
* - Bailing on the entire thing if a vCPU is in the middle of creation,
* dropped the kvm->lock, but hasn't reached kvm_arch_vcpu_create().
*
* The whole combination of this guarantees that no vCPU can get into
* KVM with a VGIC configuration inconsistent with the VM's VGIC.
*/
if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
goto out_unlock;
if (irqchip_in_kernel(kvm)) {
ret = -EEXIST;
goto out_unlock;
}
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu_has_run_once(vcpu))
goto out_unlock;
}
ret = 0;
if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
kvm->max_vcpus = VGIC_V2_MAX_CPUS;
else
kvm->max_vcpus = VGIC_V3_MAX_CPUS;
if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
ret = -E2BIG;
goto out_unlock;
}
kvm_for_each_vcpu(i, vcpu, kvm) {
ret = vgic_allocate_private_irqs_locked(vcpu, type);
if (ret)
break;
}
if (ret) {
kvm_for_each_vcpu(i, vcpu, kvm) {
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
kfree(vgic_cpu->private_irqs);
vgic_cpu->private_irqs = NULL;
}
goto out_unlock;
}
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
else
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
KVM: arm64: use kvm_trylock_all_vcpus when locking all vCPUs Use kvm_trylock_all_vcpus instead of a custom implementation when locking all vCPUs of a VM, to avoid triggering a lockdep warning, in the case in which the VM is configured to have more than MAX_LOCK_DEPTH vCPUs. This fixes the following false lockdep warning: [ 328.171264] BUG: MAX_LOCK_DEPTH too low! [ 328.175227] turning off the locking correctness validator. [ 328.180726] Please attach the output of /proc/lock_stat to the bug report [ 328.187531] depth: 48 max: 48! [ 328.190678] 48 locks held by qemu-kvm/11664: [ 328.194957] #0: ffff800086de5ba0 (&kvm->lock){+.+.}-{3:3}, at: kvm_ioctl_create_device+0x174/0x5b0 [ 328.204048] #1: ffff0800e78800b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.212521] #2: ffff07ffeee51e98 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.220991] #3: ffff0800dc7d80b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.229463] #4: ffff07ffe0c980b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.237934] #5: ffff0800a3883c78 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.246405] #6: ffff07fffbe480b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 Suggested-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com> Acked-by: Marc Zyngier <maz@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Message-ID: <20250512180407.659015-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2025-05-12 14:04:06 -04:00
kvm_unlock_all_vcpus(kvm);
return ret;
}
/* INIT/DESTROY */
/**
* kvm_vgic_dist_init: initialize the dist data structures
* @kvm: kvm struct pointer
* @nr_spis: number of spis, frozen by caller
*/
static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
int i;
dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
if (!dist->spis)
return -ENOMEM;
/*
* In the following code we do not take the irq struct lock since
* no other action on irq structs can happen while the VGIC is
* not initialized yet:
* If someone wants to inject an interrupt or does a MMIO access, we
* require prior initialization in case of a virtual GICv3 or trigger
* initialization when using a virtual GICv2.
*/
for (i = 0; i < nr_spis; i++) {
struct vgic_irq *irq = &dist->spis[i];
irq->intid = i + VGIC_NR_PRIVATE_IRQS;
INIT_LIST_HEAD(&irq->ap_list);
raw_spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
kref_init(&irq->refcount);
switch (dist->vgic_model) {
case KVM_DEV_TYPE_ARM_VGIC_V2:
irq->targets = 0;
irq->group = 0;
break;
case KVM_DEV_TYPE_ARM_VGIC_V3:
irq->mpidr = 0;
irq->group = 1;
break;
default:
kfree(dist->spis);
dist->spis = NULL;
return -EINVAL;
}
}
return 0;
}
/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */
#define DEFAULT_MI_INTID 25
int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
{
int ret;
guard(mutex)(&vcpu->kvm->arch.config_lock);
/*
* Matching the tradition established with the timers, provide
* a default PPI for the maintenance interrupt. It makes
* things easier to reason about.
*/
if (vcpu->kvm->arch.vgic.mi_intid == 0)
vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID;
ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu);
return ret;
}
static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
int i;
lockdep_assert_held(&vcpu->kvm->arch.config_lock);
if (vgic_cpu->private_irqs)
return 0;
vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS,
sizeof(struct vgic_irq),
GFP_KERNEL_ACCOUNT);
if (!vgic_cpu->private_irqs)
return -ENOMEM;
/*
* Enable and configure all SGIs to be edge-triggered and
* configure all PPIs as level-triggered.
*/
for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
INIT_LIST_HEAD(&irq->ap_list);
raw_spin_lock_init(&irq->irq_lock);
irq->intid = i;
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
kref_init(&irq->refcount);
if (vgic_irq_is_sgi(i)) {
/* SGIs */
irq->enabled = 1;
irq->config = VGIC_CONFIG_EDGE;
} else {
/* PPIs */
irq->config = VGIC_CONFIG_LEVEL;
}
switch (type) {
case KVM_DEV_TYPE_ARM_VGIC_V3:
irq->group = 1;
irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
break;
case KVM_DEV_TYPE_ARM_VGIC_V2:
irq->group = 0;
irq->targets = BIT(vcpu->vcpu_id);
break;
}
}
return 0;
}
static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type)
{
int ret;
mutex_lock(&vcpu->kvm->arch.config_lock);
ret = vgic_allocate_private_irqs_locked(vcpu, type);
mutex_unlock(&vcpu->kvm->arch.config_lock);
return ret;
}
/**
* kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
* structures and register VCPU-specific KVM iodevs
*
* @vcpu: pointer to the VCPU being created and initialized
*
* Only do initialization, but do not actually enable the
* VGIC CPU interface
*/
int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
int ret = 0;
vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
raw_spin_lock_init(&vgic_cpu->ap_list_lock);
atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
if (!irqchip_in_kernel(vcpu->kvm))
return 0;
ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model);
if (ret)
return ret;
/*
* If we are creating a VCPU with a GICv3 we must also register the
* KVM io device for the redistributor that belongs to this VCPU.
*/
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
mutex_lock(&vcpu->kvm->slots_lock);
ret = vgic_register_redist_iodev(vcpu);
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
mutex_unlock(&vcpu->kvm->slots_lock);
}
return ret;
}
static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
{
if (kvm_vgic_global_state.type == VGIC_V2)
vgic_v2_enable(vcpu);
else
vgic_v3_enable(vcpu);
}
/*
* vgic_init: allocates and initializes dist and vcpu data structures
* depending on two dimensioning parameters:
* - the number of spis
* - the number of vcpus
* The function is generally called when nr_spis has been explicitly set
* by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
* vgic_initialized() returns true when this function has succeeded.
*/
int vgic_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu;
int ret = 0;
unsigned long idx;
lockdep_assert_held(&kvm->arch.config_lock);
if (vgic_initialized(kvm))
return 0;
/* Are we also in the middle of creating a VCPU? */
if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
return -EBUSY;
/* freeze the number of spis */
if (!dist->nr_spis)
dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
if (ret)
goto out;
/*
* Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs,
* vLPIs) is supported.
*/
if (vgic_supports_direct_irqs(kvm)) {
ret = vgic_v4_init(kvm);
if (ret)
goto out;
}
kvm_for_each_vcpu(idx, vcpu, kvm)
kvm_vgic_vcpu_enable(vcpu);
ret = kvm_vgic_setup_default_irq_routing(kvm);
if (ret)
goto out;
vgic_debug_init(kvm);
dist->initialized = true;
out:
return ret;
}
static void kvm_vgic_dist_destroy(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_redist_region *rdreg, *next;
dist->ready = false;
dist->initialized = false;
kfree(dist->spis);
dist->spis = NULL;
dist->nr_spis = 0;
dist->vgic_dist_base = VGIC_ADDR_UNDEF;
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
vgic_v3_free_redist_region(kvm, rdreg);
INIT_LIST_HEAD(&dist->rd_regions);
} else {
dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
if (vgic_supports_direct_irqs(kvm))
vgic_v4_teardown(kvm);
xa_destroy(&dist->lpi_xa);
}
static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
KVM: arm64: vgic-v3: Retire all pending LPIs on vcpu destroy It's likely that the vcpu fails to handle all virtual interrupts if userspace decides to destroy it, leaving the pending ones stay in the ap_list. If the un-handled one is a LPI, its vgic_irq structure will be eventually leaked because of an extra refcount increment in vgic_queue_irq_unlock(). This was detected by kmemleak on almost every guest destroy, the backtrace is as follows: unreferenced object 0xffff80725aed5500 (size 128): comm "CPU 5/KVM", pid 40711, jiffies 4298024754 (age 166366.512s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 08 01 a9 73 6d 80 ff ff ...........sm... c8 61 ee a9 00 20 ff ff 28 1e 55 81 6c 80 ff ff .a... ..(.U.l... backtrace: [<000000004bcaa122>] kmem_cache_alloc_trace+0x2dc/0x418 [<0000000069c7dabb>] vgic_add_lpi+0x88/0x418 [<00000000bfefd5c5>] vgic_its_cmd_handle_mapi+0x4dc/0x588 [<00000000cf993975>] vgic_its_process_commands.part.5+0x484/0x1198 [<000000004bd3f8e3>] vgic_its_process_commands+0x50/0x80 [<00000000b9a65b2b>] vgic_mmio_write_its_cwriter+0xac/0x108 [<0000000009641ebb>] dispatch_mmio_write+0xd0/0x188 [<000000008f79d288>] __kvm_io_bus_write+0x134/0x240 [<00000000882f39ac>] kvm_io_bus_write+0xe0/0x150 [<0000000078197602>] io_mem_abort+0x484/0x7b8 [<0000000060954e3c>] kvm_handle_guest_abort+0x4cc/0xa58 [<00000000e0d0cd65>] handle_exit+0x24c/0x770 [<00000000b44a7fad>] kvm_arch_vcpu_ioctl_run+0x460/0x1988 [<0000000025fb897c>] kvm_vcpu_ioctl+0x4f8/0xee0 [<000000003271e317>] do_vfs_ioctl+0x160/0xcd8 [<00000000e7f39607>] ksys_ioctl+0x98/0xd8 Fix it by retiring all pending LPIs in the ap_list on the destroy path. p.s. I can also reproduce it on a normal guest shutdown. It is because userspace still send LPIs to vcpu (through KVM_SIGNAL_MSI ioctl) while the guest is being shutdown and unable to handle it. A little strange though and haven't dig further... Reviewed-by: James Morse <james.morse@arm.com> Signed-off-by: Zenghui Yu <yuzenghui@huawei.com> [maz: moved the distributor deallocation down to avoid an UAF splat] Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20200414030349.625-2-yuzenghui@huawei.com
2020-04-14 11:03:47 +08:00
/*
* Retire all pending LPIs on this vcpu anyway as we're
* going to destroy it.
*/
vgic_flush_pending_lpis(vcpu);
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
kfree(vgic_cpu->private_irqs);
vgic_cpu->private_irqs = NULL;
KVM: arm64: Unregister redistributor for failed vCPU creation Alex reports that syzkaller has managed to trigger a use-after-free when tearing down a VM: BUG: KASAN: slab-use-after-free in kvm_put_kvm+0x300/0xe68 virt/kvm/kvm_main.c:5769 Read of size 8 at addr ffffff801c6890d0 by task syz.3.2219/10758 CPU: 3 UID: 0 PID: 10758 Comm: syz.3.2219 Not tainted 6.11.0-rc6-dirty #64 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x17c/0x1a8 arch/arm64/kernel/stacktrace.c:317 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:324 __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x94/0xc0 lib/dump_stack.c:119 print_report+0x144/0x7a4 mm/kasan/report.c:377 kasan_report+0xcc/0x128 mm/kasan/report.c:601 __asan_report_load8_noabort+0x20/0x2c mm/kasan/report_generic.c:381 kvm_put_kvm+0x300/0xe68 virt/kvm/kvm_main.c:5769 kvm_vm_release+0x4c/0x60 virt/kvm/kvm_main.c:1409 __fput+0x198/0x71c fs/file_table.c:422 ____fput+0x20/0x30 fs/file_table.c:450 task_work_run+0x1cc/0x23c kernel/task_work.c:228 do_notify_resume+0x144/0x1a0 include/linux/resume_user_mode.h:50 el0_svc+0x64/0x68 arch/arm64/kernel/entry-common.c:169 el0t_64_sync_handler+0x90/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Upon closer inspection, it appears that we do not properly tear down the MMIO registration for a vCPU that fails creation late in the game, e.g. a vCPU w/ the same ID already exists in the VM. It is important to consider the context of commit that introduced this bug by moving the unregistration out of __kvm_vgic_vcpu_destroy(). That change correctly sought to avoid an srcu v. config_lock inversion by breaking up the vCPU teardown into two parts, one guarded by the config_lock. Fix the use-after-free while avoiding lock inversion by adding a special-cased unregistration to __kvm_vgic_vcpu_destroy(). This is safe because failed vCPUs are torn down outside of the config_lock. Cc: stable@vger.kernel.org Fixes: f616506754d3 ("KVM: arm64: vgic: Don't hold config_lock while unregistering redistributors") Reported-by: Alexander Potapenko <glider@google.com> Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Link: https://lore.kernel.org/r/20241007223909.2157336-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier <maz@kernel.org>
2024-10-07 22:39:09 +00:00
if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
/*
* If this vCPU is being destroyed because of a failed creation
* then unregister the redistributor to avoid leaving behind a
* dangling pointer to the vCPU struct.
*
* vCPUs that have been successfully created (i.e. added to
* kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as
* this function gets called while holding kvm->arch.config_lock
* in the VM teardown path and would otherwise introduce a lock
* inversion w.r.t. kvm->srcu.
*
* vCPUs that failed creation are torn down outside of the
* kvm->arch.config_lock and do not get unregistered in
* kvm_vgic_destroy(), meaning it is both safe and necessary to
* do so here.
*/
if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu)
vgic_unregister_redist_iodev(vcpu);
vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
KVM: arm64: Unregister redistributor for failed vCPU creation Alex reports that syzkaller has managed to trigger a use-after-free when tearing down a VM: BUG: KASAN: slab-use-after-free in kvm_put_kvm+0x300/0xe68 virt/kvm/kvm_main.c:5769 Read of size 8 at addr ffffff801c6890d0 by task syz.3.2219/10758 CPU: 3 UID: 0 PID: 10758 Comm: syz.3.2219 Not tainted 6.11.0-rc6-dirty #64 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x17c/0x1a8 arch/arm64/kernel/stacktrace.c:317 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:324 __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x94/0xc0 lib/dump_stack.c:119 print_report+0x144/0x7a4 mm/kasan/report.c:377 kasan_report+0xcc/0x128 mm/kasan/report.c:601 __asan_report_load8_noabort+0x20/0x2c mm/kasan/report_generic.c:381 kvm_put_kvm+0x300/0xe68 virt/kvm/kvm_main.c:5769 kvm_vm_release+0x4c/0x60 virt/kvm/kvm_main.c:1409 __fput+0x198/0x71c fs/file_table.c:422 ____fput+0x20/0x30 fs/file_table.c:450 task_work_run+0x1cc/0x23c kernel/task_work.c:228 do_notify_resume+0x144/0x1a0 include/linux/resume_user_mode.h:50 el0_svc+0x64/0x68 arch/arm64/kernel/entry-common.c:169 el0t_64_sync_handler+0x90/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Upon closer inspection, it appears that we do not properly tear down the MMIO registration for a vCPU that fails creation late in the game, e.g. a vCPU w/ the same ID already exists in the VM. It is important to consider the context of commit that introduced this bug by moving the unregistration out of __kvm_vgic_vcpu_destroy(). That change correctly sought to avoid an srcu v. config_lock inversion by breaking up the vCPU teardown into two parts, one guarded by the config_lock. Fix the use-after-free while avoiding lock inversion by adding a special-cased unregistration to __kvm_vgic_vcpu_destroy(). This is safe because failed vCPUs are torn down outside of the config_lock. Cc: stable@vger.kernel.org Fixes: f616506754d3 ("KVM: arm64: vgic: Don't hold config_lock while unregistering redistributors") Reported-by: Alexander Potapenko <glider@google.com> Signed-off-by: Oliver Upton <oliver.upton@linux.dev> Link: https://lore.kernel.org/r/20241007223909.2157336-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier <maz@kernel.org>
2024-10-07 22:39:09 +00:00
}
}
void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
mutex_lock(&kvm->slots_lock);
__kvm_vgic_vcpu_destroy(vcpu);
mutex_unlock(&kvm->slots_lock);
}
void kvm_vgic_destroy(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->arch.config_lock);
vgic_debug_destroy(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
__kvm_vgic_vcpu_destroy(vcpu);
KVM: arm64: vgic-v3: Retire all pending LPIs on vcpu destroy It's likely that the vcpu fails to handle all virtual interrupts if userspace decides to destroy it, leaving the pending ones stay in the ap_list. If the un-handled one is a LPI, its vgic_irq structure will be eventually leaked because of an extra refcount increment in vgic_queue_irq_unlock(). This was detected by kmemleak on almost every guest destroy, the backtrace is as follows: unreferenced object 0xffff80725aed5500 (size 128): comm "CPU 5/KVM", pid 40711, jiffies 4298024754 (age 166366.512s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 08 01 a9 73 6d 80 ff ff ...........sm... c8 61 ee a9 00 20 ff ff 28 1e 55 81 6c 80 ff ff .a... ..(.U.l... backtrace: [<000000004bcaa122>] kmem_cache_alloc_trace+0x2dc/0x418 [<0000000069c7dabb>] vgic_add_lpi+0x88/0x418 [<00000000bfefd5c5>] vgic_its_cmd_handle_mapi+0x4dc/0x588 [<00000000cf993975>] vgic_its_process_commands.part.5+0x484/0x1198 [<000000004bd3f8e3>] vgic_its_process_commands+0x50/0x80 [<00000000b9a65b2b>] vgic_mmio_write_its_cwriter+0xac/0x108 [<0000000009641ebb>] dispatch_mmio_write+0xd0/0x188 [<000000008f79d288>] __kvm_io_bus_write+0x134/0x240 [<00000000882f39ac>] kvm_io_bus_write+0xe0/0x150 [<0000000078197602>] io_mem_abort+0x484/0x7b8 [<0000000060954e3c>] kvm_handle_guest_abort+0x4cc/0xa58 [<00000000e0d0cd65>] handle_exit+0x24c/0x770 [<00000000b44a7fad>] kvm_arch_vcpu_ioctl_run+0x460/0x1988 [<0000000025fb897c>] kvm_vcpu_ioctl+0x4f8/0xee0 [<000000003271e317>] do_vfs_ioctl+0x160/0xcd8 [<00000000e7f39607>] ksys_ioctl+0x98/0xd8 Fix it by retiring all pending LPIs in the ap_list on the destroy path. p.s. I can also reproduce it on a normal guest shutdown. It is because userspace still send LPIs to vcpu (through KVM_SIGNAL_MSI ioctl) while the guest is being shutdown and unable to handle it. A little strange though and haven't dig further... Reviewed-by: James Morse <james.morse@arm.com> Signed-off-by: Zenghui Yu <yuzenghui@huawei.com> [maz: moved the distributor deallocation down to avoid an UAF splat] Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20200414030349.625-2-yuzenghui@huawei.com
2020-04-14 11:03:47 +08:00
kvm_vgic_dist_destroy(kvm);
mutex_unlock(&kvm->arch.config_lock);
if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm_for_each_vcpu(i, vcpu, kvm)
vgic_unregister_redist_iodev(vcpu);
mutex_unlock(&kvm->slots_lock);
}
/**
* vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
* is a GICv2. A GICv3 must be explicitly initialized by userspace using the
* KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
* @kvm: kvm struct pointer
*/
int vgic_lazy_init(struct kvm *kvm)
{
int ret = 0;
if (unlikely(!vgic_initialized(kvm))) {
/*
* We only provide the automatic initialization of the VGIC
* for the legacy case of a GICv2. Any other type must
* be explicitly initialized once setup with the respective
* KVM device call.
*/
if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
return -EBUSY;
mutex_lock(&kvm->arch.config_lock);
ret = vgic_init(kvm);
mutex_unlock(&kvm->arch.config_lock);
}
return ret;
}
/* RESOURCE MAPPING */
/**
* kvm_vgic_map_resources - map the MMIO regions
* @kvm: kvm struct pointer
*
* Map the MMIO regions depending on the VGIC model exposed to the guest
* called on the first VCPU run.
* Also map the virtual CPU interface into the VM.
* v2 calls vgic_init() if not already done.
* v3 and derivatives return an error if the VGIC is not initialized.
* vgic_ready() returns true if this function has succeeded.
*/
int kvm_vgic_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
enum vgic_type type;
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
gpa_t dist_base;
int ret = 0;
if (likely(vgic_ready(kvm)))
return 0;
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->arch.config_lock);
if (vgic_ready(kvm))
goto out;
if (!irqchip_in_kernel(kvm))
goto out;
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
ret = vgic_v2_map_resources(kvm);
type = VGIC_V2;
} else {
ret = vgic_v3_map_resources(kvm);
type = VGIC_V3;
}
if (ret)
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
goto out;
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
dist_base = dist->vgic_dist_base;
mutex_unlock(&kvm->arch.config_lock);
ret = vgic_register_dist_iodev(kvm, dist_base, type);
if (ret) {
KVM: arm64: vgic: Fix a circular locking issue Lockdep reports a circular lock dependency between the srcu and the config_lock: [ 262.179917] -> #1 (&kvm->srcu){.+.+}-{0:0}: [ 262.182010] __synchronize_srcu+0xb0/0x224 [ 262.183422] synchronize_srcu_expedited+0x24/0x34 [ 262.184554] kvm_io_bus_register_dev+0x324/0x50c [ 262.185650] vgic_register_redist_iodev+0x254/0x398 [ 262.186740] vgic_v3_set_redist_base+0x3b0/0x724 [ 262.188087] kvm_vgic_addr+0x364/0x600 [ 262.189189] vgic_set_common_attr+0x90/0x544 [ 262.190278] vgic_v3_set_attr+0x74/0x9c [ 262.191432] kvm_device_ioctl+0x2a0/0x4e4 [ 262.192515] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.193612] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.195006] do_el0_svc+0xe4/0x2d4 [ 262.195929] el0_svc+0x44/0x8c [ 262.196917] el0t_64_sync_handler+0xf4/0x120 [ 262.198238] el0t_64_sync+0x190/0x194 [ 262.199224] [ 262.199224] -> #0 (&kvm->arch.config_lock){+.+.}-{3:3}: [ 262.201094] __lock_acquire+0x2b70/0x626c [ 262.202245] lock_acquire+0x454/0x778 [ 262.203132] __mutex_lock+0x190/0x8b4 [ 262.204023] mutex_lock_nested+0x24/0x30 [ 262.205100] vgic_mmio_write_v3_misc+0x5c/0x2a0 [ 262.206178] dispatch_mmio_write+0xd8/0x258 [ 262.207498] __kvm_io_bus_write+0x1e0/0x350 [ 262.208582] kvm_io_bus_write+0xe0/0x1cc [ 262.209653] io_mem_abort+0x2ac/0x6d8 [ 262.210569] kvm_handle_guest_abort+0x9b8/0x1f88 [ 262.211937] handle_exit+0xc4/0x39c [ 262.212971] kvm_arch_vcpu_ioctl_run+0x90c/0x1c04 [ 262.214154] kvm_vcpu_ioctl+0x450/0x12f8 [ 262.215233] __arm64_sys_ioctl+0x7ac/0x1ba8 [ 262.216402] invoke_syscall.constprop.0+0x70/0x1e0 [ 262.217774] do_el0_svc+0xe4/0x2d4 [ 262.218758] el0_svc+0x44/0x8c [ 262.219941] el0t_64_sync_handler+0xf4/0x120 [ 262.221110] el0t_64_sync+0x190/0x194 Note that the current report, which can be triggered by the vgic_irq kselftest, is a triple chain that includes slots_lock, but after inverting the slots_lock/config_lock dependency, the actual problem reported above remains. In several places, the vgic code calls kvm_io_bus_register_dev(), which synchronizes the srcu, while holding config_lock (#1). And the MMIO handler takes the config_lock while holding the srcu read lock (#0). Break dependency #1, by registering the distributor and redistributors without holding config_lock. The ITS also uses kvm_io_bus_register_dev() but already relies on slots_lock to serialize calls. The distributor iodev is created on the first KVM_RUN call. Multiple threads will race for vgic initialization, and only the first one will see !vgic_ready() under the lock. To serialize those threads, rely on slots_lock rather than config_lock. Redistributors are created earlier, through KVM_DEV_ARM_VGIC_GRP_ADDR ioctls and vCPU creation. Similarly, serialize the iodev creation with slots_lock, and the rest with config_lock. Fixes: f00327731131 ("KVM: arm64: Use config_lock to protect vgic state") Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20230518100914.2837292-2-jean-philippe@linaro.org
2023-05-18 11:09:15 +01:00
kvm_err("Unable to register VGIC dist MMIO regions\n");
goto out_slots;
}
/*
* kvm_io_bus_register_dev() guarantees all readers see the new MMIO
* registration before returning through synchronize_srcu(), which also
* implies a full memory barrier. As such, marking the distributor as
* 'ready' here is guaranteed to be ordered after all vCPUs having seen
* a completely configured distributor.
*/
dist->ready = true;
goto out_slots;
out:
mutex_unlock(&kvm->arch.config_lock);
out_slots:
if (ret)
KVM: arm64: Don't eagerly teardown the vgic on init error As there is very little ordering in the KVM API, userspace can instanciate a half-baked GIC (missing its memory map, for example) at almost any time. This means that, with the right timing, a thread running vcpu-0 can enter the kernel without a GIC configured and get a GIC created behind its back by another thread. Amusingly, it will pick up that GIC and start messing with the data structures without the GIC having been fully initialised. Similarly, a thread running vcpu-1 can enter the kernel, and try to init the GIC that was previously created. Since this GIC isn't properly configured (no memory map), it fails to correctly initialise. And that's the point where we decide to teardown the GIC, freeing all its resources. Behind vcpu-0's back. Things stop pretty abruptly, with a variety of symptoms. Clearly, this isn't good, we should be a bit more careful about this. It is obvious that this guest is not viable, as it is missing some important part of its configuration. So instead of trying to tear bits of it down, let's just mark it as *dead*. It means that any further interaction from userspace will result in -EIO. The memory will be released on the "normal" path, when userspace gives up. Cc: stable@vger.kernel.org Reported-by: Alexander Potapenko <glider@google.com> Reviewed-by: Oliver Upton <oliver.upton@linux.dev> Link: https://lore.kernel.org/r/20241009183603.3221824-1-maz@kernel.org Signed-off-by: Marc Zyngier <maz@kernel.org>
2024-10-09 19:36:03 +01:00
kvm_vm_dead(kvm);
mutex_unlock(&kvm->slots_lock);
return ret;
}
/* GENERIC PROBE */
void kvm_vgic_cpu_up(void)
{
enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
}
void kvm_vgic_cpu_down(void)
{
disable_percpu_irq(kvm_vgic_global_state.maint_irq);
}
static irqreturn_t vgic_maintenance_handler(int irq, void *data)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)data;
/*
* We cannot rely on the vgic maintenance interrupt to be
* delivered synchronously. This means we can only use it to
* exit the VM, and we perform the handling of EOIed
* interrupts on the exit path (see vgic_fold_lr_state).
*
* Of course, NV throws a wrench in this plan, and needs
* something special.
*/
if (vcpu && vgic_state_is_nested(vcpu))
vgic_v3_handle_nested_maint_irq(vcpu);
return IRQ_HANDLED;
}
static struct gic_kvm_info *gic_kvm_info;
void __init vgic_set_kvm_info(const struct gic_kvm_info *info)
{
BUG_ON(gic_kvm_info != NULL);
gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL);
if (gic_kvm_info)
*gic_kvm_info = *info;
}
/**
* kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
*
* For a specific CPU, initialize the GIC VE hardware.
*/
void kvm_vgic_init_cpu_hardware(void)
{
BUG_ON(preemptible());
/*
* We want to make sure the list registers start out clear so that we
* only have the program the used registers.
*/
if (kvm_vgic_global_state.type == VGIC_V2) {
vgic_v2_init_lrs();
} else if (kvm_vgic_global_state.type == VGIC_V3 ||
kvm_vgic_global_state.has_gcie_v3_compat) {
kvm_call_hyp(__vgic_v3_init_lrs);
}
}
/**
* kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
* according to the host GIC model. Accordingly calls either
* vgic_v2/v3_probe which registers the KVM_DEVICE that can be
* instantiated by a guest later on .
*/
int kvm_vgic_hyp_init(void)
{
bool has_mask;
int ret;
if (!gic_kvm_info)
return -ENODEV;
has_mask = !gic_kvm_info->no_maint_irq_mask;
if (has_mask && !gic_kvm_info->maint_irq) {
kvm_err("No vgic maintenance irq\n");
return -ENXIO;
}
/*
* If we get one of these oddball non-GICs, taint the kernel,
* as we have no idea of how they *really* behave.
*/
if (gic_kvm_info->no_hw_deactivation) {
kvm_info("Non-architectural vgic, tainting kernel\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
kvm_vgic_global_state.no_hw_deactivation = true;
}
switch (gic_kvm_info->type) {
case GIC_V2:
ret = vgic_v2_probe(gic_kvm_info);
break;
case GIC_V3:
ret = vgic_v3_probe(gic_kvm_info);
if (!ret) {
static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
kvm_info("GIC system register CPU interface enabled\n");
}
break;
case GIC_V5:
ret = vgic_v5_probe(gic_kvm_info);
break;
default:
ret = -ENODEV;
}
kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
kfree(gic_kvm_info);
gic_kvm_info = NULL;
if (ret)
return ret;
if (!has_mask && !kvm_vgic_global_state.maint_irq)
return 0;
ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
vgic_maintenance_handler,
"vgic", kvm_get_running_vcpus());
if (ret) {
kvm_err("Cannot register interrupt %d\n",
kvm_vgic_global_state.maint_irq);
return ret;
}
kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
return 0;
}