linux/drivers/xen/events/events_base.c

2313 lines
53 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Xen event channels
*
* Xen models interrupts with abstract event channels. Because each
* domain gets 1024 event channels, but NR_IRQ is not that large, we
* must dynamically map irqs<->event channels. The event channels
* interface with the rest of the kernel by defining a xen interrupt
* chip. When an event is received, it is mapped to an irq and sent
* through the normal interrupt processing path.
*
* There are four kinds of events which can be mapped to an event
* channel:
*
* 1. Inter-domain notifications. This includes all the virtual
* device events, since they're driven by front-ends in another domain
* (typically dom0).
* 2. VIRQs, typically used for timers. These are per-cpu events.
* 3. IPIs.
* 4. PIRQs - Hardware interrupts.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
#include <linux/linkage.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
mm: remove include/linux/bootmem.h Move remaining definitions and declarations from include/linux/bootmem.h into include/linux/memblock.h and remove the redundant header. The includes were replaced with the semantic patch below and then semi-automated removal of duplicated '#include <linux/memblock.h> @@ @@ - #include <linux/bootmem.h> + #include <linux/memblock.h> [sfr@canb.auug.org.au: dma-direct: fix up for the removal of linux/bootmem.h] Link: http://lkml.kernel.org/r/20181002185342.133d1680@canb.auug.org.au [sfr@canb.auug.org.au: powerpc: fix up for removal of linux/bootmem.h] Link: http://lkml.kernel.org/r/20181005161406.73ef8727@canb.auug.org.au [sfr@canb.auug.org.au: x86/kaslr, ACPI/NUMA: fix for linux/bootmem.h removal] Link: http://lkml.kernel.org/r/20181008190341.5e396491@canb.auug.org.au Link: http://lkml.kernel.org/r/1536927045-23536-30-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Zankel <chris@zankel.net> Cc: "David S. Miller" <davem@davemloft.net> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Greentime Hu <green.hu@gmail.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Guan Xuetao <gxt@pku.edu.cn> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <jejb@parisc-linux.org> Cc: Jonas Bonn <jonas@southpole.se> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Ley Foon Tan <lftan@altera.com> Cc: Mark Salter <msalter@redhat.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Matt Turner <mattst88@gmail.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Michal Simek <monstr@monstr.eu> Cc: Palmer Dabbelt <palmer@sifive.com> Cc: Paul Burton <paul.burton@mips.com> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: Richard Weinberger <richard@nod.at> Cc: Rich Felker <dalias@libc.org> Cc: Russell King <linux@armlinux.org.uk> Cc: Serge Semin <fancer.lancer@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Cc: Vineet Gupta <vgupta@synopsys.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-30 15:09:49 -07:00
#include <linux/memblock.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
#include <linux/slab.h>
#include <linux/irqnr.h>
#include <linux/pci.h>
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/cpuhotplug.h>
#include <linux/atomic.h>
#include <linux/ktime.h>
#ifdef CONFIG_X86
#include <asm/desc.h>
#include <asm/ptrace.h>
#include <asm/idtentry.h>
#include <asm/irq.h>
#include <asm/io_apic.h>
#include <asm/i8259.h>
#include <asm/xen/cpuid.h>
#include <asm/xen/pci.h>
#endif
#include <asm/sync_bitops.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <xen/hvm.h>
#include <xen/xen-ops.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/hvm/params.h>
#include <xen/interface/physdev.h>
#include <xen/interface/sched.h>
xen: Support 64-bit PV guest receiving NMIs This is based on a patch that Zhenzhong Duan had sent - which was missing some of the remaining pieces. The kernel has the logic to handle Xen-type-exceptions using the paravirt interface in the assembler code (see PARAVIRT_ADJUST_EXCEPTION_FRAME - pv_irq_ops.adjust_exception_frame and and INTERRUPT_RETURN - pv_cpu_ops.iret). That means the nmi handler (and other exception handlers) use the hypervisor iret. The other changes that would be neccessary for this would be to translate the NMI_VECTOR to one of the entries on the ipi_vector and make xen_send_IPI_mask_allbutself use different events. Fortunately for us commit 1db01b4903639fcfaec213701a494fe3fb2c490b (xen: Clean up apic ipi interface) implemented this and we piggyback on the cleanup such that the apic IPI interface will pass the right vector value for NMI. With this patch we can trigger NMIs within a PV guest (only tested x86_64). For this to work with normal PV guests (not initial domain) we need the domain to be able to use the APIC ops - they are already implemented to use the Xen event channels. For that to be turned on in a PV domU we need to remove the masking of X86_FEATURE_APIC. Incidentally that means kgdb will also now work within a PV guest without using the 'nokgdbroundup' workaround. Note that the 32-bit version is different and this patch does not enable that. CC: Lisa Nguyen <lisa@xenapiadmin.com> CC: Ben Guthro <benjamin.guthro@citrix.com> CC: Zhenzhong Duan <zhenzhong.duan@oracle.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> [v1: Fixed up per David Vrabel comments] Reviewed-by: Ben Guthro <benjamin.guthro@citrix.com> Reviewed-by: David Vrabel <david.vrabel@citrix.com>
2013-07-19 11:51:31 -04:00
#include <xen/interface/vcpu.h>
#include <xen/xenbus.h>
#include <asm/hw_irq.h>
#include "events_internal.h"
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "xen."
/* Interrupt types. */
enum xen_irq_type {
IRQT_UNBOUND = 0,
IRQT_PIRQ,
IRQT_VIRQ,
IRQT_IPI,
IRQT_EVTCHN
};
/*
* Packed IRQ information:
* type - enum xen_irq_type
* event channel - irq->event channel mapping
* cpu - cpu this event channel is bound to
* index - type-specific information:
* PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
* guest, or GSI (real passthrough IRQ) of the device.
* VIRQ - virq number
* IPI - IPI vector
* EVTCHN -
*/
struct irq_info {
struct list_head list;
struct list_head eoi_list;
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
struct rcu_work rwork;
short refcnt;
u8 spurious_cnt;
u8 is_accounted;
short type; /* type: IRQT_* */
u8 mask_reason; /* Why is event channel masked */
#define EVT_MASK_REASON_EXPLICIT 0x01
#define EVT_MASK_REASON_TEMPORARY 0x02
#define EVT_MASK_REASON_EOI_PENDING 0x04
u8 is_active; /* Is event just being handled? */
unsigned irq;
evtchn_port_t evtchn; /* event channel */
unsigned short cpu; /* cpu bound */
unsigned short eoi_cpu; /* EOI must happen on this cpu-1 */
unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */
u64 eoi_time; /* Time in jiffies when to EOI. */
raw_spinlock_t lock;
bool is_static; /* Is event channel static */
union {
unsigned short virq;
enum ipi_vector ipi;
struct {
unsigned short pirq;
unsigned short gsi;
unsigned char vector;
unsigned char flags;
uint16_t domid;
} pirq;
struct xenbus_device *interdomain;
} u;
};
#define PIRQ_NEEDS_EOI (1 << 0)
#define PIRQ_SHAREABLE (1 << 1)
#define PIRQ_MSI_GROUP (1 << 2)
static uint __read_mostly event_loop_timeout = 2;
module_param(event_loop_timeout, uint, 0644);
static uint __read_mostly event_eoi_delay = 10;
module_param(event_eoi_delay, uint, 0644);
const struct evtchn_ops *evtchn_ops;
/*
* This lock protects updates to the following mapping and reference-count
* arrays. The lock does not need to be acquired to read the mapping tables.
*/
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
static DEFINE_MUTEX(irq_mapping_update_lock);
/*
* Lock hierarchy:
*
* irq_mapping_update_lock
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
* IRQ-desc lock
* percpu eoi_list_lock
* irq_info->lock
*/
static LIST_HEAD(xen_irq_list_head);
/* IRQ <-> VIRQ mapping. */
static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */
static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... XEN_NR_IPIS-1] = 0};
/* Event channel distribution data */
static atomic_t channels_on_cpu[NR_CPUS];
static int **evtchn_to_irq;
#ifdef CONFIG_X86
static unsigned long *pirq_eoi_map;
#endif
static bool (*pirq_needs_eoi)(struct irq_info *info);
#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq)))
#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq)))
#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq))
/* Xen will never allocate port zero for any purpose. */
#define VALID_EVTCHN(chn) ((chn) != 0)
static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY];
static struct irq_chip xen_dynamic_chip;
static struct irq_chip xen_lateeoi_chip;
static struct irq_chip xen_percpu_chip;
static struct irq_chip xen_pirq_chip;
static void enable_dynirq(struct irq_data *data);
static DEFINE_PER_CPU(unsigned int, irq_epoch);
xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2021-08-12 13:09:27 +00:00
static void clear_evtchn_to_irq_row(int *evtchn_row)
{
unsigned col;
for (col = 0; col < EVTCHN_PER_ROW; col++)
xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2021-08-12 13:09:27 +00:00
WRITE_ONCE(evtchn_row[col], -1);
}
static void clear_evtchn_to_irq_all(void)
{
unsigned row;
for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) {
if (evtchn_to_irq[row] == NULL)
continue;
xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2021-08-12 13:09:27 +00:00
clear_evtchn_to_irq_row(evtchn_to_irq[row]);
}
}
static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq)
{
unsigned row;
unsigned col;
xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2021-08-12 13:09:27 +00:00
int *evtchn_row;
if (evtchn >= xen_evtchn_max_channels())
return -EINVAL;
row = EVTCHN_ROW(evtchn);
col = EVTCHN_COL(evtchn);
if (evtchn_to_irq[row] == NULL) {
/* Unallocated irq entries return -1 anyway */
if (irq == -1)
return 0;
xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2021-08-12 13:09:27 +00:00
evtchn_row = (int *) __get_free_pages(GFP_KERNEL, 0);
if (evtchn_row == NULL)
return -ENOMEM;
xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2021-08-12 13:09:27 +00:00
clear_evtchn_to_irq_row(evtchn_row);
/*
* We've prepared an empty row for the mapping. If a different
* thread was faster inserting it, we can drop ours.
*/
if (cmpxchg(&evtchn_to_irq[row], NULL, evtchn_row) != NULL)
free_page((unsigned long) evtchn_row);
}
WRITE_ONCE(evtchn_to_irq[row][col], irq);
return 0;
}
/* Get info for IRQ */
static struct irq_info *info_for_irq(unsigned irq)
{
if (irq < nr_legacy_irqs())
return legacy_info_ptrs[irq];
else
return irq_get_chip_data(irq);
}
static void set_info_for_irq(unsigned int irq, struct irq_info *info)
{
if (irq < nr_legacy_irqs())
legacy_info_ptrs[irq] = info;
else
irq_set_chip_data(irq, info);
}
static struct irq_info *evtchn_to_info(evtchn_port_t evtchn)
{
int irq;
if (evtchn >= xen_evtchn_max_channels())
return NULL;
if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL)
return NULL;
irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]);
return (irq < 0) ? NULL : info_for_irq(irq);
}
/* Per CPU channel accounting */
static void channels_on_cpu_dec(struct irq_info *info)
{
if (!info->is_accounted)
return;
info->is_accounted = 0;
if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
return;
WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1 , 0));
}
static void channels_on_cpu_inc(struct irq_info *info)
{
if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
return;
if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1,
INT_MAX)))
return;
info->is_accounted = 1;
}
static void xen_irq_free_desc(unsigned int irq)
{
/* Legacy IRQ descriptors are managed by the arch. */
if (irq >= nr_legacy_irqs())
irq_free_desc(irq);
}
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
static void delayed_free_irq(struct work_struct *work)
{
struct irq_info *info = container_of(to_rcu_work(work), struct irq_info,
rwork);
unsigned int irq = info->irq;
/* Remove the info pointer only now, with no potential users left. */
set_info_for_irq(irq, NULL);
kfree(info);
xen_irq_free_desc(irq);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
}
/* Constructors for packed IRQ information. */
static int xen_irq_info_common_setup(struct irq_info *info,
enum xen_irq_type type,
evtchn_port_t evtchn,
unsigned short cpu)
{
int ret;
BUG_ON(info->type != IRQT_UNBOUND && info->type != type);
info->type = type;
info->evtchn = evtchn;
info->cpu = cpu;
info->mask_reason = EVT_MASK_REASON_EXPLICIT;
raw_spin_lock_init(&info->lock);
ret = set_evtchn_to_irq(evtchn, info->irq);
if (ret < 0)
return ret;
irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN);
return xen_evtchn_port_setup(evtchn);
}
static int xen_irq_info_evtchn_setup(struct irq_info *info,
evtchn_port_t evtchn,
struct xenbus_device *dev)
{
int ret;
ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0);
info->u.interdomain = dev;
if (dev)
atomic_inc(&dev->event_channels);
return ret;
}
static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu,
evtchn_port_t evtchn, enum ipi_vector ipi)
{
info->u.ipi = ipi;
per_cpu(ipi_to_irq, cpu)[ipi] = info->irq;
per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn;
return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0);
}
static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu,
evtchn_port_t evtchn, unsigned int virq)
{
info->u.virq = virq;
per_cpu(virq_to_irq, cpu)[virq] = info->irq;
return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0);
}
static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn,
unsigned int pirq, unsigned int gsi,
uint16_t domid, unsigned char flags)
{
info->u.pirq.pirq = pirq;
info->u.pirq.gsi = gsi;
info->u.pirq.domid = domid;
info->u.pirq.flags = flags;
return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0);
}
static void xen_irq_info_cleanup(struct irq_info *info)
{
set_evtchn_to_irq(info->evtchn, -1);
xen_evtchn_port_remove(info->evtchn, info->cpu);
info->evtchn = 0;
channels_on_cpu_dec(info);
}
/*
* Accessors for packed IRQ information.
*/
static evtchn_port_t evtchn_from_irq(unsigned int irq)
{
const struct irq_info *info = NULL;
if (likely(irq < irq_get_nr_irqs()))
info = info_for_irq(irq);
if (!info)
return 0;
return info->evtchn;
}
unsigned int irq_from_evtchn(evtchn_port_t evtchn)
{
struct irq_info *info = evtchn_to_info(evtchn);
return info ? info->irq : -1;
}
EXPORT_SYMBOL_GPL(irq_from_evtchn);
int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq,
evtchn_port_t *evtchn)
{
int irq = per_cpu(virq_to_irq, cpu)[virq];
*evtchn = evtchn_from_irq(irq);
return irq;
}
static enum ipi_vector ipi_from_irq(struct irq_info *info)
{
BUG_ON(info == NULL);
BUG_ON(info->type != IRQT_IPI);
return info->u.ipi;
}
static unsigned int virq_from_irq(struct irq_info *info)
{
BUG_ON(info == NULL);
BUG_ON(info->type != IRQT_VIRQ);
return info->u.virq;
}
static unsigned int pirq_from_irq(struct irq_info *info)
{
BUG_ON(info == NULL);
BUG_ON(info->type != IRQT_PIRQ);
return info->u.pirq.pirq;
}
unsigned int cpu_from_evtchn(evtchn_port_t evtchn)
{
struct irq_info *info = evtchn_to_info(evtchn);
return info ? info->cpu : 0;
}
static void do_mask(struct irq_info *info, u8 reason)
{
unsigned long flags;
raw_spin_lock_irqsave(&info->lock, flags);
if (!info->mask_reason)
mask_evtchn(info->evtchn);
info->mask_reason |= reason;
raw_spin_unlock_irqrestore(&info->lock, flags);
}
static void do_unmask(struct irq_info *info, u8 reason)
{
unsigned long flags;
raw_spin_lock_irqsave(&info->lock, flags);
info->mask_reason &= ~reason;
if (!info->mask_reason)
unmask_evtchn(info->evtchn);
raw_spin_unlock_irqrestore(&info->lock, flags);
}
#ifdef CONFIG_X86
static bool pirq_check_eoi_map(struct irq_info *info)
{
return test_bit(pirq_from_irq(info), pirq_eoi_map);
}
#endif
static bool pirq_needs_eoi_flag(struct irq_info *info)
{
BUG_ON(info->type != IRQT_PIRQ);
return info->u.pirq.flags & PIRQ_NEEDS_EOI;
}
static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu,
bool force_affinity)
{
if (IS_ENABLED(CONFIG_SMP) && force_affinity) {
struct irq_data *data = irq_get_irq_data(info->irq);
irq_data_update_affinity(data, cpumask_of(cpu));
irq_data_update_effective_affinity(data, cpumask_of(cpu));
}
xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu);
channels_on_cpu_dec(info);
info->cpu = cpu;
channels_on_cpu_inc(info);
}
/**
* notify_remote_via_irq - send event to remote end of event channel via irq
* @irq: irq of event channel to send event to
*
* Unlike notify_remote_via_evtchn(), this is safe to use across
* save/restore. Notifications on a broken connection are silently
* dropped.
*/
void notify_remote_via_irq(int irq)
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn))
notify_remote_via_evtchn(evtchn);
}
EXPORT_SYMBOL_GPL(notify_remote_via_irq);
struct lateeoi_work {
struct delayed_work delayed;
spinlock_t eoi_list_lock;
struct list_head eoi_list;
};
static DEFINE_PER_CPU(struct lateeoi_work, lateeoi);
static void lateeoi_list_del(struct irq_info *info)
{
struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu);
unsigned long flags;
spin_lock_irqsave(&eoi->eoi_list_lock, flags);
list_del_init(&info->eoi_list);
spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
}
static void lateeoi_list_add(struct irq_info *info)
{
struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu);
struct irq_info *elem;
u64 now = get_jiffies_64();
unsigned long delay;
unsigned long flags;
if (now < info->eoi_time)
delay = info->eoi_time - now;
else
delay = 1;
spin_lock_irqsave(&eoi->eoi_list_lock, flags);
elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info,
eoi_list);
if (!elem || info->eoi_time < elem->eoi_time) {
list_add(&info->eoi_list, &eoi->eoi_list);
mod_delayed_work_on(info->eoi_cpu, system_wq,
&eoi->delayed, delay);
} else {
list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) {
if (elem->eoi_time <= info->eoi_time)
break;
}
list_add(&info->eoi_list, &elem->eoi_list);
}
spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
}
static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious)
{
evtchn_port_t evtchn;
unsigned int cpu;
unsigned int delay = 0;
evtchn = info->evtchn;
if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list))
return;
if (spurious) {
struct xenbus_device *dev = info->u.interdomain;
unsigned int threshold = 1;
if (dev && dev->spurious_threshold)
threshold = dev->spurious_threshold;
if ((1 << info->spurious_cnt) < (HZ << 2)) {
if (info->spurious_cnt != 0xFF)
info->spurious_cnt++;
}
if (info->spurious_cnt > threshold) {
delay = 1 << (info->spurious_cnt - 1 - threshold);
if (delay > HZ)
delay = HZ;
if (!info->eoi_time)
info->eoi_cpu = smp_processor_id();
info->eoi_time = get_jiffies_64() + delay;
if (dev)
atomic_add(delay, &dev->jiffies_eoi_delayed);
}
if (dev)
atomic_inc(&dev->spurious_events);
} else {
info->spurious_cnt = 0;
}
cpu = info->eoi_cpu;
if (info->eoi_time &&
(info->irq_epoch == per_cpu(irq_epoch, cpu) || delay)) {
lateeoi_list_add(info);
return;
}
info->eoi_time = 0;
/* is_active hasn't been reset yet, do it now. */
smp_store_release(&info->is_active, 0);
do_unmask(info, EVT_MASK_REASON_EOI_PENDING);
}
static void xen_irq_lateeoi_worker(struct work_struct *work)
{
struct lateeoi_work *eoi;
struct irq_info *info;
u64 now = get_jiffies_64();
unsigned long flags;
eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
rcu_read_lock();
while (true) {
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
spin_lock_irqsave(&eoi->eoi_list_lock, flags);
info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info,
eoi_list);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
if (info == NULL)
break;
if (now < info->eoi_time) {
mod_delayed_work_on(info->eoi_cpu, system_wq,
&eoi->delayed,
info->eoi_time - now);
break;
}
list_del_init(&info->eoi_list);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
info->eoi_time = 0;
xen_irq_lateeoi_locked(info, false);
}
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
rcu_read_unlock();
}
static void xen_cpu_init_eoi(unsigned int cpu)
{
struct lateeoi_work *eoi = &per_cpu(lateeoi, cpu);
INIT_DELAYED_WORK(&eoi->delayed, xen_irq_lateeoi_worker);
spin_lock_init(&eoi->eoi_list_lock);
INIT_LIST_HEAD(&eoi->eoi_list);
}
void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags)
{
struct irq_info *info;
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
rcu_read_lock();
info = info_for_irq(irq);
if (info)
xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(xen_irq_lateeoi);
static struct irq_info *xen_irq_init(unsigned int irq)
{
struct irq_info *info;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (info) {
info->irq = irq;
info->type = IRQT_UNBOUND;
info->refcnt = -1;
INIT_RCU_WORK(&info->rwork, delayed_free_irq);
set_info_for_irq(irq, info);
INIT_LIST_HEAD(&info->eoi_list);
list_add_tail(&info->list, &xen_irq_list_head);
}
return info;
}
static struct irq_info *xen_allocate_irq_dynamic(void)
{
int irq = irq_alloc_desc_from(0, -1);
struct irq_info *info = NULL;
xen: Find an unbound irq number in reverse order (high to low). In earlier Xen Linux kernels, the IRQ mapping was a straight 1:1 and the find_unbound_irq started looking around 256 for open IRQs and up. IRQs from 0 to 255 were reserved for PCI devices. Previous to this patch, the 'find_unbound_irq' started looking at get_nr_hw_irqs() number. For privileged domain where the ACPI information is available that returns the upper-bound of what the GSIs. For non-privileged PV domains, where ACPI is no-existent the get_nr_hw_irqs() reports the IRQ_LEGACY (16). With PCI passthrough enabled, and with PCI cards that have IRQs pinned to a higher number than 16 we collide with previously allocated IRQs. Specifically the PCI IRQs collide with the IPI's for Xen functions (as they are allocated earlier). For example: 00:00.11 USB Controller: ATI Technologies Inc SB700 USB OHCI1 Controller (prog-if 10 [OHCI]) ... Interrupt: pin A routed to IRQ 18 [root@localhost ~]# cat /proc/interrupts | head CPU0 CPU1 CPU2 16: 38186 0 0 xen-dyn-virq timer0 17: 149 0 0 xen-dyn-ipi spinlock0 18: 962 0 0 xen-dyn-ipi resched0 and when the USB controller is loaded, the kernel reports: IRQ handler type mismatch for IRQ 18 current handler: resched0 One way to fix this is to reverse the logic when looking for un-used IRQ numbers and start with the highest available number. With that, we would get: CPU0 CPU1 CPU2 ... snip .. 292: 35 0 0 xen-dyn-ipi callfunc0 293: 3992 0 0 xen-dyn-ipi resched0 294: 224 0 0 xen-dyn-ipi spinlock0 295: 57183 0 0 xen-dyn-virq timer0 NMI: 0 0 0 Non-maskable interrupts .. snip .. And interrupts for PCI cards are now accessible. This patch also includes the fix, found by Ian Campbell, titled "xen: fix off-by-one error in find_unbound_irq." [v2: Added an explanation in the code] [v3: Rebased on top of tip/irq/core] Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
2010-10-18 10:49:10 -04:00
if (irq >= 0) {
info = xen_irq_init(irq);
if (!info)
xen_irq_free_desc(irq);
}
return info;
}
static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi)
{
int irq;
struct irq_info *info;
xen: events: allocate GSIs and dynamic IRQs from separate IRQ ranges. There are three cases which we need to care about, PV guest, PV domain 0 and HVM guest. The PV guest case is simple since it has no access to ACPI or real APICs and therefore has no GSIs therefore we simply dynamically allocate all IRQs. The potentially interesting case here is PIRQ type event channels associated with passed through PCI devices. However even in this case the guest has no direct interaction with the physical GSI since that happens in the PCI backend. The PV domain 0 and HVM guest cases are actually the same. In domain 0 case the kernel sees the host ACPI and GSIs (although it only sees the APIC indirectly via the hypervisor) and in the HVM guest case it sees the virtualised ACPI and emulated APICs. In these cases we start allocating dynamic IRQs at nr_irqs_gsi so that they cannot clash with any GSI. Currently xen_allocate_irq_dynamic starts at nr_irqs and works backwards looking for a free IRQ in order to (try and) avoid clashing with GSIs used in domain 0 and in HVM guests. This change avoids that although we retain the behaviour of allowing dynamic IRQs to encroach on the GSI range if no suitable IRQs are available since a future IRQ clash is deemed preferable to failure right now. Signed-off-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org>
2011-03-03 11:57:44 -05:00
/*
* A PV guest has no concept of a GSI (since it has no ACPI
* nor access to/knowledge of the physical APICs). Therefore
* all IRQs are dynamically allocated from the entire IRQ
* space.
*/
if (xen_pv_domain() && !xen_initial_domain())
return xen_allocate_irq_dynamic();
/* Legacy IRQ descriptors are already allocated by the arch. */
if (gsi < nr_legacy_irqs())
irq = gsi;
else
irq = irq_alloc_desc_at(gsi, -1);
info = xen_irq_init(irq);
if (!info)
xen_irq_free_desc(irq);
return info;
}
static void xen_free_irq(struct irq_info *info)
{
if (WARN_ON(!info))
return;
if (!list_empty(&info->eoi_list))
lateeoi_list_del(info);
list_del(&info->list);
WARN_ON(info->refcnt > 0);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
queue_rcu_work(system_wq, &info->rwork);
}
/* Not called for lateeoi events. */
static void event_handler_exit(struct irq_info *info)
{
smp_store_release(&info->is_active, 0);
clear_evtchn(info->evtchn);
}
static void pirq_query_unmask(struct irq_info *info)
{
struct physdev_irq_status_query irq_status;
irq_status.irq = pirq_from_irq(info);
if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
irq_status.flags = 0;
info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
if (irq_status.flags & XENIRQSTAT_needs_eoi)
info->u.pirq.flags |= PIRQ_NEEDS_EOI;
}
static void do_eoi_pirq(struct irq_info *info)
{
struct physdev_eoi eoi = { .irq = pirq_from_irq(info) };
int rc = 0;
if (!VALID_EVTCHN(info->evtchn))
return;
event_handler_exit(info);
if (pirq_needs_eoi(info)) {
rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
WARN_ON(rc);
}
}
static void eoi_pirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
do_eoi_pirq(info);
}
static void do_disable_dynirq(struct irq_info *info)
{
if (VALID_EVTCHN(info->evtchn))
do_mask(info, EVT_MASK_REASON_EXPLICIT);
}
static void disable_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
if (info)
do_disable_dynirq(info);
}
static void mask_ack_pirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
if (info) {
do_disable_dynirq(info);
do_eoi_pirq(info);
}
}
static unsigned int __startup_pirq(struct irq_info *info)
{
struct evtchn_bind_pirq bind_pirq;
evtchn_port_t evtchn = info->evtchn;
int rc;
if (VALID_EVTCHN(evtchn))
goto out;
bind_pirq.pirq = pirq_from_irq(info);
/* NB. We are happy to share unless we are probing. */
bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
BIND_PIRQ__WILL_SHARE : 0;
rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
if (rc != 0) {
pr_warn("Failed to obtain physical IRQ %d\n", info->irq);
return 0;
}
evtchn = bind_pirq.port;
pirq_query_unmask(info);
rc = set_evtchn_to_irq(evtchn, info->irq);
if (rc)
goto err;
info->evtchn = evtchn;
bind_evtchn_to_cpu(info, 0, false);
rc = xen_evtchn_port_setup(evtchn);
if (rc)
goto err;
out:
do_unmask(info, EVT_MASK_REASON_EXPLICIT);
do_eoi_pirq(info);
return 0;
err:
pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq,
rc);
xen_evtchn_close(evtchn);
return 0;
}
static unsigned int startup_pirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
return __startup_pirq(info);
}
static void shutdown_pirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
evtchn_port_t evtchn = info->evtchn;
BUG_ON(info->type != IRQT_PIRQ);
if (!VALID_EVTCHN(evtchn))
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
xen_irq_info_cleanup(info);
xen/events: close evtchn after mapping cleanup shutdown_pirq and startup_pirq are not taking the irq_mapping_update_lock because they can't due to lock inversion. Both are called with the irq_desc->lock being taking. The lock order, however, is first irq_mapping_update_lock and then irq_desc->lock. This opens multiple races: - shutdown_pirq can be interrupted by a function that allocates an event channel: CPU0 CPU1 shutdown_pirq { xen_evtchn_close(e) __startup_pirq { EVTCHNOP_bind_pirq -> returns just freed evtchn e set_evtchn_to_irq(e, irq) } xen_irq_info_cleanup() { set_evtchn_to_irq(e, -1) } } Assume here event channel e refers here to the same event channel number. After this race the evtchn_to_irq mapping for e is invalid (-1). - __startup_pirq races with __unbind_from_irq in a similar way. Because __startup_pirq doesn't take irq_mapping_update_lock it can grab the evtchn that __unbind_from_irq is currently freeing and cleaning up. In this case even though the event channel is allocated, its mapping can be unset in evtchn_to_irq. The fix is to first cleanup the mappings and then close the event channel. In this way, when an event channel gets allocated it's potential previous evtchn_to_irq mappings are guaranteed to be unset already. This is also the reverse order of the allocation where first the event channel is allocated and then the mappings are setup. On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces"), we hit a BUG like the following during probing of NVMe devices. The issue is that during nvme_setup_io_queues, pci_free_irq is called for every device which results in a call to shutdown_pirq. With many nvme devices it's therefore likely to hit this race during boot because there will be multiple calls to shutdown_pirq and startup_pirq are running potentially in parallel. ------------[ cut here ]------------ blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled kernel BUG at drivers/xen/events/events_base.c:499! invalid opcode: 0000 [#1] SMP PTI CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1 Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006 Workqueue: nvme-reset-wq nvme_reset_work RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0 Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_trace_log_lvl+0x1c1/0x2d9 ? show_trace_log_lvl+0x1c1/0x2d9 ? set_affinity_irq+0xdc/0x1c0 ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0x90/0x110 ? bind_evtchn_to_cpu+0xdf/0xf0 ? do_error_trap+0x65/0x80 ? bind_evtchn_to_cpu+0xdf/0xf0 ? exc_invalid_op+0x4e/0x70 ? bind_evtchn_to_cpu+0xdf/0xf0 ? asm_exc_invalid_op+0x12/0x20 ? bind_evtchn_to_cpu+0xdf/0xf0 ? bind_evtchn_to_cpu+0xc5/0xf0 set_affinity_irq+0xdc/0x1c0 irq_do_set_affinity+0x1d7/0x1f0 irq_setup_affinity+0xd6/0x1a0 irq_startup+0x8a/0xf0 __setup_irq+0x639/0x6d0 ? nvme_suspend+0x150/0x150 request_threaded_irq+0x10c/0x180 ? nvme_suspend+0x150/0x150 pci_request_irq+0xa8/0xf0 ? __blk_mq_free_request+0x74/0xa0 queue_request_irq+0x6f/0x80 nvme_create_queue+0x1af/0x200 nvme_create_io_queues+0xbd/0xf0 nvme_setup_io_queues+0x246/0x320 ? nvme_irq_check+0x30/0x30 nvme_reset_work+0x1c8/0x400 process_one_work+0x1b0/0x350 worker_thread+0x49/0x310 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace a11715de1eee1873 ]--- Fixes: d46a78b05c0e ("xen: implement pirq type event channels") Cc: stable@vger.kernel.org Co-debugged-by: Andrew Panyakin <apanyaki@amazon.com> Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de Signed-off-by: Juergen Gross <jgross@suse.com>
2024-01-24 16:31:28 +00:00
xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
{
enable_dynirq(data);
}
static void disable_pirq(struct irq_data *data)
{
disable_dynirq(data);
}
int xen_irq_from_gsi(unsigned gsi)
{
struct irq_info *info;
list_for_each_entry(info, &xen_irq_list_head, list) {
if (info->type != IRQT_PIRQ)
continue;
if (info->u.pirq.gsi == gsi)
return info->irq;
}
return -1;
}
EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
xen/events: close evtchn after mapping cleanup shutdown_pirq and startup_pirq are not taking the irq_mapping_update_lock because they can't due to lock inversion. Both are called with the irq_desc->lock being taking. The lock order, however, is first irq_mapping_update_lock and then irq_desc->lock. This opens multiple races: - shutdown_pirq can be interrupted by a function that allocates an event channel: CPU0 CPU1 shutdown_pirq { xen_evtchn_close(e) __startup_pirq { EVTCHNOP_bind_pirq -> returns just freed evtchn e set_evtchn_to_irq(e, irq) } xen_irq_info_cleanup() { set_evtchn_to_irq(e, -1) } } Assume here event channel e refers here to the same event channel number. After this race the evtchn_to_irq mapping for e is invalid (-1). - __startup_pirq races with __unbind_from_irq in a similar way. Because __startup_pirq doesn't take irq_mapping_update_lock it can grab the evtchn that __unbind_from_irq is currently freeing and cleaning up. In this case even though the event channel is allocated, its mapping can be unset in evtchn_to_irq. The fix is to first cleanup the mappings and then close the event channel. In this way, when an event channel gets allocated it's potential previous evtchn_to_irq mappings are guaranteed to be unset already. This is also the reverse order of the allocation where first the event channel is allocated and then the mappings are setup. On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces"), we hit a BUG like the following during probing of NVMe devices. The issue is that during nvme_setup_io_queues, pci_free_irq is called for every device which results in a call to shutdown_pirq. With many nvme devices it's therefore likely to hit this race during boot because there will be multiple calls to shutdown_pirq and startup_pirq are running potentially in parallel. ------------[ cut here ]------------ blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled kernel BUG at drivers/xen/events/events_base.c:499! invalid opcode: 0000 [#1] SMP PTI CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1 Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006 Workqueue: nvme-reset-wq nvme_reset_work RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0 Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_trace_log_lvl+0x1c1/0x2d9 ? show_trace_log_lvl+0x1c1/0x2d9 ? set_affinity_irq+0xdc/0x1c0 ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0x90/0x110 ? bind_evtchn_to_cpu+0xdf/0xf0 ? do_error_trap+0x65/0x80 ? bind_evtchn_to_cpu+0xdf/0xf0 ? exc_invalid_op+0x4e/0x70 ? bind_evtchn_to_cpu+0xdf/0xf0 ? asm_exc_invalid_op+0x12/0x20 ? bind_evtchn_to_cpu+0xdf/0xf0 ? bind_evtchn_to_cpu+0xc5/0xf0 set_affinity_irq+0xdc/0x1c0 irq_do_set_affinity+0x1d7/0x1f0 irq_setup_affinity+0xd6/0x1a0 irq_startup+0x8a/0xf0 __setup_irq+0x639/0x6d0 ? nvme_suspend+0x150/0x150 request_threaded_irq+0x10c/0x180 ? nvme_suspend+0x150/0x150 pci_request_irq+0xa8/0xf0 ? __blk_mq_free_request+0x74/0xa0 queue_request_irq+0x6f/0x80 nvme_create_queue+0x1af/0x200 nvme_create_io_queues+0xbd/0xf0 nvme_setup_io_queues+0x246/0x320 ? nvme_irq_check+0x30/0x30 nvme_reset_work+0x1c8/0x400 process_one_work+0x1b0/0x350 worker_thread+0x49/0x310 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace a11715de1eee1873 ]--- Fixes: d46a78b05c0e ("xen: implement pirq type event channels") Cc: stable@vger.kernel.org Co-debugged-by: Andrew Panyakin <apanyaki@amazon.com> Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de Signed-off-by: Juergen Gross <jgross@suse.com>
2024-01-24 16:31:28 +00:00
bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
return;
}
if (info->refcnt > 0) {
info->refcnt--;
if (info->refcnt != 0)
return;
}
evtchn = info->evtchn;
if (VALID_EVTCHN(evtchn)) {
unsigned int cpu = info->cpu;
struct xenbus_device *dev;
if (!info->is_static)
xen/events: close evtchn after mapping cleanup shutdown_pirq and startup_pirq are not taking the irq_mapping_update_lock because they can't due to lock inversion. Both are called with the irq_desc->lock being taking. The lock order, however, is first irq_mapping_update_lock and then irq_desc->lock. This opens multiple races: - shutdown_pirq can be interrupted by a function that allocates an event channel: CPU0 CPU1 shutdown_pirq { xen_evtchn_close(e) __startup_pirq { EVTCHNOP_bind_pirq -> returns just freed evtchn e set_evtchn_to_irq(e, irq) } xen_irq_info_cleanup() { set_evtchn_to_irq(e, -1) } } Assume here event channel e refers here to the same event channel number. After this race the evtchn_to_irq mapping for e is invalid (-1). - __startup_pirq races with __unbind_from_irq in a similar way. Because __startup_pirq doesn't take irq_mapping_update_lock it can grab the evtchn that __unbind_from_irq is currently freeing and cleaning up. In this case even though the event channel is allocated, its mapping can be unset in evtchn_to_irq. The fix is to first cleanup the mappings and then close the event channel. In this way, when an event channel gets allocated it's potential previous evtchn_to_irq mappings are guaranteed to be unset already. This is also the reverse order of the allocation where first the event channel is allocated and then the mappings are setup. On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces"), we hit a BUG like the following during probing of NVMe devices. The issue is that during nvme_setup_io_queues, pci_free_irq is called for every device which results in a call to shutdown_pirq. With many nvme devices it's therefore likely to hit this race during boot because there will be multiple calls to shutdown_pirq and startup_pirq are running potentially in parallel. ------------[ cut here ]------------ blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled kernel BUG at drivers/xen/events/events_base.c:499! invalid opcode: 0000 [#1] SMP PTI CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1 Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006 Workqueue: nvme-reset-wq nvme_reset_work RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0 Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_trace_log_lvl+0x1c1/0x2d9 ? show_trace_log_lvl+0x1c1/0x2d9 ? set_affinity_irq+0xdc/0x1c0 ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0x90/0x110 ? bind_evtchn_to_cpu+0xdf/0xf0 ? do_error_trap+0x65/0x80 ? bind_evtchn_to_cpu+0xdf/0xf0 ? exc_invalid_op+0x4e/0x70 ? bind_evtchn_to_cpu+0xdf/0xf0 ? asm_exc_invalid_op+0x12/0x20 ? bind_evtchn_to_cpu+0xdf/0xf0 ? bind_evtchn_to_cpu+0xc5/0xf0 set_affinity_irq+0xdc/0x1c0 irq_do_set_affinity+0x1d7/0x1f0 irq_setup_affinity+0xd6/0x1a0 irq_startup+0x8a/0xf0 __setup_irq+0x639/0x6d0 ? nvme_suspend+0x150/0x150 request_threaded_irq+0x10c/0x180 ? nvme_suspend+0x150/0x150 pci_request_irq+0xa8/0xf0 ? __blk_mq_free_request+0x74/0xa0 queue_request_irq+0x6f/0x80 nvme_create_queue+0x1af/0x200 nvme_create_io_queues+0xbd/0xf0 nvme_setup_io_queues+0x246/0x320 ? nvme_irq_check+0x30/0x30 nvme_reset_work+0x1c8/0x400 process_one_work+0x1b0/0x350 worker_thread+0x49/0x310 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace a11715de1eee1873 ]--- Fixes: d46a78b05c0e ("xen: implement pirq type event channels") Cc: stable@vger.kernel.org Co-debugged-by: Andrew Panyakin <apanyaki@amazon.com> Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de Signed-off-by: Juergen Gross <jgross@suse.com>
2024-01-24 16:31:28 +00:00
close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1;
break;
case IRQT_IPI:
per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1;
per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0;
break;
case IRQT_EVTCHN:
dev = info->u.interdomain;
if (dev)
atomic_dec(&dev->event_channels);
break;
default:
break;
}
xen_irq_info_cleanup(info);
xen/events: close evtchn after mapping cleanup shutdown_pirq and startup_pirq are not taking the irq_mapping_update_lock because they can't due to lock inversion. Both are called with the irq_desc->lock being taking. The lock order, however, is first irq_mapping_update_lock and then irq_desc->lock. This opens multiple races: - shutdown_pirq can be interrupted by a function that allocates an event channel: CPU0 CPU1 shutdown_pirq { xen_evtchn_close(e) __startup_pirq { EVTCHNOP_bind_pirq -> returns just freed evtchn e set_evtchn_to_irq(e, irq) } xen_irq_info_cleanup() { set_evtchn_to_irq(e, -1) } } Assume here event channel e refers here to the same event channel number. After this race the evtchn_to_irq mapping for e is invalid (-1). - __startup_pirq races with __unbind_from_irq in a similar way. Because __startup_pirq doesn't take irq_mapping_update_lock it can grab the evtchn that __unbind_from_irq is currently freeing and cleaning up. In this case even though the event channel is allocated, its mapping can be unset in evtchn_to_irq. The fix is to first cleanup the mappings and then close the event channel. In this way, when an event channel gets allocated it's potential previous evtchn_to_irq mappings are guaranteed to be unset already. This is also the reverse order of the allocation where first the event channel is allocated and then the mappings are setup. On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces"), we hit a BUG like the following during probing of NVMe devices. The issue is that during nvme_setup_io_queues, pci_free_irq is called for every device which results in a call to shutdown_pirq. With many nvme devices it's therefore likely to hit this race during boot because there will be multiple calls to shutdown_pirq and startup_pirq are running potentially in parallel. ------------[ cut here ]------------ blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled kernel BUG at drivers/xen/events/events_base.c:499! invalid opcode: 0000 [#1] SMP PTI CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1 Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006 Workqueue: nvme-reset-wq nvme_reset_work RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0 Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00 RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002 FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_trace_log_lvl+0x1c1/0x2d9 ? show_trace_log_lvl+0x1c1/0x2d9 ? set_affinity_irq+0xdc/0x1c0 ? __die_body.cold+0x8/0xd ? die+0x2b/0x50 ? do_trap+0x90/0x110 ? bind_evtchn_to_cpu+0xdf/0xf0 ? do_error_trap+0x65/0x80 ? bind_evtchn_to_cpu+0xdf/0xf0 ? exc_invalid_op+0x4e/0x70 ? bind_evtchn_to_cpu+0xdf/0xf0 ? asm_exc_invalid_op+0x12/0x20 ? bind_evtchn_to_cpu+0xdf/0xf0 ? bind_evtchn_to_cpu+0xc5/0xf0 set_affinity_irq+0xdc/0x1c0 irq_do_set_affinity+0x1d7/0x1f0 irq_setup_affinity+0xd6/0x1a0 irq_startup+0x8a/0xf0 __setup_irq+0x639/0x6d0 ? nvme_suspend+0x150/0x150 request_threaded_irq+0x10c/0x180 ? nvme_suspend+0x150/0x150 pci_request_irq+0xa8/0xf0 ? __blk_mq_free_request+0x74/0xa0 queue_request_irq+0x6f/0x80 nvme_create_queue+0x1af/0x200 nvme_create_io_queues+0xbd/0xf0 nvme_setup_io_queues+0x246/0x320 ? nvme_irq_check+0x30/0x30 nvme_reset_work+0x1c8/0x400 process_one_work+0x1b0/0x350 worker_thread+0x49/0x310 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 Modules linked in: ---[ end trace a11715de1eee1873 ]--- Fixes: d46a78b05c0e ("xen: implement pirq type event channels") Cc: stable@vger.kernel.org Co-debugged-by: Andrew Panyakin <apanyaki@amazon.com> Signed-off-by: Maximilian Heyne <mheyne@amazon.de> Reviewed-by: Juergen Gross <jgross@suse.com> Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de Signed-off-by: Juergen Gross <jgross@suse.com>
2024-01-24 16:31:28 +00:00
if (close_evtchn)
xen_evtchn_close(evtchn);
}
xen_free_irq(info);
}
/*
* Do not make any assumptions regarding the relationship between the
* IRQ number returned here and the Xen pirq argument.
*
* Note: We don't assign an event channel until the irq actually started
* up. Return an existing irq if we've already got one for the gsi.
*
* Shareable implies level triggered, not shareable implies edge
* triggered here.
*/
int xen_bind_pirq_gsi_to_irq(unsigned gsi,
unsigned pirq, int shareable, char *name)
{
struct irq_info *info;
struct physdev_irq irq_op;
int ret;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
ret = xen_irq_from_gsi(gsi);
if (ret != -1) {
pr_info("%s: returning irq %d for gsi %u\n",
__func__, ret, gsi);
goto out;
}
info = xen_allocate_irq_gsi(gsi);
if (!info)
goto out;
irq_op.irq = info->irq;
irq_op.vector = 0;
/* Only the privileged domain can do this. For non-priv, the pcifront
* driver provides a PCI bus that does the call to do exactly
* this in the priv domain. */
if (xen_initial_domain() &&
HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
xen_free_irq(info);
ret = -ENOSPC;
goto out;
}
ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF,
shareable ? PIRQ_SHAREABLE : 0);
if (ret < 0) {
__unbind_from_irq(info, info->irq);
goto out;
}
pirq_query_unmask(info);
/* We try to use the handler with the appropriate semantic for the
* type of interrupt: if the interrupt is an edge triggered
* interrupt we use handle_edge_irq.
*
* On the other hand if the interrupt is level triggered we use
* handle_fasteoi_irq like the native code does for this kind of
* interrupts.
*
* Depending on the Xen version, pirq_needs_eoi might return true
* not only for level triggered interrupts but for edge triggered
* interrupts too. In any case Xen always honors the eoi mechanism,
* not injecting any more pirqs of the same kind if the first one
* hasn't received an eoi yet. Therefore using the fasteoi handler
* is the right choice either way.
*/
if (shareable)
irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip,
handle_fasteoi_irq, name);
else
irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip,
handle_edge_irq, name);
ret = info->irq;
out:
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return ret;
}
#ifdef CONFIG_PCI_MSI
int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
{
int rc;
struct physdev_get_free_pirq op_get_free_pirq;
op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
WARN_ONCE(rc == -ENOSYS,
"hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
return rc ? -1 : op_get_free_pirq.pirq;
}
int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
int pirq, int nvec, const char *name, domid_t domid)
{
int i, irq, ret;
struct irq_info *info;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
irq = irq_alloc_descs(-1, 0, nvec, -1);
if (irq < 0)
goto out;
for (i = 0; i < nvec; i++) {
info = xen_irq_init(irq + i);
if (!info) {
ret = -ENOMEM;
goto error_irq;
}
irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name);
ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid,
i == 0 ? 0 : PIRQ_MSI_GROUP);
if (ret < 0)
goto error_irq;
}
ret = irq_set_msi_desc(irq, msidesc);
if (ret < 0)
goto error_irq;
out:
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return irq;
error_irq:
while (nvec--) {
info = info_for_irq(irq + nvec);
__unbind_from_irq(info, irq + nvec);
}
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return ret;
}
#endif
int xen_destroy_irq(int irq)
{
struct physdev_unmap_pirq unmap_irq;
struct irq_info *info = info_for_irq(irq);
int rc = -ENOENT;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
/*
* If trying to remove a vector in a MSI group different
* than the first one skip the PIRQ unmap unless this vector
* is the first one in the group.
*/
if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) {
unmap_irq.pirq = info->u.pirq.pirq;
unmap_irq.domid = info->u.pirq.domid;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
/* If another domain quits without making the pci_disable_msix
* call, the Xen hypervisor takes care of freeing the PIRQs
* (free_domain_pirqs).
*/
if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF))
pr_info("domain %d does not have %d anymore\n",
info->u.pirq.domid, info->u.pirq.pirq);
else if (rc) {
pr_warn("unmap irq failed %d\n", rc);
goto out;
}
}
xen_free_irq(info);
out:
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return rc;
}
int xen_pirq_from_irq(unsigned irq)
{
struct irq_info *info = info_for_irq(irq);
return pirq_from_irq(info);
}
EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
struct xenbus_device *dev, bool shared)
{
int ret = -ENOMEM;
struct irq_info *info;
if (evtchn >= xen_evtchn_max_channels())
return -ENOMEM;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
info = evtchn_to_info(evtchn);
if (!info) {
info = xen_allocate_irq_dynamic();
if (!info)
goto out;
irq_set_chip_and_handler_name(info->irq, chip,
handle_edge_irq, "event");
ret = xen_irq_info_evtchn_setup(info, evtchn, dev);
if (ret < 0) {
__unbind_from_irq(info, info->irq);
goto out;
}
/*
* New interdomain events are initially bound to vCPU0 This
* is required to setup the event channel in the first
* place and also important for UP guests because the
* affinity setting is not invoked on them so nothing would
* bind the channel.
*/
bind_evtchn_to_cpu(info, 0, false);
} else if (!WARN_ON(info->type != IRQT_EVTCHN)) {
if (shared && !WARN_ON(info->refcnt < 0))
info->refcnt++;
}
ret = info->irq;
out:
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return ret;
}
int bind_evtchn_to_irq(evtchn_port_t evtchn)
{
return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false);
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
{
return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false);
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
evtchn_port_t evtchn;
struct irq_info *info;
int ret;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
ret = per_cpu(ipi_to_irq, cpu)[ipi];
if (ret == -1) {
info = xen_allocate_irq_dynamic();
if (!info)
goto out;
irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip,
handle_percpu_irq, "ipi");
bind_ipi.vcpu = xen_vcpu_nr(cpu);
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
&bind_ipi) != 0)
BUG();
evtchn = bind_ipi.port;
ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi);
if (ret < 0) {
__unbind_from_irq(info, info->irq);
goto out;
}
/*
* Force the affinity mask to the target CPU so proc shows
* the correct target.
*/
bind_evtchn_to_cpu(info, cpu, true);
ret = info->irq;
} else {
info = info_for_irq(ret);
WARN_ON(info == NULL || info->type != IRQT_IPI);
}
out:
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return ret;
}
static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev,
evtchn_port_t remote_port,
struct irq_chip *chip,
bool shared)
{
struct evtchn_bind_interdomain bind_interdomain;
int err;
bind_interdomain.remote_dom = dev->otherend_id;
bind_interdomain.remote_port = remote_port;
err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
&bind_interdomain);
return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port,
chip, dev, shared);
}
int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev,
evtchn_port_t remote_port)
{
return bind_interdomain_evtchn_to_irq_chip(dev, remote_port,
&xen_lateeoi_chip, false);
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi);
static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn)
{
struct evtchn_status status;
evtchn_port_t port;
int rc = -ENOENT;
memset(&status, 0, sizeof(status));
for (port = 0; port < xen_evtchn_max_channels(); port++) {
status.dom = DOMID_SELF;
status.port = port;
rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
if (rc < 0)
continue;
if (status.status != EVTCHNSTAT_virq)
continue;
if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) {
*evtchn = port;
break;
}
}
return rc;
}
/**
* xen_evtchn_nr_channels - number of usable event channel ports
*
* This may be less than the maximum supported by the current
* hypervisor ABI. Use xen_evtchn_max_channels() for the maximum
* supported.
*/
unsigned xen_evtchn_nr_channels(void)
{
return evtchn_ops->nr_channels();
}
EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels);
int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
{
struct evtchn_bind_virq bind_virq;
evtchn_port_t evtchn = 0;
struct irq_info *info;
int ret;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
ret = per_cpu(virq_to_irq, cpu)[virq];
if (ret == -1) {
info = xen_allocate_irq_dynamic();
if (!info)
goto out;
if (percpu)
irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip,
handle_percpu_irq, "virq");
else
irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip,
handle_edge_irq, "virq");
bind_virq.virq = virq;
bind_virq.vcpu = xen_vcpu_nr(cpu);
ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
&bind_virq);
if (ret == 0)
evtchn = bind_virq.port;
else {
if (ret == -EEXIST)
ret = find_virq(virq, cpu, &evtchn);
BUG_ON(ret < 0);
}
ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq);
if (ret < 0) {
__unbind_from_irq(info, info->irq);
goto out;
}
/*
* Force the affinity mask for percpu interrupts so proc
* shows the correct target.
*/
bind_evtchn_to_cpu(info, cpu, percpu);
ret = info->irq;
} else {
info = info_for_irq(ret);
WARN_ON(info == NULL || info->type != IRQT_VIRQ);
}
out:
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
return ret;
}
static void unbind_from_irq(unsigned int irq)
{
struct irq_info *info;
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
info = info_for_irq(irq);
__unbind_from_irq(info, irq);
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
}
static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn,
irq_handler_t handler,
unsigned long irqflags,
const char *devname, void *dev_id,
struct irq_chip *chip)
{
int irq, retval;
irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL,
irqflags & IRQF_SHARED);
if (irq < 0)
return irq;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
irq_handler_t handler,
unsigned long irqflags,
const char *devname, void *dev_id)
{
return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
devname, dev_id,
&xen_dynamic_chip);
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn,
irq_handler_t handler,
unsigned long irqflags,
const char *devname, void *dev_id)
{
return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
devname, dev_id,
&xen_lateeoi_chip);
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi);
static int bind_interdomain_evtchn_to_irqhandler_chip(
struct xenbus_device *dev, evtchn_port_t remote_port,
irq_handler_t handler, unsigned long irqflags,
const char *devname, void *dev_id, struct irq_chip *chip)
{
int irq, retval;
irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip,
irqflags & IRQF_SHARED);
if (irq < 0)
return irq;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
int bind_interdomain_evtchn_to_irqhandler_lateeoi(struct xenbus_device *dev,
evtchn_port_t remote_port,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
void *dev_id)
{
return bind_interdomain_evtchn_to_irqhandler_chip(dev,
remote_port, handler, irqflags, devname,
dev_id, &xen_lateeoi_chip);
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi);
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
{
int irq, retval;
irq = bind_virq_to_irq(virq, cpu, irqflags & IRQF_PERCPU);
if (irq < 0)
return irq;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
int bind_ipi_to_irqhandler(enum ipi_vector ipi,
unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
void *dev_id)
{
int irq, retval;
irq = bind_ipi_to_irq(ipi, cpu);
if (irq < 0)
return irq;
irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
return retval;
}
return irq;
}
void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
struct irq_info *info = info_for_irq(irq);
if (WARN_ON(!info))
return;
free_irq(irq, dev_id);
unbind_from_irq(irq);
}
EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
/**
* xen_set_irq_priority() - set an event channel priority.
* @irq:irq bound to an event channel.
* @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN.
*/
int xen_set_irq_priority(unsigned irq, unsigned priority)
{
struct evtchn_set_priority set_priority;
set_priority.port = evtchn_from_irq(irq);
set_priority.priority = priority;
return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority,
&set_priority);
}
EXPORT_SYMBOL_GPL(xen_set_irq_priority);
int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static)
{
struct irq_info *info = evtchn_to_info(evtchn);
if (!info)
return -ENOENT;
WARN_ON(info->refcnt != -1);
info->refcnt = 1;
info->is_static = is_static;
return 0;
}
EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
int evtchn_get(evtchn_port_t evtchn)
{
struct irq_info *info;
int err = -ENOENT;
if (evtchn >= xen_evtchn_max_channels())
return -EINVAL;
mutex_lock(&irq_mapping_update_lock);
info = evtchn_to_info(evtchn);
if (!info)
goto done;
err = -EINVAL;
if (info->refcnt <= 0 || info->refcnt == SHRT_MAX)
goto done;
info->refcnt++;
err = 0;
done:
mutex_unlock(&irq_mapping_update_lock);
return err;
}
EXPORT_SYMBOL_GPL(evtchn_get);
void evtchn_put(evtchn_port_t evtchn)
{
struct irq_info *info = evtchn_to_info(evtchn);
if (WARN_ON(!info))
return;
unbind_from_irq(info->irq);
}
EXPORT_SYMBOL_GPL(evtchn_put);
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
{
evtchn_port_t evtchn;
xen: Support 64-bit PV guest receiving NMIs This is based on a patch that Zhenzhong Duan had sent - which was missing some of the remaining pieces. The kernel has the logic to handle Xen-type-exceptions using the paravirt interface in the assembler code (see PARAVIRT_ADJUST_EXCEPTION_FRAME - pv_irq_ops.adjust_exception_frame and and INTERRUPT_RETURN - pv_cpu_ops.iret). That means the nmi handler (and other exception handlers) use the hypervisor iret. The other changes that would be neccessary for this would be to translate the NMI_VECTOR to one of the entries on the ipi_vector and make xen_send_IPI_mask_allbutself use different events. Fortunately for us commit 1db01b4903639fcfaec213701a494fe3fb2c490b (xen: Clean up apic ipi interface) implemented this and we piggyback on the cleanup such that the apic IPI interface will pass the right vector value for NMI. With this patch we can trigger NMIs within a PV guest (only tested x86_64). For this to work with normal PV guests (not initial domain) we need the domain to be able to use the APIC ops - they are already implemented to use the Xen event channels. For that to be turned on in a PV domU we need to remove the masking of X86_FEATURE_APIC. Incidentally that means kgdb will also now work within a PV guest without using the 'nokgdbroundup' workaround. Note that the 32-bit version is different and this patch does not enable that. CC: Lisa Nguyen <lisa@xenapiadmin.com> CC: Ben Guthro <benjamin.guthro@citrix.com> CC: Zhenzhong Duan <zhenzhong.duan@oracle.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> [v1: Fixed up per David Vrabel comments] Reviewed-by: Ben Guthro <benjamin.guthro@citrix.com> Reviewed-by: David Vrabel <david.vrabel@citrix.com>
2013-07-19 11:51:31 -04:00
#ifdef CONFIG_X86
xen: Support 64-bit PV guest receiving NMIs This is based on a patch that Zhenzhong Duan had sent - which was missing some of the remaining pieces. The kernel has the logic to handle Xen-type-exceptions using the paravirt interface in the assembler code (see PARAVIRT_ADJUST_EXCEPTION_FRAME - pv_irq_ops.adjust_exception_frame and and INTERRUPT_RETURN - pv_cpu_ops.iret). That means the nmi handler (and other exception handlers) use the hypervisor iret. The other changes that would be neccessary for this would be to translate the NMI_VECTOR to one of the entries on the ipi_vector and make xen_send_IPI_mask_allbutself use different events. Fortunately for us commit 1db01b4903639fcfaec213701a494fe3fb2c490b (xen: Clean up apic ipi interface) implemented this and we piggyback on the cleanup such that the apic IPI interface will pass the right vector value for NMI. With this patch we can trigger NMIs within a PV guest (only tested x86_64). For this to work with normal PV guests (not initial domain) we need the domain to be able to use the APIC ops - they are already implemented to use the Xen event channels. For that to be turned on in a PV domU we need to remove the masking of X86_FEATURE_APIC. Incidentally that means kgdb will also now work within a PV guest without using the 'nokgdbroundup' workaround. Note that the 32-bit version is different and this patch does not enable that. CC: Lisa Nguyen <lisa@xenapiadmin.com> CC: Ben Guthro <benjamin.guthro@citrix.com> CC: Zhenzhong Duan <zhenzhong.duan@oracle.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> [v1: Fixed up per David Vrabel comments] Reviewed-by: Ben Guthro <benjamin.guthro@citrix.com> Reviewed-by: David Vrabel <david.vrabel@citrix.com>
2013-07-19 11:51:31 -04:00
if (unlikely(vector == XEN_NMI_VECTOR)) {
int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, xen_vcpu_nr(cpu),
NULL);
xen: Support 64-bit PV guest receiving NMIs This is based on a patch that Zhenzhong Duan had sent - which was missing some of the remaining pieces. The kernel has the logic to handle Xen-type-exceptions using the paravirt interface in the assembler code (see PARAVIRT_ADJUST_EXCEPTION_FRAME - pv_irq_ops.adjust_exception_frame and and INTERRUPT_RETURN - pv_cpu_ops.iret). That means the nmi handler (and other exception handlers) use the hypervisor iret. The other changes that would be neccessary for this would be to translate the NMI_VECTOR to one of the entries on the ipi_vector and make xen_send_IPI_mask_allbutself use different events. Fortunately for us commit 1db01b4903639fcfaec213701a494fe3fb2c490b (xen: Clean up apic ipi interface) implemented this and we piggyback on the cleanup such that the apic IPI interface will pass the right vector value for NMI. With this patch we can trigger NMIs within a PV guest (only tested x86_64). For this to work with normal PV guests (not initial domain) we need the domain to be able to use the APIC ops - they are already implemented to use the Xen event channels. For that to be turned on in a PV domU we need to remove the masking of X86_FEATURE_APIC. Incidentally that means kgdb will also now work within a PV guest without using the 'nokgdbroundup' workaround. Note that the 32-bit version is different and this patch does not enable that. CC: Lisa Nguyen <lisa@xenapiadmin.com> CC: Ben Guthro <benjamin.guthro@citrix.com> CC: Zhenzhong Duan <zhenzhong.duan@oracle.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> [v1: Fixed up per David Vrabel comments] Reviewed-by: Ben Guthro <benjamin.guthro@citrix.com> Reviewed-by: David Vrabel <david.vrabel@citrix.com>
2013-07-19 11:51:31 -04:00
if (rc < 0)
printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc);
return;
}
#endif
evtchn = per_cpu(ipi_to_evtchn, cpu)[vector];
BUG_ON(evtchn == 0);
notify_remote_via_evtchn(evtchn);
}
struct evtchn_loop_ctrl {
ktime_t timeout;
unsigned count;
bool defer_eoi;
};
void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
{
struct irq_info *info = evtchn_to_info(port);
struct xenbus_device *dev;
if (!info)
return;
/*
* Check for timeout every 256 events.
* We are setting the timeout value only after the first 256
* events in order to not hurt the common case of few loop
* iterations. The 256 is basically an arbitrary value.
*
* In case we are hitting the timeout we need to defer all further
* EOIs in order to ensure to leave the event handling loop rather
* sooner than later.
*/
if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) {
ktime_t kt = ktime_get();
if (!ctrl->timeout) {
kt = ktime_add_ms(kt,
jiffies_to_msecs(event_loop_timeout));
ctrl->timeout = kt;
} else if (kt > ctrl->timeout) {
ctrl->defer_eoi = true;
}
}
if (xchg_acquire(&info->is_active, 1))
return;
dev = (info->type == IRQT_EVTCHN) ? info->u.interdomain : NULL;
if (dev)
atomic_inc(&dev->events);
if (ctrl->defer_eoi) {
info->eoi_cpu = smp_processor_id();
info->irq_epoch = __this_cpu_read(irq_epoch);
info->eoi_time = get_jiffies_64() + event_eoi_delay;
}
generic_handle_irq(info->irq);
}
int xen_evtchn_do_upcall(void)
{
struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
int ret = vcpu_info->evtchn_upcall_pending ? IRQ_HANDLED : IRQ_NONE;
int cpu = smp_processor_id();
struct evtchn_loop_ctrl ctrl = { 0 };
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
/*
* When closing an event channel the associated IRQ must not be freed
* until all cpus have left the event handling loop. This is ensured
* by taking the rcu_read_lock() while handling events, as freeing of
* the IRQ is handled via queue_rcu_work() _after_ closing the event
* channel.
*/
rcu_read_lock();
do {
vcpu_info->evtchn_upcall_pending = 0;
xen_evtchn_handle_events(cpu, &ctrl);
BUG_ON(!irqs_disabled());
virt_rmb(); /* Hypervisor can set upcall pending. */
} while (vcpu_info->evtchn_upcall_pending);
xen/events: replace evtchn_rwlock with RCU In unprivileged Xen guests event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block: CPU0 CPU1 CPU2 (IRQ handling) (__hvc_poll()) (closing event channel) read_lock(evtchn_rwlock) spin_lock(hvc_lock) write_lock(evtchn_rwlock) [blocks] spin_lock(hvc_lock) [blocks] read_lock(evtchn_rwlock) [blocks due to writer waiting, and not in_interrupt()] This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance would have been finished. This is XSA-441 / CVE-2023-34324. Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> Signed-off-by: Juergen Gross <jgross@suse.com> Reviewed-by: Julien Grall <jgrall@amazon.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2023-08-28 08:09:47 +02:00
rcu_read_unlock();
/*
* Increment irq_epoch only now to defer EOIs only for
* xen_irq_lateeoi() invocations occurring from inside the loop
* above.
*/
__this_cpu_inc(irq_epoch);
return ret;
}
EXPORT_SYMBOL_GPL(xen_evtchn_do_upcall);
/* Rebind a new event channel to an existing irq. */
void rebind_evtchn_irq(evtchn_port_t evtchn, int irq)
{
struct irq_info *info = info_for_irq(irq);
if (WARN_ON(!info))
return;
/* Make sure the irq is masked, since the new event channel
will also be masked. */
disable_irq(irq);
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_lock(&irq_mapping_update_lock);
/* After resume the irq<->evtchn mappings are all cleared out */
BUG_ON(evtchn_to_info(evtchn));
/* Expect irq to have been bound before,
so there should be a proper type */
BUG_ON(info->type == IRQT_UNBOUND);
info->irq = irq;
(void)xen_irq_info_evtchn_setup(info, evtchn, NULL);
xen/irq: Alter the locking to use a mutex instead of a spinlock. When we allocate/change the IRQ informations, we do not need to use spinlocks. We can use a mutex (which is what the generic IRQ code does for allocations/changes). Fixes a slew of: BUG: sleeping function called from invalid context at /linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3216, name: xenstored 2 locks held by xenstored/3216: #0: (&u->bind_mutex){......}, at: [<ffffffffa02e0920>] evtchn_ioctl+0x30/0x3a0 [xen_evtchn] #1: (irq_mapping_update_lock){......}, at: [<ffffffff8138b274>] bind_evtchn_to_irq+0x24/0x90 Pid: 3216, comm: xenstored Not tainted 3.1.0-rc6-00021-g437a3d1 #2 Call Trace: [<ffffffff81088d10>] __might_sleep+0x100/0x130 [<ffffffff81645c2f>] mutex_lock_nested+0x2f/0x50 [<ffffffff81627529>] __irq_alloc_descs+0x49/0x200 [<ffffffffa02e0920>] ? evtchn_ioctl+0x30/0x3a0 [xen_evtchn] [<ffffffff8138b214>] xen_allocate_irq_dynamic+0x34/0x70 [<ffffffff8138b2ad>] bind_evtchn_to_irq+0x5d/0x90 [<ffffffffa02e03c0>] ? evtchn_bind_to_user+0x60/0x60 [xen_evtchn] [<ffffffff8138c282>] bind_evtchn_to_irqhandler+0x32/0x80 [<ffffffffa02e03a9>] evtchn_bind_to_user+0x49/0x60 [xen_evtchn] [<ffffffffa02e0a34>] evtchn_ioctl+0x144/0x3a0 [xen_evtchn] [<ffffffff811b4070>] ? vfsmount_lock_local_unlock+0x50/0x80 [<ffffffff811a6a1a>] do_vfs_ioctl+0x9a/0x5e0 [<ffffffff811b476f>] ? mntput+0x1f/0x30 [<ffffffff81196259>] ? fput+0x199/0x240 [<ffffffff811a7001>] sys_ioctl+0xa1/0xb0 [<ffffffff8164ea82>] system_call_fastpath+0x16/0x1b Reported-by: Jim Burns <jim_burn@bellsouth.net> Acked-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-09-14 05:10:00 -04:00
mutex_unlock(&irq_mapping_update_lock);
bind_evtchn_to_cpu(info, info->cpu, false);
/* Unmask the event channel. */
enable_irq(irq);
}
/* Rebind an evtchn so that it gets delivered to a specific cpu */
static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu)
{
struct evtchn_bind_vcpu bind_vcpu;
evtchn_port_t evtchn = info ? info->evtchn : 0;
if (!VALID_EVTCHN(evtchn))
return -1;
xen: Revert commits da72ff5bfcb0 and 72a9b186292d Recent discussion (http://marc.info/?l=xen-devel&m=149192184523741) established that commit 72a9b186292d ("xen: Remove event channel notification through Xen PCI platform device") (and thus commit da72ff5bfcb0 ("partially revert "xen: Remove event channel notification through Xen PCI platform device"")) are unnecessary and, in fact, prevent HVM guests from booting on Xen releases prior to 4.0 Therefore we revert both of those commits. The summary of that discussion is below: Here is the brief summary of the current situation: Before the offending commit (72a9b186292): 1) INTx does not work because of the reset_watches path. 2) The reset_watches path is only taken if you have Xen > 4.0 3) The Linux Kernel by default will use vector inject if the hypervisor support. So even INTx does not work no body running the kernel with Xen > 4.0 would notice. Unless he explicitly disabled this feature either in the kernel or in Xen (and this can only be disabled by modifying the code, not user-supported way to do it). After the offending commit (+ partial revert): 1) INTx is no longer support for HVM (only for PV guests). 2) Any HVM guest The kernel will not boot on Xen < 4.0 which does not have vector injection support. Since the only other mode supported is INTx which. So based on this summary, I think before commit (72a9b186292) we were in much better position from a user point of view. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Reviewed-by: Juergen Gross <jgross@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2017-04-24 15:04:53 -04:00
if (!xen_support_evtchn_rebind())
return -1;
/* Send future instances of this interrupt to other vcpu. */
bind_vcpu.port = evtchn;
bind_vcpu.vcpu = xen_vcpu_nr(tcpu);
/*
* Mask the event while changing the VCPU binding to prevent
* it being delivered on an unexpected VCPU.
*/
do_mask(info, EVT_MASK_REASON_TEMPORARY);
/*
* If this fails, it usually just indicates that we're dealing with a
* virq or IPI channel, which don't actually need to be rebound. Ignore
* it, but don't do the xenlinux-level rebind in that case.
*/
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
bind_evtchn_to_cpu(info, tcpu, false);
do_unmask(info, EVT_MASK_REASON_TEMPORARY);
return 0;
}
/*
* Find the CPU within @dest mask which has the least number of channels
* assigned. This is not precise as the per cpu counts can be modified
* concurrently.
*/
static unsigned int select_target_cpu(const struct cpumask *dest)
{
unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX;
for_each_cpu_and(cpu, dest, cpu_online_mask) {
unsigned int curch = atomic_read(&channels_on_cpu[cpu]);
if (curch < minch) {
minch = curch;
best_cpu = cpu;
}
}
/*
* Catch the unlikely case that dest contains no online CPUs. Can't
* recurse.
*/
if (best_cpu == UINT_MAX)
return select_target_cpu(cpu_online_mask);
return best_cpu;
}
static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
bool force)
{
unsigned int tcpu = select_target_cpu(dest);
int ret;
ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu);
if (!ret)
irq_data_update_effective_affinity(data, cpumask_of(tcpu));
return ret;
}
static void enable_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
evtchn_port_t evtchn = info ? info->evtchn : 0;
if (VALID_EVTCHN(evtchn))
do_unmask(info, EVT_MASK_REASON_EXPLICIT);
}
static void do_ack_dynirq(struct irq_info *info)
{
evtchn_port_t evtchn = info->evtchn;
if (VALID_EVTCHN(evtchn))
event_handler_exit(info);
}
static void ack_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
if (info)
do_ack_dynirq(info);
}
static void mask_ack_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
if (info) {
do_disable_dynirq(info);
do_ack_dynirq(info);
}
}
static void lateeoi_ack_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
evtchn_port_t evtchn = info ? info->evtchn : 0;
if (VALID_EVTCHN(evtchn)) {
do_mask(info, EVT_MASK_REASON_EOI_PENDING);
/*
* Don't call event_handler_exit().
* Need to keep is_active non-zero in order to ignore re-raised
* events after cpu affinity changes while a lateeoi is pending.
*/
clear_evtchn(evtchn);
}
}
static void lateeoi_mask_ack_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
evtchn_port_t evtchn = info ? info->evtchn : 0;
if (VALID_EVTCHN(evtchn)) {
do_mask(info, EVT_MASK_REASON_EXPLICIT);
event_handler_exit(info);
}
}
static int retrigger_dynirq(struct irq_data *data)
{
struct irq_info *info = info_for_irq(data->irq);
evtchn_port_t evtchn = info ? info->evtchn : 0;
if (!VALID_EVTCHN(evtchn))
return 0;
do_mask(info, EVT_MASK_REASON_TEMPORARY);
set_evtchn(evtchn);
do_unmask(info, EVT_MASK_REASON_TEMPORARY);
return 1;
}
static void restore_pirqs(void)
{
int pirq, rc, irq, gsi;
struct physdev_map_pirq map_irq;
struct irq_info *info;
list_for_each_entry(info, &xen_irq_list_head, list) {
if (info->type != IRQT_PIRQ)
continue;
pirq = info->u.pirq.pirq;
gsi = info->u.pirq.gsi;
irq = info->irq;
/* save/restore of PT devices doesn't work, so at this point the
* only devices present are GSI based emulated devices */
if (!gsi)
continue;
map_irq.domid = DOMID_SELF;
map_irq.type = MAP_PIRQ_TYPE_GSI;
map_irq.index = gsi;
map_irq.pirq = pirq;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
if (rc) {
pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
gsi, irq, pirq, rc);
xen_free_irq(info);
continue;
}
printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
__startup_pirq(info);
}
}
static void restore_cpu_virqs(unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
evtchn_port_t evtchn;
struct irq_info *info;
int virq, irq;
for (virq = 0; virq < NR_VIRQS; virq++) {
if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
continue;
info = info_for_irq(irq);
BUG_ON(virq_from_irq(info) != virq);
/* Get a new binding from Xen. */
bind_virq.virq = virq;
bind_virq.vcpu = xen_vcpu_nr(cpu);
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
&bind_virq) != 0)
BUG();
evtchn = bind_virq.port;
/* Record the new mapping. */
xen_irq_info_virq_setup(info, cpu, evtchn, virq);
/* The affinity mask is still valid */
bind_evtchn_to_cpu(info, cpu, false);
}
}
static void restore_cpu_ipis(unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
evtchn_port_t evtchn;
struct irq_info *info;
int ipi, irq;
for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
continue;
info = info_for_irq(irq);
BUG_ON(ipi_from_irq(info) != ipi);
/* Get a new binding from Xen. */
bind_ipi.vcpu = xen_vcpu_nr(cpu);
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
&bind_ipi) != 0)
BUG();
evtchn = bind_ipi.port;
/* Record the new mapping. */
xen_irq_info_ipi_setup(info, cpu, evtchn, ipi);
/* The affinity mask is still valid */
bind_evtchn_to_cpu(info, cpu, false);
}
}
xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <clameter@linux-foundation.org> Cc: Petr Tesarik <ptesarik@suse.cz> Cc: Virtualization <virtualization@lists.linux-foundation.org> Cc: Xen devel <xen-devel@lists.xensource.com> Cc: Thomas Friebel <thomas.friebel@amd.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-07 12:07:53 -07:00
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq)
{
struct irq_info *info = info_for_irq(irq);
evtchn_port_t evtchn = info ? info->evtchn : 0;
xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <clameter@linux-foundation.org> Cc: Petr Tesarik <ptesarik@suse.cz> Cc: Virtualization <virtualization@lists.linux-foundation.org> Cc: Xen devel <xen-devel@lists.xensource.com> Cc: Thomas Friebel <thomas.friebel@amd.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-07 12:07:53 -07:00
if (VALID_EVTCHN(evtchn))
event_handler_exit(info);
xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <clameter@linux-foundation.org> Cc: Petr Tesarik <ptesarik@suse.cz> Cc: Virtualization <virtualization@lists.linux-foundation.org> Cc: Xen devel <xen-devel@lists.xensource.com> Cc: Thomas Friebel <thomas.friebel@amd.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-07 12:07:53 -07:00
}
EXPORT_SYMBOL(xen_clear_irq_pending);
bool xen_test_irq_pending(int irq)
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
bool ret = false;
if (VALID_EVTCHN(evtchn))
ret = test_evtchn(evtchn);
return ret;
}
/* Poll waiting for an irq to become pending with timeout. In the usual case,
* the irq will be disabled so it won't deliver an interrupt. */
void xen_poll_irq_timeout(int irq, u64 timeout)
xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <clameter@linux-foundation.org> Cc: Petr Tesarik <ptesarik@suse.cz> Cc: Virtualization <virtualization@lists.linux-foundation.org> Cc: Xen devel <xen-devel@lists.xensource.com> Cc: Thomas Friebel <thomas.friebel@amd.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-07 12:07:53 -07:00
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
if (VALID_EVTCHN(evtchn)) {
struct sched_poll poll;
poll.nr_ports = 1;
poll.timeout = timeout;
set_xen_guest_handle(poll.ports, &evtchn);
xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <clameter@linux-foundation.org> Cc: Petr Tesarik <ptesarik@suse.cz> Cc: Virtualization <virtualization@lists.linux-foundation.org> Cc: Xen devel <xen-devel@lists.xensource.com> Cc: Thomas Friebel <thomas.friebel@amd.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-07 12:07:53 -07:00
if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
BUG();
}
}
EXPORT_SYMBOL(xen_poll_irq_timeout);
/* Poll waiting for an irq to become pending. In the usual case, the
* irq will be disabled so it won't deliver an interrupt. */
void xen_poll_irq(int irq)
{
xen_poll_irq_timeout(irq, 0 /* no timeout */);
}
xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <clameter@linux-foundation.org> Cc: Petr Tesarik <ptesarik@suse.cz> Cc: Virtualization <virtualization@lists.linux-foundation.org> Cc: Xen devel <xen-devel@lists.xensource.com> Cc: Thomas Friebel <thomas.friebel@amd.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-07 12:07:53 -07:00
/* Check whether the IRQ line is shared with other guests. */
int xen_test_irq_shared(int irq)
{
struct irq_info *info = info_for_irq(irq);
struct physdev_irq_status_query irq_status;
if (WARN_ON(!info))
return -ENOENT;
irq_status.irq = info->u.pirq.pirq;
if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
return 0;
return !(irq_status.flags & XENIRQSTAT_shared);
}
EXPORT_SYMBOL_GPL(xen_test_irq_shared);
void xen_irq_resume(void)
{
unsigned int cpu;
struct irq_info *info;
/* New event-channel space is not 'live' yet. */
xen_evtchn_resume();
/* No IRQ <-> event-channel mappings. */
list_for_each_entry(info, &xen_irq_list_head, list) {
/* Zap event-channel binding */
info->evtchn = 0;
/* Adjust accounting */
channels_on_cpu_dec(info);
}
clear_evtchn_to_irq_all();
for_each_possible_cpu(cpu) {
restore_cpu_virqs(cpu);
restore_cpu_ipis(cpu);
}
xen: events: do not unmask event channels on resume The IRQ core code will take care of disabling and reenabling interrupts over suspend resume automatically, therefore we do not need to do this in the Xen event channel code. The only exception is those event channels marked IRQF_NO_SUSPEND which the IRQ core ignores. We must unmask these ourselves, taking care to obey the current IRQ_DISABLED status. Failure check for IRQ_DISABLED leads to enabling polled only event channels, such as that associated with the pv spinlocks, which must never be enabled: [ 21.970432] ------------[ cut here ]------------ [ 21.970432] kernel BUG at arch/x86/xen/spinlock.c:343! [ 21.970432] invalid opcode: 0000 [#1] SMP [ 21.970432] last sysfs file: /sys/devices/virtual/net/lo/operstate [ 21.970432] Modules linked in: [ 21.970432] [ 21.970432] Pid: 0, comm: swapper Not tainted (2.6.32.24-x86_32p-xen-01034-g787c727 #34) [ 21.970432] EIP: 0061:[<c102e209>] EFLAGS: 00010046 CPU: 3 [ 21.970432] EIP is at dummy_handler+0x3/0x7 [ 21.970432] EAX: 0000021c EBX: dfc16880 ECX: 0000001a EDX: 00000000 [ 21.970432] ESI: dfc02c00 EDI: 00000001 EBP: dfc47e10 ESP: dfc47e10 [ 21.970432] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0069 [ 21.970432] Process swapper (pid: 0, ti=dfc46000 task=dfc39440 task.ti=dfc46000) [ 21.970432] Stack: [ 21.970432] dfc47e30 c10a39f0 0000021c 00000000 00000000 dfc16880 0000021c 00000001 [ 21.970432] <0> dfc47e40 c10a4f08 0000021c 00000000 dfc47e78 c12240a7 c1839284 c1839284 [ 21.970432] <0> 00000200 00000000 00000000 f5720000 c1f3d028 c1f3d02c 00000180 dfc47e90 [ 21.970432] Call Trace: [ 21.970432] [<c10a39f0>] ? handle_IRQ_event+0x5f/0x122 [ 21.970432] [<c10a4f08>] ? handle_percpu_irq+0x2f/0x55 [ 21.970432] [<c12240a7>] ? __xen_evtchn_do_upcall+0xdb/0x15f [ 21.970432] [<c122481e>] ? xen_evtchn_do_upcall+0x20/0x30 [ 21.970432] [<c1030d47>] ? xen_do_upcall+0x7/0xc [ 21.970432] [<c102007b>] ? apic_reg_read+0xd3/0x22d [ 21.970432] [<c1002227>] ? hypercall_page+0x227/0x1005 [ 21.970432] [<c102d30b>] ? xen_force_evtchn_callback+0xf/0x14 [ 21.970432] [<c102da7c>] ? check_events+0x8/0xc [ 21.970432] [<c102da3b>] ? xen_irq_enable_direct_end+0x0/0x1 [ 21.970432] [<c105e485>] ? finish_task_switch+0x62/0xba [ 21.970432] [<c14e3f84>] ? schedule+0x808/0x89d [ 21.970432] [<c1084dc5>] ? hrtimer_start_expires+0x1a/0x22 [ 21.970432] [<c1085154>] ? tick_nohz_restart_sched_tick+0x15a/0x162 [ 21.970432] [<c102f43a>] ? cpu_idle+0x6d/0x6f [ 21.970432] [<c14db29e>] ? cpu_bringup_and_idle+0xd/0xf [ 21.970432] Code: 5d 0f 95 c0 0f b6 c0 c3 55 66 83 78 02 00 89 e5 5d 0f 95 \ c0 0f b6 c0 c3 55 b2 01 86 10 31 c0 84 d2 89 e5 0f 94 c0 5d c3 55 89 e5 <0f> 0b \ eb fe 55 80 3d 4c ce 84 c1 00 89 e5 57 56 89 c6 53 74 15 [ 21.970432] EIP: [<c102e209>] dummy_handler+0x3/0x7 SS:ESP 0069:dfc47e10 [ 21.970432] ---[ end trace c0b71f7e12cf3011 ]--- Signed-off-by: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
2010-11-01 16:30:09 +00:00
restore_pirqs();
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
.name = "xen-dyn",
.irq_disable = disable_dynirq,
.irq_mask = disable_dynirq,
.irq_unmask = enable_dynirq,
.irq_ack = ack_dynirq,
.irq_mask_ack = mask_ack_dynirq,
.irq_set_affinity = set_affinity_irq,
.irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_lateeoi_chip __read_mostly = {
/* The chip name needs to contain "xen-dyn" for irqbalance to work. */
.name = "xen-dyn-lateeoi",
.irq_disable = disable_dynirq,
.irq_mask = disable_dynirq,
.irq_unmask = enable_dynirq,
.irq_ack = lateeoi_ack_dynirq,
.irq_mask_ack = lateeoi_mask_ack_dynirq,
.irq_set_affinity = set_affinity_irq,
.irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_pirq_chip __read_mostly = {
.name = "xen-pirq",
.irq_startup = startup_pirq,
.irq_shutdown = shutdown_pirq,
.irq_enable = enable_pirq,
.irq_disable = disable_pirq,
.irq_mask = disable_dynirq,
.irq_unmask = enable_dynirq,
.irq_ack = eoi_pirq,
.irq_eoi = eoi_pirq,
.irq_mask_ack = mask_ack_pirq,
.irq_set_affinity = set_affinity_irq,
.irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_percpu_chip __read_mostly = {
.name = "xen-percpu",
.irq_disable = disable_dynirq,
.irq_mask = disable_dynirq,
.irq_unmask = enable_dynirq,
.irq_ack = ack_dynirq,
};
#ifdef CONFIG_X86
#ifdef CONFIG_XEN_PVHVM
/* Vector callbacks are better than PCI interrupts to receive event
* channel notifications because we can receive vector callbacks on any
* vcpu and we don't need PCI support or APIC interactions. */
void xen_setup_callback_vector(void)
{
uint64_t callback_via;
xen: Remove event channel notification through Xen PCI platform device Ever since commit 254d1a3f02eb ("xen/pv-on-hvm kexec: shutdown watches from old kernel") using the INTx interrupt from Xen PCI platform device for event channel notification would just lockup the guest during bootup. postcore_initcall now calls xs_reset_watches which will eventually try to read a value from XenStore and will get stuck on read_reply at XenBus forever since the platform driver is not probed yet and its INTx interrupt handler is not registered yet. That means that the guest can not be notified at this moment of any pending event channels and none of the per-event handlers will ever be invoked (including the XenStore one) and the reply will never be picked up by the kernel. The exact stack where things get stuck during xenbus_init: -xenbus_init -xs_init -xs_reset_watches -xenbus_scanf -xenbus_read -xs_single -xs_single -xs_talkv Vector callbacks have always been the favourite event notification mechanism since their introduction in commit 38e20b07efd5 ("x86/xen: event channels delivery on HVM.") and the vector callback feature has always been advertised for quite some time by Xen that's why INTx was broken for several years now without impacting anyone. Luckily this also means that event channel notification through INTx is basically dead-code which can be safely removed without impacting anybody since it has been effectively disabled for more than 4 years with nobody complaining about it (at least as far as I'm aware of). This commit removes event channel notification through Xen PCI platform device. Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: David Vrabel <david.vrabel@citrix.com> Cc: Juergen Gross <jgross@suse.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: x86@kernel.org Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Julien Grall <julien.grall@citrix.com> Cc: Vitaly Kuznetsov <vkuznets@redhat.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Ross Lagerwall <ross.lagerwall@citrix.com> Cc: xen-devel@lists.xenproject.org Cc: linux-kernel@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: Anthony Liguori <aliguori@amazon.com> Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
2016-08-26 23:55:36 +02:00
xen: Revert commits da72ff5bfcb0 and 72a9b186292d Recent discussion (http://marc.info/?l=xen-devel&m=149192184523741) established that commit 72a9b186292d ("xen: Remove event channel notification through Xen PCI platform device") (and thus commit da72ff5bfcb0 ("partially revert "xen: Remove event channel notification through Xen PCI platform device"")) are unnecessary and, in fact, prevent HVM guests from booting on Xen releases prior to 4.0 Therefore we revert both of those commits. The summary of that discussion is below: Here is the brief summary of the current situation: Before the offending commit (72a9b186292): 1) INTx does not work because of the reset_watches path. 2) The reset_watches path is only taken if you have Xen > 4.0 3) The Linux Kernel by default will use vector inject if the hypervisor support. So even INTx does not work no body running the kernel with Xen > 4.0 would notice. Unless he explicitly disabled this feature either in the kernel or in Xen (and this can only be disabled by modifying the code, not user-supported way to do it). After the offending commit (+ partial revert): 1) INTx is no longer support for HVM (only for PV guests). 2) Any HVM guest The kernel will not boot on Xen < 4.0 which does not have vector injection support. Since the only other mode supported is INTx which. So based on this summary, I think before commit (72a9b186292) we were in much better position from a user point of view. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Reviewed-by: Juergen Gross <jgross@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2017-04-24 15:04:53 -04:00
if (xen_have_vector_callback) {
callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR);
if (xen_set_callback_via(callback_via)) {
xen: Revert commits da72ff5bfcb0 and 72a9b186292d Recent discussion (http://marc.info/?l=xen-devel&m=149192184523741) established that commit 72a9b186292d ("xen: Remove event channel notification through Xen PCI platform device") (and thus commit da72ff5bfcb0 ("partially revert "xen: Remove event channel notification through Xen PCI platform device"")) are unnecessary and, in fact, prevent HVM guests from booting on Xen releases prior to 4.0 Therefore we revert both of those commits. The summary of that discussion is below: Here is the brief summary of the current situation: Before the offending commit (72a9b186292): 1) INTx does not work because of the reset_watches path. 2) The reset_watches path is only taken if you have Xen > 4.0 3) The Linux Kernel by default will use vector inject if the hypervisor support. So even INTx does not work no body running the kernel with Xen > 4.0 would notice. Unless he explicitly disabled this feature either in the kernel or in Xen (and this can only be disabled by modifying the code, not user-supported way to do it). After the offending commit (+ partial revert): 1) INTx is no longer support for HVM (only for PV guests). 2) Any HVM guest The kernel will not boot on Xen < 4.0 which does not have vector injection support. Since the only other mode supported is INTx which. So based on this summary, I think before commit (72a9b186292) we were in much better position from a user point of view. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Reviewed-by: Juergen Gross <jgross@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2017-04-24 15:04:53 -04:00
pr_err("Request for Xen HVM callback vector failed\n");
xen_have_vector_callback = false;
xen: Revert commits da72ff5bfcb0 and 72a9b186292d Recent discussion (http://marc.info/?l=xen-devel&m=149192184523741) established that commit 72a9b186292d ("xen: Remove event channel notification through Xen PCI platform device") (and thus commit da72ff5bfcb0 ("partially revert "xen: Remove event channel notification through Xen PCI platform device"")) are unnecessary and, in fact, prevent HVM guests from booting on Xen releases prior to 4.0 Therefore we revert both of those commits. The summary of that discussion is below: Here is the brief summary of the current situation: Before the offending commit (72a9b186292): 1) INTx does not work because of the reset_watches path. 2) The reset_watches path is only taken if you have Xen > 4.0 3) The Linux Kernel by default will use vector inject if the hypervisor support. So even INTx does not work no body running the kernel with Xen > 4.0 would notice. Unless he explicitly disabled this feature either in the kernel or in Xen (and this can only be disabled by modifying the code, not user-supported way to do it). After the offending commit (+ partial revert): 1) INTx is no longer support for HVM (only for PV guests). 2) Any HVM guest The kernel will not boot on Xen < 4.0 which does not have vector injection support. Since the only other mode supported is INTx which. So based on this summary, I think before commit (72a9b186292) we were in much better position from a user point of view. Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Reviewed-by: Juergen Gross <jgross@suse.com> Signed-off-by: Juergen Gross <jgross@suse.com>
2017-04-24 15:04:53 -04:00
}
}
}
/*
* Setup per-vCPU vector-type callbacks. If this setup is unavailable,
* fallback to the global vector-type callback.
*/
static __init void xen_init_setup_upcall_vector(void)
{
if (!xen_have_vector_callback)
return;
if ((cpuid_eax(xen_cpuid_base() + 4) & XEN_HVM_CPUID_UPCALL_VECTOR) &&
!xen_set_upcall_vector(0))
xen_percpu_upcall = true;
else if (xen_feature(XENFEAT_hvm_callback_vector))
xen_setup_callback_vector();
else
xen_have_vector_callback = false;
}
int xen_set_upcall_vector(unsigned int cpu)
{
int rc;
xen_hvm_evtchn_upcall_vector_t op = {
.vector = HYPERVISOR_CALLBACK_VECTOR,
.vcpu = per_cpu(xen_vcpu_id, cpu),
};
rc = HYPERVISOR_hvm_op(HVMOP_set_evtchn_upcall_vector, &op);
if (rc)
return rc;
/* Trick toolstack to think we are enlightened. */
if (!cpu)
rc = xen_set_callback_via(1);
return rc;
}
static __init void xen_alloc_callback_vector(void)
{
if (!xen_have_vector_callback)
return;
pr_info("Xen HVM callback vector for event delivery is enabled\n");
sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
}
#else
void xen_setup_callback_vector(void) {}
static inline void xen_init_setup_upcall_vector(void) {}
int xen_set_upcall_vector(unsigned int cpu) {}
static inline void xen_alloc_callback_vector(void) {}
#endif /* CONFIG_XEN_PVHVM */
#endif /* CONFIG_X86 */
bool xen_fifo_events = true;
module_param_named(fifo_events, xen_fifo_events, bool, 0);
static int xen_evtchn_cpu_prepare(unsigned int cpu)
{
int ret = 0;
xen_cpu_init_eoi(cpu);
if (evtchn_ops->percpu_init)
ret = evtchn_ops->percpu_init(cpu);
return ret;
}
static int xen_evtchn_cpu_dead(unsigned int cpu)
{
int ret = 0;
if (evtchn_ops->percpu_deinit)
ret = evtchn_ops->percpu_deinit(cpu);
return ret;
}
void __init xen_init_IRQ(void)
{
int ret = -EINVAL;
evtchn_port_t evtchn;
if (xen_fifo_events)
ret = xen_evtchn_fifo_init();
if (ret < 0) {
xen_evtchn_2l_init();
xen_fifo_events = false;
}
xen_cpu_init_eoi(smp_processor_id());
cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE,
"xen/evtchn:prepare",
xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead);
evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
sizeof(*evtchn_to_irq), GFP_KERNEL);
BUG_ON(!evtchn_to_irq);
/* No event channels are 'live' right now. */
for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++)
mask_evtchn(evtchn);
pirq_needs_eoi = pirq_needs_eoi_flag;
#ifdef CONFIG_X86
if (xen_pv_domain()) {
if (xen_initial_domain())
pci_xen_initial_domain();
}
xen_init_setup_upcall_vector();
xen_alloc_callback_vector();
if (xen_hvm_domain()) {
native_init_IRQ();
/* pci_xen_hvm_init must be called after native_init_IRQ so that
* __acpi_register_gsi can point at the right function */
pci_xen_hvm_init();
} else {
int rc;
struct physdev_pirq_eoi_gmfn eoi_gmfn;
pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map);
rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
if (rc != 0) {
free_page((unsigned long) pirq_eoi_map);
pirq_eoi_map = NULL;
} else
pirq_needs_eoi = pirq_check_eoi_map;
}
#endif
}