/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/io.h
 *
 * Copyright (C) 1996-2000 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_IO_H
#define __ASM_IO_H

#include <linux/types.h>
#include <linux/pgtable.h>

#include <asm/byteorder.h>
#include <asm/barrier.h>
#include <asm/memory.h>
#include <asm/early_ioremap.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/rsi.h>

/*
 * Generic IO read/write. These perform native-endian accesses.
 */
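
/*
 * The write accessors below pass the address as a "Qo" memory operand rather
 * than as a plain "r" register, so the compiler may use immediate-offset
 * addressing (without writeback). A run of adjacent writes such as
 *
 *	writeq_relaxed(0, ptr + 8 * 0);
 *	writeq_relaxed(0, ptr + 8 * 1);
 *
 * can then fold the address arithmetic into the stores, e.g.
 *
 *	str	xzr, [x0]
 *	str	xzr, [x0, #8]
 *
 * Each access is still a single load/store of one general purpose register,
 * so an access that faults/traps to a hypervisor still reports
 * ESR_ELx.ISS.ISV=1 with a valid ESR_ELx.ISS.SRT.
 */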
#define __raw_writeb __raw_writeb
static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr)
{
        volatile u8 __iomem *ptr = addr;
        asm volatile("strb %w0, %1" : : "rZ" (val), "Qo" (*ptr));
}

#define __raw_writew __raw_writew
static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr)
{
        volatile u16 __iomem *ptr = addr;
        asm volatile("strh %w0, %1" : : "rZ" (val), "Qo" (*ptr));
}

#define __raw_writel __raw_writel
static __always_inline void __raw_writel(u32 val, volatile void __iomem *addr)
{
        volatile u32 __iomem *ptr = addr;
        asm volatile("str %w0, %1" : : "rZ" (val), "Qo" (*ptr));
}

#define __raw_writeq __raw_writeq
static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
{
        volatile u64 __iomem *ptr = addr;
        asm volatile("str %x0, %1" : : "rZ" (val), "Qo" (*ptr));
}
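
/*
 * The read accessors keep plain register addressing ("[%1]" with an "r"
 * operand) because ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE may patch the LDR
 * into an LDAR, and LDAR does not support immediate-offset addressing.
 */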
#define __raw_readb __raw_readb
static __always_inline u8 __raw_readb(const volatile void __iomem *addr)
{
        u8 val;
        asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
                                 "ldarb %w0, [%1]",
                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
                     : "=r" (val) : "r" (addr));
        return val;
}

#define __raw_readw __raw_readw
static __always_inline u16 __raw_readw(const volatile void __iomem *addr)
{
        u16 val;

        asm volatile(ALTERNATIVE("ldrh %w0, [%1]",
                                 "ldarh %w0, [%1]",
                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
                     : "=r" (val) : "r" (addr));
        return val;
}

#define __raw_readl __raw_readl
static __always_inline u32 __raw_readl(const volatile void __iomem *addr)
{
        u32 val;
        asm volatile(ALTERNATIVE("ldr %w0, [%1]",
                                 "ldar %w0, [%1]",
                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
                     : "=r" (val) : "r" (addr));
        return val;
}

#define __raw_readq __raw_readq
static __always_inline u64 __raw_readq(const volatile void __iomem *addr)
{
        u64 val;
        asm volatile(ALTERNATIVE("ldr %0, [%1]",
                                 "ldar %0, [%1]",
                                 ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE)
                     : "=r" (val) : "r" (addr));
        return val;
}

/* IO barriers */

#define __io_ar(v) \
({ \
        unsigned long tmp; \
 \
        dma_rmb(); \
 \
        /* \
         * Create a dummy control dependency from the IO read to any \
         * later instructions. This ensures that a subsequent call to \
         * udelay() will be ordered due to the ISB in get_cycles(). \
         */ \
        asm volatile("eor %0, %1, %1\n" \
                     "cbnz %0, ." \
                     : "=r" (tmp) : "r" ((unsigned long)(v)) \
                     : "memory"); \
})

#define __io_bw() dma_wmb()
#define __io_br(v)
#define __io_aw(v)

/* arm64-specific, don't use in portable drivers */
#define __iormb(v) __io_ar(v)
#define __iowmb() __io_bw()
#define __iomb() dma_mb()
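
/*
 * Note: with the Armv8 memory model strengthened to require
 * other-multi-copy atomicity, the DMB-based dma_wmb()/dma_rmb() above are
 * sufficient for shared peripherals. For example, in
 *
 *	memcpy(dma_buffer, data, bufsz);
 *	writel(DMA_START, dev->ctrl_reg);
 *
 * the barrier in writel() ensures that the buffer writes are visible to all
 * observers, including the device's master interface, before the write to
 * the control register reaches the device. Memory-mapped peripherals that
 * are private to a single CPU may still need a DSB, which is outside the
 * scope of the portable accessors.
 */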

/*
 * I/O port access primitives.
 */
#define arch_has_dev_port() (1)
#define IO_SPACE_LIMIT (PCI_IO_SIZE - 1)
#define PCI_IOBASE ((void __iomem *)PCI_IO_START)
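
/*
 * PCI_IO_START and PCI_IO_SIZE are defined in asm/memory.h along with the
 * other fixed VA space allocations, so IO_SPACE_LIMIT and PCI_IOBASE above
 * track the 16MiB PCI I/O window without duplicating any hard-coded sizes.
 */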

/*
 * The ARM64 iowrite implementation is intended to support drivers that want
 * to use write combining. For instance, PCI drivers using write combining
 * with a 64 byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the
 * PCIe bus.
 *
 * Newer ARM cores have sensitive write combining buffers, so it is important
 * that the stores be contiguous blocks of store instructions. Normal memcpy
 * approaches have a very low chance of generating write combining.
 *
 * Since this is the only API on ARM64 that should be used with write
 * combining, it also integrates the DGH hint which is supposed to lower the
 * latency to emit the large TLP from the CPU.
 */
static __always_inline void
__const_memcpy_toio_aligned32(volatile u32 __iomem *to, const u32 *from,
                              size_t count)
{
        switch (count) {
        case 8:
                asm volatile("str %w0, [%8, #4 * 0]\n"
                             "str %w1, [%8, #4 * 1]\n"
                             "str %w2, [%8, #4 * 2]\n"
                             "str %w3, [%8, #4 * 3]\n"
                             "str %w4, [%8, #4 * 4]\n"
                             "str %w5, [%8, #4 * 5]\n"
                             "str %w6, [%8, #4 * 6]\n"
                             "str %w7, [%8, #4 * 7]\n"
                             :
                             : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
                               "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
                               "rZ"(from[6]), "rZ"(from[7]), "r"(to));
                break;
        case 4:
                asm volatile("str %w0, [%4, #4 * 0]\n"
                             "str %w1, [%4, #4 * 1]\n"
                             "str %w2, [%4, #4 * 2]\n"
                             "str %w3, [%4, #4 * 3]\n"
                             :
                             : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
                               "rZ"(from[3]), "r"(to));
                break;
        case 2:
                asm volatile("str %w0, [%2, #4 * 0]\n"
                             "str %w1, [%2, #4 * 1]\n"
                             :
                             : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
                break;
        case 1:
                __raw_writel(*from, to);
                break;
        default:
                BUILD_BUG();
        }
}

void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count);

static __always_inline void
__iowrite32_copy(void __iomem *to, const void *from, size_t count)
{
        if (__builtin_constant_p(count) &&
            (count == 8 || count == 4 || count == 2 || count == 1)) {
                __const_memcpy_toio_aligned32(to, from, count);
                dgh();
        } else {
                __iowrite32_copy_full(to, from, count);
        }
}
#define __iowrite32_copy __iowrite32_copy

static __always_inline void
__const_memcpy_toio_aligned64(volatile u64 __iomem *to, const u64 *from,
                              size_t count)
{
        switch (count) {
        case 8:
                asm volatile("str %x0, [%8, #8 * 0]\n"
                             "str %x1, [%8, #8 * 1]\n"
                             "str %x2, [%8, #8 * 2]\n"
                             "str %x3, [%8, #8 * 3]\n"
                             "str %x4, [%8, #8 * 4]\n"
                             "str %x5, [%8, #8 * 5]\n"
                             "str %x6, [%8, #8 * 6]\n"
                             "str %x7, [%8, #8 * 7]\n"
                             :
                             : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
                               "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
                               "rZ"(from[6]), "rZ"(from[7]), "r"(to));
                break;
        case 4:
                asm volatile("str %x0, [%4, #8 * 0]\n"
                             "str %x1, [%4, #8 * 1]\n"
                             "str %x2, [%4, #8 * 2]\n"
                             "str %x3, [%4, #8 * 3]\n"
                             :
                             : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
                               "rZ"(from[3]), "r"(to));
                break;
        case 2:
                asm volatile("str %x0, [%2, #8 * 0]\n"
                             "str %x1, [%2, #8 * 1]\n"
                             :
                             : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
                break;
        case 1:
                __raw_writeq(*from, to);
                break;
        default:
                BUILD_BUG();
        }
}

void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count);

static __always_inline void
__iowrite64_copy(void __iomem *to, const void *from, size_t count)
{
        if (__builtin_constant_p(count) &&
            (count == 8 || count == 4 || count == 2 || count == 1)) {
                __const_memcpy_toio_aligned64(to, from, count);
                dgh();
        } else {
                __iowrite64_copy_full(to, from, count);
        }
}
#define __iowrite64_copy __iowrite64_copy
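
/*
 * Illustrative use (hypothetical device and BAR number): a driver wanting a
 * single 64 byte MemWr TLP can post a 64 byte descriptor through a
 * write-combining mapping in one call:
 *
 *	u64 desc[8] = { ... };
 *	void __iomem *wc_base;
 *
 *	wc_base = ioremap_wc(pci_resource_start(pdev, 2), SZ_4K);
 *	__iowrite64_copy(wc_base, desc, ARRAY_SIZE(desc));
 *
 * A constant count of 8 (i.e. 64 bytes) selects the inlined block of stores
 * above followed by the DGH hint. The __builtin_constant_p() checks live
 * inside the __always_inline wrappers themselves, so the BUILD_BUG() in the
 * aligned helpers cannot fire for a non-constant count even when a caller
 * is not inlined.
 */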

/*
 * I/O memory mapping functions.
 */

typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size,
                                   pgprot_t *prot);
int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook);

#define ioremap_prot ioremap_prot

#define _PAGE_IOREMAP PROT_DEVICE_nGnRE

#define ioremap_wc(addr, size) \
        ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC))
#define ioremap_np(addr, size) \
        ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
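
/*
 * Plain ioremap() uses _PAGE_IOREMAP (Device-nGnRE) via the generic code,
 * ioremap_np() uses the stricter Device-nGnRnE, and ioremap_wc() uses
 * Normal Non-Cacheable, the attribute normally used for write-combining
 * mappings.
 */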

/*
 * io{read,write}{16,32,64}be() macros
 */
#define ioread16be(p) ({ __u16 __v = be16_to_cpu((__force __be16)__raw_readw(p)); __iormb(__v); __v; })
#define ioread32be(p) ({ __u32 __v = be32_to_cpu((__force __be32)__raw_readl(p)); __iormb(__v); __v; })
#define ioread64be(p) ({ __u64 __v = be64_to_cpu((__force __be64)__raw_readq(p)); __iormb(__v); __v; })

#define iowrite16be(v,p) ({ __iowmb(); __raw_writew((__force __u16)cpu_to_be16(v), p); })
#define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
#define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); })

#include <asm-generic/io.h>

#define ioremap_cache ioremap_cache
static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size)
{
        if (pfn_is_map_memory(__phys_to_pfn(addr)))
                return (void __iomem *)__phys_to_virt(addr);

        return ioremap_prot(addr, size, __pgprot(PROT_NORMAL));
}

/*
 * More restrictive address range checking than the default implementation
 * (PHYS_OFFSET and PHYS_MASK taken into account).
 */
#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);

extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
                                        unsigned long flags);
#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
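
/*
 * On Arm CCA, a realm guest may see both unprotected (shared) MMIO regions
 * and MMIO regions emulated by a more privileged component in the Realm
 * world (protected). The check below, based on the region's RIPAS, lets
 * callers choose the appropriate stage-1 mapping for a given MMIO range.
 */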
static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size)
{
        if (unlikely(is_realm_world()))
                return __arm64_is_protected_mmio(phys_addr, size);
        return false;
}
#endif /* __ASM_IO_H */
|