2019-06-03 07:44:50 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
2013-11-06 17:20:22 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2013 ARM Ltd.
|
|
|
|
*/
|
|
|
|
#ifndef __ASM_WORD_AT_A_TIME_H
|
|
|
|
#define __ASM_WORD_AT_A_TIME_H
|
|
|
|
|
2016-12-24 11:46:01 -08:00
|
|
|
#include <linux/uaccess.h>
|
2016-01-01 15:02:12 +01:00
|
|
|
|
2013-11-06 17:20:22 +00:00
|
|
|
#ifndef __AARCH64EB__
|
|
|
|
|
2023-12-26 18:00:00 +00:00
|
|
|
#include <linux/bitops.h>
|
|
|
|
#include <linux/wordpart.h>
|
2013-11-06 17:20:22 +00:00
|
|
|
|
|
|
|
/*
 * Constants consumed by has_zero() in the little-endian implementation:
 * @one_bits is subtracted from the word under test and @high_bits selects
 * the per-byte flag bits of the result.
 */
struct word_at_a_time {
	const unsigned long one_bits, high_bits;
};
|
|
|
|
|
|
|
|
/*
 * Initializer for struct word_at_a_time: one_bits is REPEAT_BYTE(0x01) and
 * high_bits is REPEAT_BYTE(0x80).
 * NOTE(review): REPEAT_BYTE presumably replicates the byte into every byte
 * of an unsigned long -- confirm against <linux/wordpart.h>.
 */
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
|
|
|
|
|
|
|
|
static inline unsigned long has_zero(unsigned long a, unsigned long *bits,
|
|
|
|
const struct word_at_a_time *c)
|
|
|
|
{
|
|
|
|
unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
|
|
|
|
*bits = mask;
|
|
|
|
return mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Expands to @bits unchanged; @a and @c are ignored (no extra preparation here). */
#define prep_zero_mask(a, bits, c) (bits)
|
arm64: word-at-a-time: improve byte count calculations for LE
Do the same optimization as x86-64: do __ffs() on the intermediate value
that found whether there is a zero byte, before we've actually computed
the final byte mask.
The logic is:
has_zero():
Check if the word has a zero byte in it, which indicates the end
of the loop, and prepare a value to be used for the rest of the
sequence.
The standard LE implementation just creates a word that has the
high bit set in each byte of the word that was zero.
Example: 0xaa00bbccdd00eeff -> 0x0080000000800000
prep_zero_mask():
Possibly do more prep to then clean up the initial fast result
from has_zero, so that it can be combined with another zero mask
with a simple logical "or" to create a final mask.
This is only used on big-endian machines that use a different
algorithm, and is a no-op here.
create_zero_mask():
This is "step 1" of creating the count and the mask, and is
meant for any common operations between the two.
In the old implementation, this actually created the zero mask,
that was then used for masking and for counting the number of
bits in the mask.
In the new implementation, this is a no-op.
count_zero():
This takes the mask bits, and counts the number of bytes before
the first zero byte.
In the old implementation, it counted the number of bits in the
final byte mask (which was the same as the C standard "find last
set bit" that uses the silly "starts at one" counting) and shifted
the value down by three.
In the new implementation, we know the intermediate mask isn't
zero, and it just does "find first set" with the sane semantics
without any off-by-one issues, and again shifts by three (which
also masks off the bit offset in the zero byte itself).
Example: 0x0080000000800000 -> 2
zero_bytemask():
This takes the mask bits, and turns it into an actual byte mask
of the bytes preceding the first zero byte.
In the old implementation, this was a no-op, because the work
had already been done by create_zero_mask().
In the new implementation, this does what create_zero_mask()
used to do.
Example: 0x0080000000800000 -> 0x000000000000ffff
The difference between the old and the new implementation is that
"count_zero()" ends up scheduling better because it is being done on a
value that is available earlier (before the final mask).
But more importantly, it can be implemented without the insane semantics
of the standard bit finding helpers that have the off-by-one issue and
have to special-case the zero mask situation.
On arm64, the new "count_zero()" ends up just "rbit + clz" plus the
shift right that then ends up being subsumed by the "add to final
length".
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-06-18 18:14:48 -07:00
|
|
|
/* Expands to @bits unchanged; the byte mask itself is built in zero_bytemask(). */
#define create_zero_mask(bits) (bits)
|
|
|
|
/*
 * Byte index of the lowest flagged byte in @bits: the bit index returned by
 * __ffs() shifted right by 3 (i.e. divided by 8).
 * Example: 0x0080000000800000 -> 2
 * NOTE(review): callers are expected to pass a non-zero @bits -- confirm
 * __ffs() semantics for 0 before relying on it.
 */
#define find_zero(bits) (__ffs(bits) >> 3)
|
2013-11-06 17:20:22 +00:00
|
|
|
|
arm64: word-at-a-time: improve byte count calculations for LE
Do the same optimization as x86-64: do __ffs() on the intermediate value
that found whether there is a zero byte, before we've actually computed
the final byte mask.
The logic is:
has_zero():
Check if the word has a zero byte in it, which indicates the end
of the loop, and prepare a value to be used for the rest of the
sequence.
The standard LE implementation just creates a word that has the
high bit set in each byte of the word that was zero.
Example: 0xaa00bbccdd00eeff -> 0x0080000000800000
prep_zero_mask():
Possibly do more prep to then clean up the initial fast result
from has_zero, so that it can be combined with another zero mask
with a simple logical "or" to create a final mask.
This is only used on big-endian machines that use a different
algorithm, and is a no-op here.
create_zero_mask():
This is "step 1" of creating the count and the mask, and is
meant for any common operations between the two.
In the old implementation, this actually created the zero mask,
that was then used for masking and for counting the number of
bits in the mask.
In the new implementation, this is a no-op.
count_zero():
This takes the mask bits, and counts the number of bytes before
the first zero byte.
In the old implementation, it counted the number of bits in the
final byte mask (which was the same as the C standard "find last
set bit" that uses the silly "starts at one" counting) and shifted
the value down by three.
In the new implementation, we know the intermediate mask isn't
zero, and it just does "find first set" with the sane semantics
without any off-by-one issues, and again shifts by three (which
also masks off the bit offset in the zero byte itself).
Example: 0x0080000000800000 -> 2
zero_bytemask():
This takes the mask bits, and turns it into an actual byte mask
of the bytes preceding the first zero byte.
In the old implementation, this was a no-op, because the work
had already been done by create_zero_mask().
In the new implementation, this does what create_zero_mask()
used to do.
Example: 0x0080000000800000 -> 0x000000000000ffff
The difference between the old and the new implementation is that
"count_zero()" ends up scheduling better because it is being done on a
value that is available earlier (before the final mask).
But more importantly, it can be implemented without the insane semantics
of the standard bit finding helpers that have the off-by-one issue and
have to special-case the zero mask situation.
On arm64, the new "count_zero()" ends up just "rbit + clz" plus the
shift right that then ends up being subsumed by the "add to final
length".
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-06-18 18:14:48 -07:00
|
|
|
/*
 * Turn the flag bits produced by has_zero() into a byte mask covering the
 * bytes below the first flagged byte.
 *
 * (bits - 1) & ~bits sets every bit below the lowest set bit of @bits;
 * shifting right by 7 drops the partial byte that led up to the flag bit.
 *
 * Example: 0x0080000000800000 -> 0x000000000000ffff
 */
static inline unsigned long zero_bytemask(unsigned long bits)
{
	bits = (bits - 1) & ~bits;
	return bits >> 7;
}
|
|
|
|
|
|
|
|
#else /* __AARCH64EB__ */
|
|
|
|
#include <asm-generic/word-at-a-time.h>
|
|
|
|
#endif
|
|
|
|
|
2013-11-06 19:32:13 +00:00
|
|
|
/*
|
|
|
|
* Load an unaligned word from kernel space.
|
|
|
|
*
|
|
|
|
* In the (very unlikely) case of the word being a page-crosser
|
|
|
|
* and the next page not being mapped, take the exception and
|
|
|
|
* return zeroes in the non-existing part.
|
|
|
|
*/
|
|
|
|
static inline unsigned long load_unaligned_zeropad(const void *addr)
|
|
|
|
{
|
arm64: extable: add load_unaligned_zeropad() handler
For inline assembly, we place exception fixups out-of-line in the
`.fixup` section such that these are out of the way of the fast path.
This has a few drawbacks:
* Since the fixup code is anonymous, backtraces will symbolize fixups as
offsets from the nearest prior symbol, currently
`__entry_tramp_text_end`. This is confusing, and painful to debug
without access to the relevant vmlinux.
* Since the exception handler adjusts the PC to execute the fixup, and
the fixup uses a direct branch back into the function it fixes,
backtraces of fixups miss the original function. This is confusing,
and violates requirements for RELIABLE_STACKTRACE (and therefore
LIVEPATCH).
* Inline assembly and associated fixups are generated from templates,
and we have many copies of logically identical fixups which only
differ in which specific registers are written to and which address is
branched to at the end of the fixup. This is potentially wasteful of
I-cache resources, and makes it hard to add additional logic to fixups
without significant bloat.
* In the case of load_unaligned_zeropad(), the logic in the fixup
requires a temporary register that we must allocate even in the
fast-path where it will not be used.
This patch address all four concerns for load_unaligned_zeropad() fixups
by adding a dedicated exception handler which performs the fixup logic
in exception context and subsequent returns back after the faulting
instruction. For the moment, the fixup logic is identical to the old
assembly fixup logic, but in future we could enhance this by taking the
ESR and FAR into account to constrain the faults we try to fix up, or to
specialize fixups for MTE tag check faults.
Other than backtracing, there should be no functional change as a result
of this patch.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211019160219.5202-13-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
2021-10-19 17:02:18 +01:00
|
|
|
unsigned long ret;
|
2013-11-06 19:32:13 +00:00
|
|
|
|
2023-03-11 00:43:31 +01:00
|
|
|
__mte_enable_tco_async();
|
2021-03-15 13:20:15 +00:00
|
|
|
|
2013-11-06 19:32:13 +00:00
|
|
|
/* Load word from unaligned pointer addr */
|
|
|
|
asm(
|
arm64: extable: add load_unaligned_zeropad() handler
For inline assembly, we place exception fixups out-of-line in the
`.fixup` section such that these are out of the way of the fast path.
This has a few drawbacks:
* Since the fixup code is anonymous, backtraces will symbolize fixups as
offsets from the nearest prior symbol, currently
`__entry_tramp_text_end`. This is confusing, and painful to debug
without access to the relevant vmlinux.
* Since the exception handler adjusts the PC to execute the fixup, and
the fixup uses a direct branch back into the function it fixes,
backtraces of fixups miss the original function. This is confusing,
and violates requirements for RELIABLE_STACKTRACE (and therefore
LIVEPATCH).
* Inline assembly and associated fixups are generated from templates,
and we have many copies of logically identical fixups which only
differ in which specific registers are written to and which address is
branched to at the end of the fixup. This is potentially wasteful of
I-cache resources, and makes it hard to add additional logic to fixups
without significant bloat.
* In the case of load_unaligned_zeropad(), the logic in the fixup
requires a temporary register that we must allocate even in the
fast-path where it will not be used.
This patch address all four concerns for load_unaligned_zeropad() fixups
by adding a dedicated exception handler which performs the fixup logic
in exception context and subsequent returns back after the faulting
instruction. For the moment, the fixup logic is identical to the old
assembly fixup logic, but in future we could enhance this by taking the
ESR and FAR into account to constrain the faults we try to fix up, or to
specialize fixups for MTE tag check faults.
Other than backtracing, there should be no functional change as a result
of this patch.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211019160219.5202-13-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
2021-10-19 17:02:18 +01:00
|
|
|
"1: ldr %0, %2\n"
|
2013-11-06 19:32:13 +00:00
|
|
|
"2:\n"
|
arm64: extable: add load_unaligned_zeropad() handler
For inline assembly, we place exception fixups out-of-line in the
`.fixup` section such that these are out of the way of the fast path.
This has a few drawbacks:
* Since the fixup code is anonymous, backtraces will symbolize fixups as
offsets from the nearest prior symbol, currently
`__entry_tramp_text_end`. This is confusing, and painful to debug
without access to the relevant vmlinux.
* Since the exception handler adjusts the PC to execute the fixup, and
the fixup uses a direct branch back into the function it fixes,
backtraces of fixups miss the original function. This is confusing,
and violates requirements for RELIABLE_STACKTRACE (and therefore
LIVEPATCH).
* Inline assembly and associated fixups are generated from templates,
and we have many copies of logically identical fixups which only
differ in which specific registers are written to and which address is
branched to at the end of the fixup. This is potentially wasteful of
I-cache resources, and makes it hard to add additional logic to fixups
without significant bloat.
* In the case of load_unaligned_zeropad(), the logic in the fixup
requires a temporary register that we must allocate even in the
fast-path where it will not be used.
This patch address all four concerns for load_unaligned_zeropad() fixups
by adding a dedicated exception handler which performs the fixup logic
in exception context and subsequent returns back after the faulting
instruction. For the moment, the fixup logic is identical to the old
assembly fixup logic, but in future we could enhance this by taking the
ESR and FAR into account to constrain the faults we try to fix up, or to
specialize fixups for MTE tag check faults.
Other than backtracing, there should be no functional change as a result
of this patch.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211019160219.5202-13-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
2021-10-19 17:02:18 +01:00
|
|
|
_ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(1b, 2b, %0, %1)
|
|
|
|
: "=&r" (ret)
|
2013-11-06 19:32:13 +00:00
|
|
|
: "r" (addr), "Q" (*(unsigned long *)addr));
|
|
|
|
|
2023-03-11 00:43:31 +01:00
|
|
|
__mte_disable_tco_async();
|
2021-03-15 13:20:15 +00:00
|
|
|
|
2013-11-06 19:32:13 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-06 17:20:22 +00:00
|
|
|
#endif /* __ASM_WORD_AT_A_TIME_H */
|