linux/arch/arm64/include/asm/atomic_lse.h

316 lines
8.2 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Based on arch/arm/include/asm/atomic.h
*
* Copyright (C) 1996 Russell King.
* Copyright (C) 2002 Deep Blue Solutions Ltd.
* Copyright (C) 2012 ARM Ltd.
*/
#ifndef __ASM_ATOMIC_LSE_H
#define __ASM_ATOMIC_LSE_H
#define ATOMIC_OP(op, asm_op) \
static __always_inline void \
__lse_atomic_##op(int i, atomic_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
arm64: atomics: format whitespace consistently The code for the atomic ops is formatted inconsistently, and while this is not a functional problem it is rather distracting when working on them. Some have ops have consistent indentation, e.g. | #define ATOMIC_OP_ADD_RETURN(name, mb, cl...) \ | static inline int __lse_atomic_add_return##name(int i, atomic_t *v) \ | { \ | u32 tmp; \ | \ | asm volatile( \ | __LSE_PREAMBLE \ | " ldadd" #mb " %w[i], %w[tmp], %[v]\n" \ | " add %w[i], %w[i], %w[tmp]" \ | : [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp) \ | : "r" (v) \ | : cl); \ | \ | return i; \ | } While others have negative indentation for some lines, and/or have misaligned trailing backslashes, e.g. | static inline void __lse_atomic_##op(int i, atomic_t *v) \ | { \ | asm volatile( \ | __LSE_PREAMBLE \ | " " #asm_op " %w[i], %[v]\n" \ | : [i] "+r" (i), [v] "+Q" (v->counter) \ | : "r" (v)); \ | } This patch makes the indentation consistent and also aligns the trailing backslashes. This makes the code easier to read for those (like myself) who are easily distracted by these inconsistencies. This is intended as a cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:06 +00:00
" " #asm_op " %w[i], %[v]\n" \
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
}
ATOMIC_OP(andnot, stclr)
ATOMIC_OP(or, stset)
ATOMIC_OP(xor, steor)
ATOMIC_OP(add, stadd)
static __always_inline void __lse_atomic_sub(int i, atomic_t *v)
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
{
__lse_atomic_add(-i, v);
}
#undef ATOMIC_OP
#define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...) \
static __always_inline int \
__lse_atomic_fetch_##op##name(int i, atomic_t *v) \
{ \
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
int old; \
\
asm volatile( \
__LSE_PREAMBLE \
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
" " #asm_op #mb " %w[i], %w[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
: [i] "r" (i) \
: cl); \
\
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
return old; \
}
#define ATOMIC_FETCH_OPS(op, asm_op) \
ATOMIC_FETCH_OP(_relaxed, , op, asm_op) \
ATOMIC_FETCH_OP(_acquire, a, op, asm_op, "memory") \
ATOMIC_FETCH_OP(_release, l, op, asm_op, "memory") \
ATOMIC_FETCH_OP( , al, op, asm_op, "memory")
ATOMIC_FETCH_OPS(andnot, ldclr)
ATOMIC_FETCH_OPS(or, ldset)
ATOMIC_FETCH_OPS(xor, ldeor)
ATOMIC_FETCH_OPS(add, ldadd)
#undef ATOMIC_FETCH_OP
#undef ATOMIC_FETCH_OPS
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
#define ATOMIC_FETCH_OP_SUB(name) \
static __always_inline int \
__lse_atomic_fetch_sub##name(int i, atomic_t *v) \
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
{ \
return __lse_atomic_fetch_add##name(-i, v); \
}
ATOMIC_FETCH_OP_SUB(_relaxed)
ATOMIC_FETCH_OP_SUB(_acquire)
ATOMIC_FETCH_OP_SUB(_release)
ATOMIC_FETCH_OP_SUB( )
#undef ATOMIC_FETCH_OP_SUB
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
#define ATOMIC_OP_ADD_SUB_RETURN(name) \
static __always_inline int \
__lse_atomic_add_return##name(int i, atomic_t *v) \
{ \
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
return __lse_atomic_fetch_add##name(i, v) + i; \
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
} \
\
static __always_inline int \
__lse_atomic_sub_return##name(int i, atomic_t *v) \
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
{ \
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
return __lse_atomic_fetch_sub(i, v) - i; \
}
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
ATOMIC_OP_ADD_SUB_RETURN(_relaxed)
ATOMIC_OP_ADD_SUB_RETURN(_acquire)
ATOMIC_OP_ADD_SUB_RETURN(_release)
ATOMIC_OP_ADD_SUB_RETURN( )
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
#undef ATOMIC_OP_ADD_SUB_RETURN
static __always_inline void __lse_atomic_and(int i, atomic_t *v)
{
arm64: atomics: lse: define ANDs in terms of ANDNOTs The FEAT_LSE atomic instructions include atomic bit-clear instructions (`ldclr*` and `stclr*`) which can be used to directly implement ANDNOT operations. Each AND op is implemented as a copy of the corresponding ANDNOT op with a leading `mvn` instruction to apply a bitwise NOT to the `i` argument. As the compiler has no visibility of the `mvn`, this leads to less than optimal code generation when generating `i` into a register. For example, __lse_atomic_fetch_and(0xf, v) can be compiled to: mov w1, #0xf mvn w1, w1 ldclral w1, w1, [x2] This patch improves this by replacing the `mvn` with NOT in C before the inline assembly block, e.g. i = ~i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xfffffff0 ldclral w1, w1, [x2] With this change the assembly for each AND op is identical to the corresponding ANDNOT op (including barriers and clobbers), so I've removed the inline assembly and rewritten each AND op in terms of the corresponding ANDNOT op, e.g. | static inline void __lse_atomic_and(int i, atomic_t *v) | { | return __lse_atomic_andnot(~i, v); | } This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:08 +00:00
return __lse_atomic_andnot(~i, v);
}
#define ATOMIC_FETCH_OP_AND(name, mb, cl...) \
static __always_inline int \
__lse_atomic_fetch_and##name(int i, atomic_t *v) \
{ \
arm64: atomics: lse: define ANDs in terms of ANDNOTs The FEAT_LSE atomic instructions include atomic bit-clear instructions (`ldclr*` and `stclr*`) which can be used to directly implement ANDNOT operations. Each AND op is implemented as a copy of the corresponding ANDNOT op with a leading `mvn` instruction to apply a bitwise NOT to the `i` argument. As the compiler has no visibility of the `mvn`, this leads to less than optimal code generation when generating `i` into a register. For example, __lse_atomic_fetch_and(0xf, v) can be compiled to: mov w1, #0xf mvn w1, w1 ldclral w1, w1, [x2] This patch improves this by replacing the `mvn` with NOT in C before the inline assembly block, e.g. i = ~i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xfffffff0 ldclral w1, w1, [x2] With this change the assembly for each AND op is identical to the corresponding ANDNOT op (including barriers and clobbers), so I've removed the inline assembly and rewritten each AND op in terms of the corresponding ANDNOT op, e.g. | static inline void __lse_atomic_and(int i, atomic_t *v) | { | return __lse_atomic_andnot(~i, v); | } This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:08 +00:00
return __lse_atomic_fetch_andnot##name(~i, v); \
}
ATOMIC_FETCH_OP_AND(_relaxed, )
ATOMIC_FETCH_OP_AND(_acquire, a, "memory")
ATOMIC_FETCH_OP_AND(_release, l, "memory")
ATOMIC_FETCH_OP_AND( , al, "memory")
#undef ATOMIC_FETCH_OP_AND
#define ATOMIC64_OP(op, asm_op) \
static __always_inline void \
__lse_atomic64_##op(s64 i, atomic64_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
arm64: atomics: format whitespace consistently The code for the atomic ops is formatted inconsistently, and while this is not a functional problem it is rather distracting when working on them. Some have ops have consistent indentation, e.g. | #define ATOMIC_OP_ADD_RETURN(name, mb, cl...) \ | static inline int __lse_atomic_add_return##name(int i, atomic_t *v) \ | { \ | u32 tmp; \ | \ | asm volatile( \ | __LSE_PREAMBLE \ | " ldadd" #mb " %w[i], %w[tmp], %[v]\n" \ | " add %w[i], %w[i], %w[tmp]" \ | : [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp) \ | : "r" (v) \ | : cl); \ | \ | return i; \ | } While others have negative indentation for some lines, and/or have misaligned trailing backslashes, e.g. | static inline void __lse_atomic_##op(int i, atomic_t *v) \ | { \ | asm volatile( \ | __LSE_PREAMBLE \ | " " #asm_op " %w[i], %[v]\n" \ | : [i] "+r" (i), [v] "+Q" (v->counter) \ | : "r" (v)); \ | } This patch makes the indentation consistent and also aligns the trailing backslashes. This makes the code easier to read for those (like myself) who are easily distracted by these inconsistencies. This is intended as a cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:06 +00:00
" " #asm_op " %[i], %[v]\n" \
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
}
ATOMIC64_OP(andnot, stclr)
ATOMIC64_OP(or, stset)
ATOMIC64_OP(xor, steor)
ATOMIC64_OP(add, stadd)
static __always_inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
{
__lse_atomic64_add(-i, v);
}
#undef ATOMIC64_OP
#define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...) \
static __always_inline long \
__lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \
{ \
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
s64 old; \
\
asm volatile( \
__LSE_PREAMBLE \
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
" " #asm_op #mb " %[i], %[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
: [i] "r" (i) \
: cl); \
\
arm64: atomics: lse: improve constraints for simple ops We have overly conservative assembly constraints for the basic FEAT_LSE atomic instructions, and using more accurate and permissive constraints will allow for better code generation. The FEAT_LSE basic atomic instructions have come in two forms: LD{op}{order}{size} <Rs>, <Rt>, [<Rn>] ST{op}{order}{size} <Rs>, [<Rn>] The ST* forms are aliases of the LD* forms where: ST{op}{order}{size} <Rs>, [<Rn>] Is: LD{op}{order}{size} <Rs>, XZR, [<Rn>] For either form, both <Rs> and <Rn> are read but not written back to, and <Rt> is written with the original value of the memory location. Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the other register value(s) are consumed. There are no UNPREDICTABLE or CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or <Rn> are the same register. Our current inline assembly always uses <Rs> == <Rt>, treating this register as both an input and an output (using a '+r' constraint). This forces the compiler to do some unnecessary register shuffling and/or redundant value generation. For example, the compiler cannot reuse the <Rs> value, and currently GCC 11.1.0 will compile: __lse_atomic_add(1, a); __lse_atomic_add(1, b); __lse_atomic_add(1, c); As: mov w3, #0x1 mov w4, w3 stadd w4, [x0] mov w0, w3 stadd w0, [x1] stadd w3, [x2] We can improve this with more accurate constraints, separating <Rs> and <Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an output-only value ('=r'). As <Rt> is written back after <Rs> is consumed, it does not need to be earlyclobber ('=&r'), leaving the compiler free to use the same register for both <Rs> and <Rt> where this is desirable. At the same time, the redundant 'r' constraint for `v` is removed, as the `+Q` constraint is sufficient. With this change, the above example becomes: mov w3, #0x1 stadd w3, [x0] stadd w3, [x1] stadd w3, [x2] I've made this change for the non-value-returning and FETCH ops. The RETURN ops have a multi-instruction sequence for which we cannot use the same constraints, and a subsequent patch will rewrite hte RETURN ops in terms of the FETCH ops, relying on the ability for the compiler to reuse the <Rs> value. This is intended as an optimization. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:09 +00:00
return old; \
}
#define ATOMIC64_FETCH_OPS(op, asm_op) \
ATOMIC64_FETCH_OP(_relaxed, , op, asm_op) \
ATOMIC64_FETCH_OP(_acquire, a, op, asm_op, "memory") \
ATOMIC64_FETCH_OP(_release, l, op, asm_op, "memory") \
ATOMIC64_FETCH_OP( , al, op, asm_op, "memory")
ATOMIC64_FETCH_OPS(andnot, ldclr)
ATOMIC64_FETCH_OPS(or, ldset)
ATOMIC64_FETCH_OPS(xor, ldeor)
ATOMIC64_FETCH_OPS(add, ldadd)
#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_FETCH_OPS
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
#define ATOMIC64_FETCH_OP_SUB(name) \
static __always_inline long \
__lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v) \
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
{ \
return __lse_atomic64_fetch_add##name(-i, v); \
}
ATOMIC64_FETCH_OP_SUB(_relaxed)
ATOMIC64_FETCH_OP_SUB(_acquire)
ATOMIC64_FETCH_OP_SUB(_release)
ATOMIC64_FETCH_OP_SUB( )
#undef ATOMIC64_FETCH_OP_SUB
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
#define ATOMIC64_OP_ADD_SUB_RETURN(name) \
static __always_inline long \
__lse_atomic64_add_return##name(s64 i, atomic64_t *v) \
{ \
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
return __lse_atomic64_fetch_add##name(i, v) + i; \
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
} \
\
static __always_inline long \
__lse_atomic64_sub_return##name(s64 i, atomic64_t *v) \
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
{ \
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
return __lse_atomic64_fetch_sub##name(i, v) - i; \
}
arm64: atomics: lse: define RETURN ops in terms of FETCH ops The FEAT_LSE atomic instructions include LD* instructions which return the original value of a memory location can be used to directly implement FETCH opertations. Each RETURN op is implemented as a copy of the corresponding FETCH op with a trailing instruction to generate the new value of the memory location. We only directly implement *_fetch_add*(), for which we have a trailing `add` instruction. As the compiler has no visibility of the `add`, this leads to less than optimal code generation when consuming the result. For example, the compiler cannot constant-fold the addition into later operations, and currently GCC 11.1.0 will compile: return __lse_atomic_sub_return(1, v) == 0; As: mov w1, #0xffffffff ldaddal w1, w2, [x0] add w1, w1, w2 cmp w1, #0x0 cset w0, eq // eq = none ret This patch improves this by replacing the `add` with C addition after the inline assembly block, e.g. ret += i; This allows the compiler to manipulate `i`. This permits the compiler to merge the `add` and `cmp` for the above, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x0] cmp w1, #0x1 cset w0, eq // eq = none ret With this change the assembly for each RETURN op is identical to the corresponding FETCH op (including barriers and clobbers) so I've removed the inline assembly and rewritten each RETURN op in terms of the corresponding FETCH op, e.g. | static inline void __lse_atomic_add_return(int i, atomic_t *v) | { | return __lse_atomic_fetch_add(i, v) + i | } The new construction does not adversely affect the common case, and before and after this patch GCC 11.1.0 can compile: __lse_atomic_add_return(i, v) As: ldaddal w0, w2, [x1] add w0, w0, w2 ... while having the freedom to do better elsewhere. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:10 +00:00
ATOMIC64_OP_ADD_SUB_RETURN(_relaxed)
ATOMIC64_OP_ADD_SUB_RETURN(_acquire)
ATOMIC64_OP_ADD_SUB_RETURN(_release)
ATOMIC64_OP_ADD_SUB_RETURN( )
arm64: atomics lse: define SUBs in terms of ADDs The FEAT_LSE atomic instructions include atomic ADD instructions (`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so we must build all of the SUB operations using the ADD instructions. We open-code these today, with each SUB op implemented as a copy of the corresponding ADD op with a leading `neg` instruction in the inline assembly to negate the `i` argument. As the compiler has no visibility of the `neg`, this leads to less than optimal code generation when generating `i` into a register. For example, __les_atomic_fetch_sub(1, v) can be compiled to: mov w1, #0x1 neg w1, w1 ldaddal w1, w1, [x2] This patch improves this by replacing the `neg` with negation in C before the inline assembly block, e.g. i = -i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xffffffff ldaddal w1, w1, [x2] With this change the assembly for each SUB op is identical to the corresponding ADD op (including barriers and clobbers), so I've removed the inline assembly and rewritten each SUB op in terms of the corresponding ADD op, e.g. | static inline void __lse_atomic_sub(int i, atomic_t *v) | { | __lse_atomic_add(-i, v); | } For clarity I've moved the definition of each SUB op immediately after the corresponding ADD op, and used a single macro to create the RETURN forms of both ops. This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:07 +00:00
#undef ATOMIC64_OP_ADD_SUB_RETURN
static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v)
{
arm64: atomics: lse: define ANDs in terms of ANDNOTs The FEAT_LSE atomic instructions include atomic bit-clear instructions (`ldclr*` and `stclr*`) which can be used to directly implement ANDNOT operations. Each AND op is implemented as a copy of the corresponding ANDNOT op with a leading `mvn` instruction to apply a bitwise NOT to the `i` argument. As the compiler has no visibility of the `mvn`, this leads to less than optimal code generation when generating `i` into a register. For example, __lse_atomic_fetch_and(0xf, v) can be compiled to: mov w1, #0xf mvn w1, w1 ldclral w1, w1, [x2] This patch improves this by replacing the `mvn` with NOT in C before the inline assembly block, e.g. i = ~i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xfffffff0 ldclral w1, w1, [x2] With this change the assembly for each AND op is identical to the corresponding ANDNOT op (including barriers and clobbers), so I've removed the inline assembly and rewritten each AND op in terms of the corresponding ANDNOT op, e.g. | static inline void __lse_atomic_and(int i, atomic_t *v) | { | return __lse_atomic_andnot(~i, v); | } This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:08 +00:00
return __lse_atomic64_andnot(~i, v);
}
#define ATOMIC64_FETCH_OP_AND(name, mb, cl...) \
static __always_inline long \
__lse_atomic64_fetch_and##name(s64 i, atomic64_t *v) \
{ \
arm64: atomics: lse: define ANDs in terms of ANDNOTs The FEAT_LSE atomic instructions include atomic bit-clear instructions (`ldclr*` and `stclr*`) which can be used to directly implement ANDNOT operations. Each AND op is implemented as a copy of the corresponding ANDNOT op with a leading `mvn` instruction to apply a bitwise NOT to the `i` argument. As the compiler has no visibility of the `mvn`, this leads to less than optimal code generation when generating `i` into a register. For example, __lse_atomic_fetch_and(0xf, v) can be compiled to: mov w1, #0xf mvn w1, w1 ldclral w1, w1, [x2] This patch improves this by replacing the `mvn` with NOT in C before the inline assembly block, e.g. i = ~i; This allows the compiler to generate `i` into a register more optimally, e.g. mov w1, #0xfffffff0 ldclral w1, w1, [x2] With this change the assembly for each AND op is identical to the corresponding ANDNOT op (including barriers and clobbers), so I've removed the inline assembly and rewritten each AND op in terms of the corresponding ANDNOT op, e.g. | static inline void __lse_atomic_and(int i, atomic_t *v) | { | return __lse_atomic_andnot(~i, v); | } This is intended as an optimization and cleanup. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will@kernel.org> Acked-by: Will Deacon <will@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20211210151410.2782645-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2021-12-10 15:14:08 +00:00
return __lse_atomic64_fetch_andnot##name(~i, v); \
}
ATOMIC64_FETCH_OP_AND(_relaxed, )
ATOMIC64_FETCH_OP_AND(_acquire, a, "memory")
ATOMIC64_FETCH_OP_AND(_release, l, "memory")
ATOMIC64_FETCH_OP_AND( , al, "memory")
#undef ATOMIC64_FETCH_OP_AND
static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
{
unsigned long tmp;
asm volatile(
__LSE_PREAMBLE
"1: ldr %x[tmp], %[v]\n"
" subs %[ret], %x[tmp], #1\n"
" b.lt 2f\n"
" casal %x[tmp], %[ret], %[v]\n"
" sub %x[tmp], %x[tmp], #1\n"
" sub %x[tmp], %x[tmp], %[ret]\n"
" cbnz %x[tmp], 1b\n"
"2:"
: [ret] "+&r" (v), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)
:
: "cc", "memory");
return (long)v;
}
#define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...) \
static __always_inline u##sz \
__lse__cmpxchg_case_##name##sz(volatile void *ptr, \
u##sz old, \
u##sz new) \
{ \
asm volatile( \
__LSE_PREAMBLE \
arm64: atomics: lse: improve cmpxchg implementation For historical reasons, the LSE implementation of cmpxchg*() hard-codes the GPRs to use, and shuffles registers around with MOVs. This is no longer necessary, and can be simplified. When the LSE cmpxchg implementation was added in commit: c342f78217e822d2 ("arm64: cmpxchg: patch in lse instructions when supported by the CPU") ... the LL/SC implementation of cmpxchg() would be placed out-of-line, and the in-line assembly for cmpxchg would default to: NOP BL <ll_sc_cmpxchg*_implementation> NOP The LL/SC implementation of each cmpxchg() function accepted arguments as per AAPCS64 rules, to it was necessary to place the pointer in x0, the older value in X1, and the new value in x2, and acquire the return value from x0. The LL/SC implementation required a temporary register (e.g. for the STXR status value). As the LL/SC implementation preserved the old value, the LSE implementation does likewise. Since commit: addfc38672c73efd ("arm64: atomics: avoid out-of-line ll/sc atomics") ... the LSE and LL/SC implementations of cmpxchg are inlined as separate asm blocks, with another branch choosing between thw two. Due to this, it is no longer necessary for the LSE implementation to match the register constraints of the LL/SC implementation. This was partially dealt with by removing the hard-coded use of x30 in commit: 3337cb5aea594e40 ("arm64: avoid using hard-coded registers for LSE atomics") ... but we didn't clean up the hard-coding of x0, x1, and x2. This patch simplifies the LSE implementation of cmpxchg, removing the register shuffling and directly clobbering the 'old' argument. This gives the compiler greater freedom for register allocation, and avoids redundant work. The new constraints permit 'old' (Rs) and 'new' (Rt) to be allocated to the same register when the initial values of the two are the same, e.g. resulting in: CAS X0, X0, [X1] This is safe as Rs is only written back after the initial values of Rs and Rt are consumed, and there are no UNPREDICTABLE behaviours to avoid when Rs == Rt. The new constraints also permit 'new' to be allocated to the zero register, avoiding a MOV in a few cases. The same cannot be done for 'old' as it is both an input and output, and any caller of cmpxchg() should care about the output value. Note that for CAS* the use of the zero register never affects the ordering (while for SWP* the use of the zero regsiter for the 'old' value drops any ACQUIRE semantic). Compared to v6.2-rc4, a defconfig vmlinux is ~116KiB smaller, though the resulting Image is the same size due to internal alignment and padding: [mark@lakrids:~/src/linux]% ls -al vmlinux-* -rwxr-xr-x 1 mark mark 137269304 Jan 16 11:59 vmlinux-after -rwxr-xr-x 1 mark mark 137387936 Jan 16 10:54 vmlinux-before [mark@lakrids:~/src/linux]% ls -al Image-* -rw-r--r-- 1 mark mark 38711808 Jan 16 11:59 Image-after -rw-r--r-- 1 mark mark 38711808 Jan 16 10:54 Image-before This patch does not touch cmpxchg_double*() as that requires contiguous register pairs, and separate patches will replace it with cmpxchg128*(). There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20230314153700.787701-2-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2023-03-14 15:36:57 +00:00
" cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \
: [v] "+Q" (*(u##sz *)ptr), \
[old] "+r" (old) \
: [new] "rZ" (new) \
: cl); \
\
arm64: atomics: lse: improve cmpxchg implementation For historical reasons, the LSE implementation of cmpxchg*() hard-codes the GPRs to use, and shuffles registers around with MOVs. This is no longer necessary, and can be simplified. When the LSE cmpxchg implementation was added in commit: c342f78217e822d2 ("arm64: cmpxchg: patch in lse instructions when supported by the CPU") ... the LL/SC implementation of cmpxchg() would be placed out-of-line, and the in-line assembly for cmpxchg would default to: NOP BL <ll_sc_cmpxchg*_implementation> NOP The LL/SC implementation of each cmpxchg() function accepted arguments as per AAPCS64 rules, to it was necessary to place the pointer in x0, the older value in X1, and the new value in x2, and acquire the return value from x0. The LL/SC implementation required a temporary register (e.g. for the STXR status value). As the LL/SC implementation preserved the old value, the LSE implementation does likewise. Since commit: addfc38672c73efd ("arm64: atomics: avoid out-of-line ll/sc atomics") ... the LSE and LL/SC implementations of cmpxchg are inlined as separate asm blocks, with another branch choosing between thw two. Due to this, it is no longer necessary for the LSE implementation to match the register constraints of the LL/SC implementation. This was partially dealt with by removing the hard-coded use of x30 in commit: 3337cb5aea594e40 ("arm64: avoid using hard-coded registers for LSE atomics") ... but we didn't clean up the hard-coding of x0, x1, and x2. This patch simplifies the LSE implementation of cmpxchg, removing the register shuffling and directly clobbering the 'old' argument. This gives the compiler greater freedom for register allocation, and avoids redundant work. The new constraints permit 'old' (Rs) and 'new' (Rt) to be allocated to the same register when the initial values of the two are the same, e.g. resulting in: CAS X0, X0, [X1] This is safe as Rs is only written back after the initial values of Rs and Rt are consumed, and there are no UNPREDICTABLE behaviours to avoid when Rs == Rt. The new constraints also permit 'new' to be allocated to the zero register, avoiding a MOV in a few cases. The same cannot be done for 'old' as it is both an input and output, and any caller of cmpxchg() should care about the output value. Note that for CAS* the use of the zero register never affects the ordering (while for SWP* the use of the zero regsiter for the 'old' value drops any ACQUIRE semantic). Compared to v6.2-rc4, a defconfig vmlinux is ~116KiB smaller, though the resulting Image is the same size due to internal alignment and padding: [mark@lakrids:~/src/linux]% ls -al vmlinux-* -rwxr-xr-x 1 mark mark 137269304 Jan 16 11:59 vmlinux-after -rwxr-xr-x 1 mark mark 137387936 Jan 16 10:54 vmlinux-before [mark@lakrids:~/src/linux]% ls -al Image-* -rw-r--r-- 1 mark mark 38711808 Jan 16 11:59 Image-after -rw-r--r-- 1 mark mark 38711808 Jan 16 10:54 Image-before This patch does not touch cmpxchg_double*() as that requires contiguous register pairs, and separate patches will replace it with cmpxchg128*(). There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20230314153700.787701-2-mark.rutland@arm.com Signed-off-by: Will Deacon <will@kernel.org>
2023-03-14 15:36:57 +00:00
return old; \
}
__CMPXCHG_CASE(w, b, , 8, )
__CMPXCHG_CASE(w, h, , 16, )
__CMPXCHG_CASE(w, , , 32, )
__CMPXCHG_CASE(x, , , 64, )
__CMPXCHG_CASE(w, b, acq_, 8, a, "memory")
__CMPXCHG_CASE(w, h, acq_, 16, a, "memory")
__CMPXCHG_CASE(w, , acq_, 32, a, "memory")
__CMPXCHG_CASE(x, , acq_, 64, a, "memory")
__CMPXCHG_CASE(w, b, rel_, 8, l, "memory")
__CMPXCHG_CASE(w, h, rel_, 16, l, "memory")
__CMPXCHG_CASE(w, , rel_, 32, l, "memory")
__CMPXCHG_CASE(x, , rel_, 64, l, "memory")
__CMPXCHG_CASE(w, b, mb_, 8, al, "memory")
__CMPXCHG_CASE(w, h, mb_, 16, al, "memory")
__CMPXCHG_CASE(w, , mb_, 32, al, "memory")
__CMPXCHG_CASE(x, , mb_, 64, al, "memory")
#undef __CMPXCHG_CASE
#define __CMPXCHG128(name, mb, cl...) \
static __always_inline u128 \
__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \
{ \
union __u128_halves r, o = { .full = (old) }, \
n = { .full = (new) }; \
register unsigned long x0 asm ("x0") = o.low; \
register unsigned long x1 asm ("x1") = o.high; \
register unsigned long x2 asm ("x2") = n.low; \
register unsigned long x3 asm ("x3") = n.high; \
register unsigned long x4 asm ("x4") = (unsigned long)ptr; \
\
asm volatile( \
__LSE_PREAMBLE \
" casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
: [old1] "+&r" (x0), [old2] "+&r" (x1), \
[v] "+Q" (*(u128 *)ptr) \
: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \
[oldval1] "r" (o.low), [oldval2] "r" (o.high) \
: cl); \
\
r.low = x0; r.high = x1; \
\
return r.full; \
}
__CMPXCHG128( , )
__CMPXCHG128(_mb, al, "memory")
#undef __CMPXCHG128
#endif /* __ASM_ATOMIC_LSE_H */