/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
 * Crypto Extensions
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)	SYM_FUNC_START(ce_ ## func)
#define AES_FUNC_END(func)	SYM_FUNC_END(ce_ ## func)
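
	/*
	 * Note: aes-modes.S (included at the bottom of this file) emits the
	 * actual mode entry points through the two macros above; the ce_
	 * prefix keeps this Crypto Extensions build distinct from the NEON
	 * implementation, which shares aes-modes.S and provides the same
	 * routines under a prefix of its own.
	 */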

	.arch		armv8-a+crypto

	/*
	 * v16 is not used by the round processing code in this file, so the
	 * XTS tweak mask, the CBC IV and the CTR block can all be kept in
	 * it. This is what makes the 5-way interleave for ECB, CBC
	 * decryption and CTR possible without spilling to the stack. XTS is
	 * deliberately left at 4-way: a fifth tweak would have to be spilled
	 * or recalculated, which would most likely penalize low end cores.
	 */
	xtsmask		.req	v16
	cbciv		.req	v16
	vctr		.req	v16

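	/*
	 * Hooks invoked by the shared chaining mode code in aes-modes.S;
	 * the Crypto Extensions version has no work to do in either, so
	 * they are empty here.
	 */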
	.macro		xts_reload_mask, tmp
	.endm

	.macro		xts_cts_skip_tw, reg, lbl
	.endm

	/*
	 * Preload all round keys. They are loaded with a single branchless
	 * sequence of overlapping loads: v17-v20 always hold the first four
	 * round keys, while v21-v31 are loaded relative to the end of the
	 * key schedule, so for the smaller key sizes they overlap the first
	 * group; do_block_Nx below skips the rounds that do not apply.
	 */
	.macro		load_round_keys, rk, nr, tmp
	add		\tmp, \rk, \nr, sxtw #4
	sub		\tmp, \tmp, #160
	ld1		{v17.4s-v20.4s}, [\rk]
	ld1		{v21.4s-v24.4s}, [\tmp], #64
	ld1		{v25.4s-v28.4s}, [\tmp], #64
	ld1		{v29.4s-v31.4s}, [\tmp]
	.endm

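	/*
	 * Worked example of the overlapping loads (derived from the
	 * arithmetic above, not spelled out in the original source):
	 * \tmp = \rk + (\nr - 10) * 16, so v21 receives round key \nr - 10
	 * and v31 the final round key \nr:
	 *
	 *	nr = 10 (AES-128): v21-v31 = rk[0]-rk[10]
	 *			   (v17-v20 loaded but skipped)
	 *	nr = 12 (AES-192): v17/v18 = rk[0]/rk[1],
	 *			   v21-v31 = rk[2]-rk[12]
	 *	nr = 14 (AES-256): v17-v20 = rk[0]-rk[3],
	 *			   v21-v31 = rk[4]-rk[14]
	 */
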
	/* prepare for encryption with key in rk[] */
	.macro		enc_prepare, rounds, rk, temp
	load_round_keys	\rk, \rounds, \temp
	.endm

	/* prepare for encryption (again) but with new key in rk[] */
	.macro		enc_switch_key, rounds, rk, temp
	load_round_keys	\rk, \rounds, \temp
	.endm

	/* prepare for decryption with key in rk[] */
	.macro		dec_prepare, rounds, rk, temp
	load_round_keys	\rk, \rounds, \temp
	.endm

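	/*
	 * One AES round (AESE/AESD plus AESMC/AESIMC) applied to up to five
	 * blocks with the same round key. The .ifnb checks make the
	 * trailing block arguments optional, so one macro body serves the
	 * 1x, 4x and 5x variants.
	 */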
	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
	aes\de		\i0\().16b, \k\().16b
	aes\mc		\i0\().16b, \i0\().16b
	.ifnb		\i1
	aes\de		\i1\().16b, \k\().16b
	aes\mc		\i1\().16b, \i1\().16b
	.ifnb		\i3
	aes\de		\i2\().16b, \k\().16b
	aes\mc		\i2\().16b, \i2\().16b
	aes\de		\i3\().16b, \k\().16b
	aes\mc		\i3\().16b, \i3\().16b
	.ifnb		\i4
	aes\de		\i4\().16b, \k\().16b
	aes\mc		\i4\().16b, \i4\().16b
	.endif
	.endif
	.endif
	.endm

	/* up to 5 interleaved encryption rounds with the same round key */
	.macro		round_Nx, enc, k, i0, i1, i2, i3, i4
	.ifc		\enc, e
	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3, \i4
	.else
	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3, \i4
	.endif
	.endm

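	/*
	 * A note on the structure below: AESE/AESD fold the AddRoundKey of
	 * \k into the start of the round, so the very last round key \k2
	 * has to be applied with a plain eor; the final AES round also
	 * omits MixColumns, hence no aesmc/aesimc here.
	 */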
	/* up to 5 interleaved final rounds */
	.macro		fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4
	aes\de		\i0\().16b, \k\().16b
	.ifnb		\i1
	aes\de		\i1\().16b, \k\().16b
	.ifnb		\i3
	aes\de		\i2\().16b, \k\().16b
	aes\de		\i3\().16b, \k\().16b
	.ifnb		\i4
	aes\de		\i4\().16b, \k\().16b
	.endif
	.endif
	.endif
	eor		\i0\().16b, \i0\().16b, \k2\().16b
	.ifnb		\i1
	eor		\i1\().16b, \i1\().16b, \k2\().16b
	.ifnb		\i3
	eor		\i2\().16b, \i2\().16b, \k2\().16b
	eor		\i3\().16b, \i3\().16b, \k2\().16b
	.ifnb		\i4
	eor		\i4\().16b, \i4\().16b, \k2\().16b
	.endif
	.endif
	.endif
	.endm

	/* up to 5 interleaved blocks */
	.macro		do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
	tbz		\rounds, #2, .L\@	/* 128 bits */
	round_Nx	\enc, v17, \i0, \i1, \i2, \i3, \i4
	round_Nx	\enc, v18, \i0, \i1, \i2, \i3, \i4
	tbz		\rounds, #1, .L\@	/* 192 bits */
	round_Nx	\enc, v19, \i0, \i1, \i2, \i3, \i4
	round_Nx	\enc, v20, \i0, \i1, \i2, \i3, \i4
.L\@:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	round_Nx	\enc, \key, \i0, \i1, \i2, \i3, \i4
	.endr
	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3, \i4
	.endm

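	/*
	 * Sketch of the dispatch above (informational): \rounds is 10, 12
	 * or 14, i.e. 0b1010, 0b1100 or 0b1110. Bit #2 is clear only for
	 * 10, so AES-128 skips the v17-v20 rounds entirely; bit #1 is clear
	 * only for 12, so AES-192 runs v17/v18 but skips v19/v20. Both tbz
	 * target the same label, generated with \@ (the assembler's macro
	 * invocation counter) so each expansion gets a unique name, and tbz
	 * is used because it does not clobber the condition flags.
	 *
	 * The t0-t2 arguments of the block macros below are unused here;
	 * they exist, presumably, to keep the interface aligned with the
	 * NEON implementation, which does need scratch registers.
	 */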
	.macro		encrypt_block, in, rounds, t0, t1, t2
	do_block_Nx	e, \rounds, \in
	.endm

	.macro		encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3
	.endm

	.macro		encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3, \i4
	.endm

	.macro		decrypt_block, in, rounds, t0, t1, t2
	do_block_Nx	d, \rounds, \in
	.endm

	.macro		decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3
	.endm

	.macro		decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3, \i4
	.endm

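	/*
	 * Informational sketch, not part of the kernel build: how the
	 * macros above compose into a one-block encrypt. The function name
	 * and register choices are hypothetical; the real entry points are
	 * emitted by aes-modes.S through AES_FUNC_START/AES_FUNC_END.
	 *
	 *	// void ce_aes_encrypt_one(u8 out[], u8 const in[],
	 *	//			   u32 const rk[], int rounds)
	 *	AES_FUNC_START(aes_encrypt_one)
	 *	enc_prepare	w3, x2, x5	// round keys -> v17-v31
	 *	ld1		{v0.16b}, [x1]	// load one 16-byte block
	 *	encrypt_block	v0, w3, x2, x5, w6
	 *	st1		{v0.16b}, [x0]	// store the ciphertext block
	 *	ret
	 *	AES_FUNC_END(aes_encrypt_one)
	 *
	 * MAX_STRIDE below advertises to the shared mode code that this
	 * implementation provides the 5x block macros.
	 */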
#define MAX_STRIDE	5

#include "aes-modes.S"