Mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (synced 2025-08-05 16:54:27 +00:00)

Move the contents of arch/x86/lib/crypto/ into lib/crypto/x86/.

The new code organization makes a lot more sense for how this code actually works and is developed. In particular, it makes it possible to build each algorithm as a single module, with better inlining and dead code elimination. For a more detailed explanation, see the patchset which did this for the CRC library code: https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/. Also see the patchset which did this for SHA-512: https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files into their new location but keeps them building the same way as before. Later commits will make the actual improvements to the way the arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/x86/lib/crypto/ so that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Link: https://lore.kernel.org/r/20250619191908.134235-9-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
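The file below is the x86 BLAKE2s compression code being moved. It defines two asmlinkage entry points that the C glue code calls; the following is a hedged sketch of those declarations (based on the usual blake2s glue pattern, not quoted from this commit), with the argument-to-register mapping implied by the x86-64 SysV calling convention:

#include <linux/linkage.h>	/* asmlinkage */
#include <crypto/blake2s.h>	/* struct blake2s_state: u32 h[8], t[2], f[2], ... */

/* %rdi = state, %rsi = block, %rdx = nblocks, %rcx = inc */
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
				       const u8 *block, const size_t nblocks,
				       const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
					const u8 *block, const size_t nblocks,
					const u32 inc);

Each call compresses nblocks 64-byte blocks from block into state, adding inc to the 64-bit counter t for every block; the 0x00, 0x10 and 0x20 offsets used in the assembly correspond to h[0..3], h[4..7] and t[0..1]/f[0..1] of struct blake2s_state.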
252 lines
6.9 KiB
x86 assembly
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
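/*
 * Constants used by both functions below:
 *  - IV: the eight 32-bit BLAKE2s initialization words (the same words as
 *    the SHA-256 IV), stored as two 128-bit little-endian constants.
 *  - ROT16, ROR328: pshufb byte-shuffle masks that rotate every 32-bit lane
 *    of an XMM register right by 16 and by 8 bits, respectively.
 *  - SIGMA: the ten-round BLAKE2s message schedule as byte indices, reordered
 *    relative to the standard sigma table to match the order in which the
 *    SSSE3 round below gathers message words.
 *  - SIGMA2: the schedule as 32-bit indices in the form consumed by vpermi2d
 *    in the AVX-512 function; its rows differ from SIGMA because each round
 *    there permutes the already-permuted message registers of the previous
 *    round.
 */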
.text
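/*
 * blake2s_compress_ssse3: SSSE3 implementation of the BLAKE2s compression
 * function.
 *
 * Arguments (x86-64 SysV ABI):
 *	%rdi: state (h[0..7] at 0x00-0x1f, t[0..1] and f[0..1] at 0x20-0x2f)
 *	%rsi: message data, advanced by 64 bytes per compressed block
 *	%rdx: number of 64-byte blocks
 *	%rcx: amount added to the 64-bit counter t for each block
 *
 * The working state v[16] is kept as four row vectors:
 *	%xmm0 = v[0..3]  (= h[0..3]),	%xmm1 = v[4..7]   (= h[4..7]),
 *	%xmm2 = v[8..11] (= IV[0..3]),	%xmm3 = v[12..15] (= (t,f) ^ IV[4..7])
 * %xmm12/%xmm13 hold the ROT16/ROR328 masks, %xmm14 holds t[0..1]/f[0..1],
 * %xmm15 the counter increment, and %xmm10/%xmm11 the saved copy of h.
 *
 * Each of the ten rounds runs the G mixing step on all four columns and then
 * on all four diagonals at once; the scalar reference for one G step is:
 *
 *	a += b + m[s[2*i]];	d = ror32(d ^ a, 16);
 *	c += d;			b = ror32(b ^ c, 12);
 *	a += b + m[s[2*i+1]];	d = ror32(d ^ a, 8);
 *	c += d;			b = ror32(b ^ c, 7);
 *
 * The 16- and 8-bit rotates are done with pshufb, the 12- and 7-bit rotates
 * with psrld/pslld/por, and the pshufd $0x93/$0x4e/$0x39 triples rotate rows
 * a, d and c between the column and diagonal halves of a round.
 */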
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
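/*
 * blake2s_compress_avx512: AVX-512 (EVEX) implementation. Same arguments and
 * state layout as blake2s_compress_ssse3 above, with these differences:
 *  - the whole 64-byte message block is loaded into %ymm6/%ymm7 up front and
 *    each round uses vpermi2d with a pair of SIGMA2 rows to put the message
 *    words in place, overwriting %ymm6/%ymm7 for the next round;
 *  - the 16/12/8/7-bit rotates use vprord instead of shuffle or shift/or
 *    sequences;
 *  - the ten rounds are counted down in %cl instead of walking a pointer
 *    through the schedule table.
 */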
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
	.align		32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)