
Move the contents of arch/mips/lib/crypto/ into lib/crypto/mips/.

The new code organization makes a lot more sense for how this code
actually works and is developed. In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination. For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/mips/lib/crypto/
so that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Link: https://lore.kernel.org/r/20250619191908.134235-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
1273 lines
24 KiB
Perl
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#			IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling the reduction allows omitting the dependency chain
# at the end of the inner loop and improves performance. Also optimize
# the MIPS32R2 code path for the MIPS 1004K core, per René von Dorst's
# suggestions.
#
#			IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, the "thread pointer", former $gp [o32 can be
#   excluded from this rule, because there it's specified as volatile];
# - copy the return value to $t0, former $v0 [or to $a0 if you're
#   adapting old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference, here is the register layout for the N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
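
# The last command-line argument, if any, names the output file; see the
# "$output=pop" and STDOUT redirect at the very end of this script, e.g.:
#
#	perl poly1305-mips.pl 64 poly1305-mips.S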

$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)	dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif
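
/*
 * MIPS64R6 removed the HI/LO multiply unit: dmulu/dmuhu deliver the low
 * and high halves of the 128-bit product straight into a GPR. The macros
 * above let the same instruction sequences below assemble on both pre-R6
 * and R6 cores.
 */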

#ifdef __KERNEL__
# define poly1305_init   poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arch
# define poly1305_emit   poly1305_emit_arch
#endif
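
/*
 * In-kernel builds rename the entry points to the poly1305_*_arch()
 * symbols that the kernel's Poly1305 library wrapper calls into.
 */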

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
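
/*
 * On pre-R6 cores the ldl/ldr pairs above perform a possibly unaligned
 * 64-bit load in two instructions; R6 removed ldl/ldr, hence the
 * explicit align-and-shift path in the other branch.
 */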
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)
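
/*
 * The masks above clamp r as the Poly1305 spec requires: the top four
 * bits of each 32-bit word and the low two bits of every word but the
 * lowest are forced to zero. The h1*r1 partial product enters the sum
 * at bit 128, and since 2^130 == 5 (mod 2^130-5) and r1 is a multiple
 * of 4, s1 = r1 + (r1 >> 2) = 5*(r1/4) performs that modular folding
 * for free when used in place of r1.
 */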

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7);		# used on R6

$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	.frame	$sp,8*8,$ra
	.mask	$SAVED_REGS_MASK|0x000c0000,-8
	dsubu	$sp,8*8
	sd	$s7,56($sp)
	sd	$s6,48($sp)
#else
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	dsubu	$sp,6*8
#endif
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$shr,$inp,7
	dsubu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
	subu	$shl,$zero,$shr
#endif

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	dsll	$len,4
	daddu	$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
# ifdef MIPSEB
	dsllv	$in0,$in0,$shr
	dsrlv	$tmp3,$in1,$shl
	dsllv	$in1,$in1,$shr
	dsrlv	$tmp2,$tmp2,$shl
# else
	dsrlv	$in0,$in0,$shr
	dsllv	$tmp3,$in1,$shl
	dsrlv	$in1,$in1,$shr
	dsllv	$tmp2,$tmp2,$shl
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_inp:
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
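/*
 * Modulo-scheduled reduction: h2 still holds the unreduced excess from
 * the previous iteration. Everything above bit 130 is folded back into
 * the low limb multiplied by 5 (the shift/add pair below computes
 * 5*(h2 >> 2)), since 2^130 == 5 (mod 2^130-5); only a 2-bit residue
 * remains in h2.
 */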
	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
	andi	$h2,$h2,3
	dsll	$tmp0,$tmp1,2

	daddu	$d0,$h0,$in0		# accumulate input
	daddu	$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	daddu	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	daddu	$d1,$h1,$in1
	daddu	$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	daddu	$d1,$tmp0

	dmultu	($r0,$d0)		# h0*r0
	daddu	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	dmultu	($rs1,$d1)		# h1*5*r1
	daddu	$d2,$tmp1
	daddu	$d2,$tmp0
	mflo	($tmp0,$rs1,$d1)
	mfhi	($tmp1,$rs1,$d1)

	dmultu	($r1,$d0)		# h0*r1
	mflo	($tmp2,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	daddu	$h0,$tmp0
	daddu	$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0

	dmultu	($r0,$d1)		# h1*r0
	daddu	$h1,$tmp0
	daddu	$h1,$tmp2
	mflo	($tmp0,$r0,$d1)
	mfhi	($tmp1,$r0,$d1)

	dmultu	($rs1,$d2)		# h2*5*r1
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	mflo	($tmp2,$rs1,$d2)

	dmultu	($r0,$d2)		# h2*r0
	daddu	$h1,$tmp0
	daddu	$h2,$tmp1
	mflo	($tmp3,$r0,$d2)
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$tmp0

	daddu	$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	daddu	$h2,$tmp2
	daddu	$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$s7,56($sp)
	ld	$s6,48($sp)
#endif
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
#if defined(_MIPS_ARCH_MIPS64R6)
	daddu	$sp,8*8
#else
	daddu	$sp,6*8
#endif
.end	poly1305_blocks_internal
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)
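
/*
 * Final reduction: fold the excess in the top limb back times 5, then
 * add 5 to the 130-bit value. If that addition carries out of bit 130,
 * the value was >= 2^130-5, so the all-ones/all-zeros mask derived from
 * the carry selects the low 128 bits of h+5 (i.e. h+5-2^130) over h,
 * without branching on secret data.
 */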
	li	$in0,-4			# final reduction
	dsrl	$in1,$tmp2,2
	and	$in0,$tmp2
	andi	$tmp2,$tmp2,3
	daddu	$in0,$in1

	daddu	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	daddiu	$in0,$tmp0,5		# compare to modulus
	daddu	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	daddu	$in1,$tmp1,$tmp3
	daddu	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	and	$in0,$tmp2
	and	$in1,$tmp2
	xor	$in0,$tmp0
	xor	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt)	mulu	rd,rs,rt
# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
#else
# define multu(rs,rt)	multu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef __KERNEL__
# define poly1305_init   poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arch
# define poly1305_emit   poly1305_emit_arch
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$tmp0,$inp,3		# $inp % 4
	subu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	subu	$tmp1,$zero,$tmp0
# ifdef MIPSEB
	sllv	$in0,$in0,$tmp0
	srlv	$tmp3,$in1,$tmp1
	sllv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	srlv	$tmp3,$in2,$tmp1
	sllv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	srlv	$tmp3,$in3,$tmp1
	sllv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	srlv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# else
	srlv	$in0,$in0,$tmp0
	sllv	$tmp3,$in1,$tmp1
	srlv	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllv	$tmp3,$in2,$tmp1
	srlv	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllv	$tmp3,$in3,$tmp1
	srlv	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllv	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl	$in0,0+MSB($inp)
	lwl	$in1,4+MSB($inp)
	lwl	$in2,8+MSB($inp)
	lwl	$in3,12+MSB($inp)
	lwr	$in0,0+LSB($inp)
	lwr	$in1,4+LSB($inp)
	lwr	$in2,8+LSB($inp)
	lwr	$in3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$in0,$in0		# byte swap
	wsbh	$in1,$in1
	wsbh	$in2,$in2
	wsbh	$in3,$in3
	rotr	$in0,$in0,16
	rotr	$in1,$in1,16
	rotr	$in2,$in2,16
	rotr	$in3,$in3,16
# else
	srl	$tmp0,$in0,24		# byte swap
	srl	$tmp1,$in0,8
	andi	$tmp2,$in0,0xFF00
	sll	$in0,$in0,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in0,$tmp0
	srl	$tmp0,$in1,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in1,8
	or	$in0,$tmp1
	andi	$tmp1,$in1,0xFF00
	sll	$in1,$in1,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in1,$tmp0
	srl	$tmp0,$in2,24
	or	$tmp2,$tmp1
	srl	$tmp1,$in2,8
	or	$in1,$tmp2
	andi	$tmp2,$in2,0xFF00
	sll	$in2,$in2,24
	andi	$tmp1,0xFF00
	sll	$tmp2,$tmp2,8
	or	$in2,$tmp0
	srl	$tmp0,$in3,24
	or	$tmp1,$tmp2
	srl	$tmp2,$in3,8
	or	$in2,$tmp1
	andi	$tmp1,$in3,0xFF00
	sll	$in3,$in3,24
	andi	$tmp2,0xFF00
	sll	$tmp1,$tmp1,8
	or	$in3,$tmp0
	or	$tmp2,$tmp1
	or	$in3,$tmp2
# endif
#endif
	lui	$tmp0,0x0fff
	ori	$tmp0,0xffff		# 0x0fffffff
	and	$in0,$in0,$tmp0
	subu	$tmp0,3			# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srl	$tmp1,$in1,2
	srl	$tmp2,$in2,2
	srl	$tmp3,$in3,2
	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addu	$in2,$in2,$tmp2
	addu	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)
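
/*
 * Same clamping and folding trick as in the 64-bit path, with four
 * 32-bit limbs: s_i = r_i + (r_i >> 2) = 5*(r_i/4), so partial products
 * that would land at or above bit 130 are multiplied by s_i instead of
 * r_i to reduce them on the fly.
 */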
.Lno_key:
	li	$v0,0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
my ($d0,$d1,$d2,$d3) =
   ($a4,$a5,$a6,$a7);
my $shr = $t2;		# used on R6
my $one = $t2;		# used on R2

$code.=<<___;
.globl	poly1305_blocks
.align	5
.ent	poly1305_blocks
poly1305_blocks:
	.frame	$sp,16*4,$ra
	.mask	$SAVED_REGS_MASK,-4
	.set	noreorder
	subu	$sp, $sp,4*12
	sw	$s11,4*11($sp)
	sw	$s10,4*10($sp)
	sw	$s9, 4*9($sp)
	sw	$s8, 4*8($sp)
	sw	$s7, 4*7($sp)
	sw	$s6, 4*6($sp)
	sw	$s5, 4*5($sp)
	sw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sw	$s3, 4*3($sp)
	sw	$s2, 4*2($sp)
	sw	$s1, 4*1($sp)
	sw	$s0, 4*0($sp)
___
$code.=<<___;
	.set	reorder

	srl	$len,4			# number of complete blocks
	li	$one,1
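	/*
	 * The constant loaded into "one" is only used on the pre-R6
	 * MIPS32R2 path below: "maddu at,one" adds the previous
	 * column's high word into the HI/LO accumulator.
	 */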
	beqz	$len,.Labort

#if defined(_MIPS_ARCH_MIPS32R6)
	andi	$shr,$inp,3
	subu	$inp,$inp,$shr		# align $inp
	sll	$shr,$shr,3		# byte to bit offset
#endif

	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	sll	$len,4
	addu	$len,$len,$inp		# end of buffer
	b	.Loop

.align	4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
	beqz	$shr,.Laligned_inp

	lw	$t0,16($inp)
	subu	$t1,$zero,$shr
# ifdef MIPSEB
	sllv	$d0,$d0,$shr
	srlv	$at,$d1,$t1
	sllv	$d1,$d1,$shr
	or	$d0,$d0,$at
	srlv	$at,$d2,$t1
	sllv	$d2,$d2,$shr
	or	$d1,$d1,$at
	srlv	$at,$d3,$t1
	sllv	$d3,$d3,$shr
	or	$d2,$d2,$at
	srlv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# else
	srlv	$d0,$d0,$shr
	sllv	$at,$d1,$t1
	srlv	$d1,$d1,$shr
	or	$d0,$d0,$at
	sllv	$at,$d2,$t1
	srlv	$d2,$d2,$shr
	or	$d1,$d1,$at
	sllv	$at,$d3,$t1
	srlv	$d3,$d3,$shr
	or	$d2,$d2,$at
	sllv	$t0,$t0,$t1
	or	$d3,$d3,$t0
# endif
.Laligned_inp:
#else
	lwl	$d0,0+MSB($inp)		# load input
	lwl	$d1,4+MSB($inp)
	lwl	$d2,8+MSB($inp)
	lwl	$d3,12+MSB($inp)
	lwr	$d0,0+LSB($inp)
	lwr	$d1,4+LSB($inp)
	lwr	$d2,8+LSB($inp)
	lwr	$d3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	srl	$at,$d1,24
	or	$t0,$t1
	srl	$t1,$d1,8
	or	$d0,$t0
	andi	$t0,$d1,0xFF00
	sll	$d1,$d1,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d1,$at
	srl	$at,$d2,24
	or	$t1,$t0
	srl	$t0,$d2,8
	or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	srl	$at,$d3,24
	or	$t0,$t1
	srl	$t1,$d3,8
	or	$d2,$t0
	andi	$t0,$d3,0xFF00
	sll	$d3,$d3,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d3,$at
	or	$t1,$t0
	or	$d3,$t1
# endif
#endif
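/*
 * Modulo-scheduled reduction, 32-bit flavour: the excess above bit 130
 * sitting in h4 is folded back into the low word multiplied by 5
 * (the shift/add pair below computes 5*(h4 >> 2)); only a 2-bit
 * residue stays in h4.
 */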
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2
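
/*
 * Schoolbook multiplication of the 130-bit h by r modulo 2^130-5, one
 * 32-bit result column at a time: column k sums the products d_i*r_j
 * with i+j == k, substituting s_j for r_j wherever a product would wrap
 * past bit 130. The MIPS32R2 branch accumulates each column in HI/LO
 * with maddu; the generic branch does the same with explicit mflo/mfhi
 * and carry propagation.
 */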
#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	sltu	$h2,$d3,$h2
	addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	addu	$h4,$h4,$padbit
	addiu	$inp,$inp,16
	addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	addu	$h0,$h0,$a3
	addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	sltu	$a3,$h0,$a3
	addu	$h1,$h1,$a3


	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	addu	$h1,$h1,$a3
	addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	sltu	$a3,$h1,$a3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at


	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	addu	$h2,$h2,$at
	addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	sltu	$at,$h2,$at
	addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3


	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	addu	$h3,$h3,$a3
	addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at


	mflo	($h4,$r0,$h4)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)
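
/*
 * Final reduction, 32-bit flavour: fold the excess in the top limb back
 * times 5, add 5 to the 130-bit value, and use the carry out of bit 130
 * to build a mask that selects either h or h+5-2^130 branch-free,
 * exactly as in the 64-bit path.
 */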
	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addu	$ctx,$tmp4

	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT;