linux/lib/crypto/mips/poly1305-mips.pl
Eric Biggers 7e54e993ab lib/crypto: mips: Move arch/mips/lib/crypto/ into lib/crypto/
Move the contents of arch/mips/lib/crypto/ into lib/crypto/mips/.

The new code organization makes a lot more sense for how this code
actually works and is developed.  In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination.  For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/mips/lib/crypto/ so
that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Link: https://lore.kernel.org/r/20250619191908.134235-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-06-30 09:26:20 -07:00

1273 lines
24 KiB
Raku

#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================
# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc
# R1x000 ~5.5/+130% (big-endian)
# Octeon II 2.50/+70% (little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling the reduction makes it possible to omit the dependency
# chain at the end of the inner loop and improves performance. Also optimize
# the MIPS32R2 code path for the MIPS 1004K core. Per René von Dorst's
# suggestions.
#
# IALU/gcc
# R1x000 ~9.8/? (big-endian)
# Octeon II 3.65/+140% (little-endian)
# MT7621/1004K 4.75/? (little-endian)
#
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
# Symbolic register names (package globals), each holding the assembler
# operand "\$<number>" for the register number listed on the right.
($zero, $at, $t0, $t1, $t2) =
    map { sprintf('$%d', $_) } (0 .. 2, 24, 25);
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) =
    map { sprintf('$%d', $_) } (4 .. 11);
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { sprintf('$%d', $_) } (12 .. 23);
($gp, $tp, $sp, $fp, $ra) =
    map { sprintf('$%d', $_) } (3, 28 .. 31);
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
# excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################
# First command-line argument selects the ABI flavour; default is "64".
# Supported flavours are o32,n32,64,nubi32,nubi64.
$flavour = shift;
$flavour = "64" unless $flavour;
# Register that receives the return value: $a0 for NUBI flavours, $t0
# (the former $v0) for everything else.
if ($flavour =~ /nubi/i) {
    $v0 = $a0;
}
else {
    $v0 = $t0;
}
if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#
# 64-bit poly1305_init generator.
#
# Argument registers for the generated functions, followed by the scratch
# registers; both sets stay in scope for the 64-bit poly1305_blocks and
# poly1305_emit generators further down.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
# The heredoc below first emits preprocessor glue:
#  - MIPS64R3/R5/R6 imply _MIPS_ARCH_MIPS64R2;
#  - on R6 dmultu() expands to nothing and mflo()/mfhi() expand to the
#    three-operand dmulu/dmuhu, otherwise they map to the classic
#    dmultu/mflo/mfhi;
#  - under __KERNEL__ the three entry points are renamed to
#    poly1305_block_init_arch, poly1305_blocks_arch and poly1305_emit_arch;
#  - MSB/LSB are the byte offsets used by the ldl/ldr unaligned loads.
# It then emits poly1305_init, which:
#  - zeroes the three doublewords at 0/8/16($ctx);
#  - skips to .Lno_key when $inp is zero;
#  - loads 16 key bytes (aligned loads plus shifts on R6, ldl/ldr
#    otherwise) and byte-swaps them on big-endian builds;
#  - masks them with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stores
#    them at 24/32($ctx), plus s1 = r1 + (r1 >> 2) at 40($ctx);
#  - returns 0 in $v0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
defined(_MIPS_ARCH_MIPS64R6)) \\
&& !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif
#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt) dmulu rd,rs,rt
# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
#else
# define dmultu(rs,rt) dmultu rs,rt
# define mflo(rd,rs,rt) mflo rd
# define mfhi(rd,rs,rt) mfhi rd
#endif
#ifdef __KERNEL__
# define poly1305_init poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arch
# define poly1305_emit poly1305_emit_arch
#endif
#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif
#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif
.text
.set noat
.set noreorder
.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
.frame $sp,0,$ra
.set reorder
sd $zero,0($ctx)
sd $zero,8($ctx)
sd $zero,16($ctx)
beqz $inp,.Lno_key
#if defined(_MIPS_ARCH_MIPS64R6)
andi $tmp0,$inp,7 # $inp % 8
dsubu $inp,$inp,$tmp0 # align $inp
sll $tmp0,$tmp0,3 # byte to bit offset
ld $in0,0($inp)
ld $in1,8($inp)
beqz $tmp0,.Laligned_key
ld $tmp2,16($inp)
subu $tmp1,$zero,$tmp0
# ifdef MIPSEB
dsllv $in0,$in0,$tmp0
dsrlv $tmp3,$in1,$tmp1
dsllv $in1,$in1,$tmp0
dsrlv $tmp2,$tmp2,$tmp1
# else
dsrlv $in0,$in0,$tmp0
dsllv $tmp3,$in1,$tmp1
dsrlv $in1,$in1,$tmp0
dsllv $tmp2,$tmp2,$tmp1
# endif
or $in0,$in0,$tmp3
or $in1,$in1,$tmp2
.Laligned_key:
#else
ldl $in0,0+MSB($inp)
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
li $tmp0,1
dsll $tmp0,32 # 0x0000000100000000
daddiu $tmp0,-63 # 0x00000000ffffffc1
dsll $tmp0,28 # 0x0ffffffc10000000
daddiu $tmp0,-1 # 0x0ffffffc0fffffff
and $in0,$tmp0
daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
and $in1,$tmp0
sd $in0,24($ctx)
dsrl $tmp0,$in1,2
sd $in1,32($ctx)
daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
sd $tmp0,40($ctx)
.Lno_key:
li $v0,0 # return 0
jr $ra
.end poly1305_init
___
{
# 64-bit poly1305_blocks: register assignment, entry stub and prologue.
#
# Value for the .mask directive (bit n set = register \$n is saved):
# $s4/$s5 (\$16/\$17) always, plus $s0-$s3 (\$12-\$15) on NUBI flavours.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
# Hash limbs, key limbs and the accumulated input; $d0/$d1 share the
# registers of the $in0/$in1 input words.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
my ($shr,$shl) = ($s6,$s7); # used on R6
# poly1305_blocks converts $len to a block count (len >> 4) and returns
# immediately when it is zero; otherwise it branches to
# poly1305_blocks_internal, which allocates 8*8 bytes of stack on R6
# (also saving $s7/$s6) or 6*8 bytes otherwise, then saves $s5/$s4.
$code.=<<___;
.align 5
.globl poly1305_blocks
.ent poly1305_blocks
poly1305_blocks:
.set noreorder
dsrl $len,4 # number of complete blocks
bnez $len,poly1305_blocks_internal
nop
jr $ra
nop
.end poly1305_blocks
.align 5
.ent poly1305_blocks_internal
poly1305_blocks_internal:
.set noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
.frame $sp,8*8,$ra
.mask $SAVED_REGS_MASK|0x000c0000,-8
dsubu $sp,8*8
sd $s7,56($sp)
sd $s6,48($sp)
#else
.frame $sp,6*8,$ra
.mask $SAVED_REGS_MASK,-8
dsubu $sp,6*8
#endif
sd $s5,40($sp)
sd $s4,32($sp)
___
# $s0-$s3 are only saved for NUBI flavours.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
sd $s3,24($sp)
sd $s2,16($sp)
sd $s1,8($sp)
sd $s0,0($sp)
___
# 64-bit poly1305_blocks: setup, main loop and first part of the epilogue.
#
# Setup: on R6 the input pointer is aligned down and the bit offsets for
# the fix-up shifts are kept in $shr/$shl; the hash value is loaded from
# 0/8/16($ctx) and the key from 24/32/40($ctx); $len becomes the
# end-of-buffer pointer ($inp + 16 * number of blocks).
# Loop (.Loop): load one 16-byte block (byte-swapped on big-endian
# builds), advance $inp by 16, add the block, $padbit and the residue of
# the previous iteration's top limb to the hash, then multiply by the key
# using the dmultu/mflo/mfhi macros; repeat until $inp reaches $len.
# Afterwards the hash is stored back and $s7/$s6 (R6 only) and $s5/$s4
# are reloaded from the stack.
$code.=<<___;
.set reorder
#if defined(_MIPS_ARCH_MIPS64R6)
andi $shr,$inp,7
dsubu $inp,$inp,$shr # align $inp
sll $shr,$shr,3 # byte to bit offset
subu $shl,$zero,$shr
#endif
ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)
ld $r0,24($ctx) # load key
ld $r1,32($ctx)
ld $rs1,40($ctx)
dsll $len,4
daddu $len,$inp # end of buffer
b .Loop
.align 4
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
ld $in0,0($inp) # load input
ld $in1,8($inp)
beqz $shr,.Laligned_inp
ld $tmp2,16($inp)
# ifdef MIPSEB
dsllv $in0,$in0,$shr
dsrlv $tmp3,$in1,$shl
dsllv $in1,$in1,$shr
dsrlv $tmp2,$tmp2,$shl
# else
dsrlv $in0,$in0,$shr
dsllv $tmp3,$in1,$shl
dsrlv $in1,$in1,$shr
dsllv $tmp2,$tmp2,$shl
# endif
or $in0,$in0,$tmp3
or $in1,$in1,$tmp2
.Laligned_inp:
#else
ldl $in0,0+MSB($inp) # load input
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
daddiu $inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
dsrl $tmp1,$h2,2 # modulo-scheduled reduction
andi $h2,$h2,3
dsll $tmp0,$tmp1,2
daddu $d0,$h0,$in0 # accumulate input
daddu $tmp1,$tmp0
sltu $tmp0,$d0,$h0
daddu $d0,$d0,$tmp1 # ... and residue
sltu $tmp1,$d0,$tmp1
daddu $d1,$h1,$in1
daddu $tmp0,$tmp1
sltu $tmp1,$d1,$h1
daddu $d1,$tmp0
dmultu ($r0,$d0) # h0*r0
daddu $d2,$h2,$padbit
sltu $tmp0,$d1,$tmp0
mflo ($h0,$r0,$d0)
mfhi ($h1,$r0,$d0)
dmultu ($rs1,$d1) # h1*5*r1
daddu $d2,$tmp1
daddu $d2,$tmp0
mflo ($tmp0,$rs1,$d1)
mfhi ($tmp1,$rs1,$d1)
dmultu ($r1,$d0) # h0*r1
mflo ($tmp2,$r1,$d0)
mfhi ($h2,$r1,$d0)
daddu $h0,$tmp0
daddu $h1,$tmp1
sltu $tmp0,$h0,$tmp0
dmultu ($r0,$d1) # h1*r0
daddu $h1,$tmp0
daddu $h1,$tmp2
mflo ($tmp0,$r0,$d1)
mfhi ($tmp1,$r0,$d1)
dmultu ($rs1,$d2) # h2*5*r1
sltu $tmp2,$h1,$tmp2
daddu $h2,$tmp2
mflo ($tmp2,$rs1,$d2)
dmultu ($r0,$d2) # h2*r0
daddu $h1,$tmp0
daddu $h2,$tmp1
mflo ($tmp3,$r0,$d2)
sltu $tmp0,$h1,$tmp0
daddu $h2,$tmp0
daddu $h1,$tmp2
sltu $tmp2,$h1,$tmp2
daddu $h2,$tmp2
daddu $h2,$tmp3
bne $inp,$len,.Loop
sd $h0,0($ctx) # store hash value
sd $h1,8($ctx)
sd $h2,16($ctx)
.set noreorder
#if defined(_MIPS_ARCH_MIPS64R6)
ld $s7,56($sp)
ld $s6,48($sp)
#endif
ld $s5,40($sp) # epilogue
ld $s4,32($sp)
___
# 64-bit poly1305_blocks epilogue, second part.
# $s0-$s3 are only restored for NUBI flavours (mirrors the prologue).
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
ld $s3,24($sp)
ld $s2,16($sp)
ld $s1,8($sp)
ld $s0,0($sp)
___
# Return.  The stack-pointer adjustment is emitted after "jr $ra" while
# ".set noreorder" is still in effect, i.e. it sits in the branch delay
# slot; its size matches the frame allocated in the prologue (8*8 on R6,
# 6*8 otherwise).
$code.=<<___;
jr $ra
#if defined(_MIPS_ARCH_MIPS64R6)
daddu $sp,8*8
#else
daddu $sp,6*8
#endif
.end poly1305_blocks_internal
___
}
{
# 64-bit poly1305_emit generator.
# Arguments: $ctx (state written by poly1305_blocks), $mac (16-byte
# output buffer) and $nonce (read as four 32-bit words).
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
# The generated function:
#  - loads the three hash limbs from 0/8/16($ctx);
#  - folds the bits above the low two of the top limb back into the low
#    limbs (final reduction);
#  - computes hash + 5 and, using a mask derived from the resulting
#    carry, selects between the two candidates without branching;
#  - adds the 128-bit nonce;
#  - stores the result to $mac one byte at a time, least significant
#    byte first.
# It finishes by emitting the identification string into .rdata.
$code.=<<___;
.align 5
.globl poly1305_emit
.ent poly1305_emit
poly1305_emit:
.frame $sp,0,$ra
.set reorder
ld $tmp2,16($ctx)
ld $tmp0,0($ctx)
ld $tmp1,8($ctx)
li $in0,-4 # final reduction
dsrl $in1,$tmp2,2
and $in0,$tmp2
andi $tmp2,$tmp2,3
daddu $in0,$in1
daddu $tmp0,$tmp0,$in0
sltu $in1,$tmp0,$in0
daddiu $in0,$tmp0,5 # compare to modulus
daddu $tmp1,$tmp1,$in1
sltiu $tmp3,$in0,5
sltu $tmp4,$tmp1,$in1
daddu $in1,$tmp1,$tmp3
daddu $tmp2,$tmp2,$tmp4
sltu $tmp3,$in1,$tmp3
daddu $tmp2,$tmp2,$tmp3
dsrl $tmp2,2 # see if it carried/borrowed
dsubu $tmp2,$zero,$tmp2
xor $in0,$tmp0
xor $in1,$tmp1
and $in0,$tmp2
and $in1,$tmp2
xor $in0,$tmp0
xor $in1,$tmp1
lwu $tmp0,0($nonce) # load nonce
lwu $tmp1,4($nonce)
lwu $tmp2,8($nonce)
lwu $tmp3,12($nonce)
dsll $tmp1,32
dsll $tmp3,32
or $tmp0,$tmp1
or $tmp2,$tmp3
daddu $in0,$tmp0 # accumulate nonce
daddu $in1,$tmp2
sltu $tmp0,$in0,$tmp0
daddu $in1,$tmp0
dsrl $tmp0,$in0,8 # write mac value
dsrl $tmp1,$in0,16
dsrl $tmp2,$in0,24
sb $in0,0($mac)
dsrl $tmp3,$in0,32
sb $tmp0,1($mac)
dsrl $tmp0,$in0,40
sb $tmp1,2($mac)
dsrl $tmp1,$in0,48
sb $tmp2,3($mac)
dsrl $tmp2,$in0,56
sb $tmp3,4($mac)
dsrl $tmp3,$in1,8
sb $tmp0,5($mac)
dsrl $tmp0,$in1,16
sb $tmp1,6($mac)
dsrl $tmp1,$in1,24
sb $tmp2,7($mac)
sb $in1,8($mac)
dsrl $tmp2,$in1,32
sb $tmp3,9($mac)
dsrl $tmp3,$in1,40
sb $tmp0,10($mac)
dsrl $tmp0,$in1,48
sb $tmp1,11($mac)
dsrl $tmp1,$in1,56
sb $tmp2,12($mac)
sb $tmp3,13($mac)
sb $tmp0,14($mac)
sb $tmp1,15($mac)
jr $ra
.end poly1305_emit
.rdata
.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
.align 2
___
}
}}} else {{{
######################################################################
# 32-bit code path
#
# 32-bit poly1305_init generator.
#
# Argument registers for the generated functions, followed by the four
# input words and the scratch registers; these stay in scope for the
# 32-bit poly1305_blocks and poly1305_emit generators further down.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
# The heredoc below first emits preprocessor glue:
#  - MIPS32R3/R5/R6 imply _MIPS_ARCH_MIPS32R2;
#  - on R6 multu() expands to nothing and mflo()/mfhi() expand to the
#    three-operand mulu/muhu, otherwise they map to multu/mflo/mfhi;
#  - under __KERNEL__ the three entry points are renamed to
#    poly1305_block_init_arch, poly1305_blocks_arch and poly1305_emit_arch;
#  - MSB/LSB are the byte offsets used by the lwl/lwr unaligned loads.
# It then emits poly1305_init, which:
#  - zeroes the five words at 0..16($ctx);
#  - skips to .Lno_key when $inp is zero;
#  - loads four key words (aligned loads plus shifts on R6, lwl/lwr
#    otherwise) and byte-swaps them on big-endian builds;
#  - masks them with 0x0fffffff (first word) / 0x0ffffffc (others) and
#    stores them at 20..32($ctx), plus r + (r >> 2) for the last three
#    words at 36..44($ctx);
#  - returns 0 in $v0.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
defined(_MIPS_ARCH_MIPS32R6)) \\
&& !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif
#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt) mulu rd,rs,rt
# define mfhi(rd,rs,rt) muhu rd,rs,rt
#else
# define multu(rs,rt) multu rs,rt
# define mflo(rd,rs,rt) mflo rd
# define mfhi(rd,rs,rt) mfhi rd
#endif
#ifdef __KERNEL__
# define poly1305_init poly1305_block_init_arch
# define poly1305_blocks poly1305_blocks_arch
# define poly1305_emit poly1305_emit_arch
#endif
#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif
#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif
.text
.set noat
.set noreorder
.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
.frame $sp,0,$ra
.set reorder
sw $zero,0($ctx)
sw $zero,4($ctx)
sw $zero,8($ctx)
sw $zero,12($ctx)
sw $zero,16($ctx)
beqz $inp,.Lno_key
#if defined(_MIPS_ARCH_MIPS32R6)
andi $tmp0,$inp,3 # $inp % 4
subu $inp,$inp,$tmp0 # align $inp
sll $tmp0,$tmp0,3 # byte to bit offset
lw $in0,0($inp)
lw $in1,4($inp)
lw $in2,8($inp)
lw $in3,12($inp)
beqz $tmp0,.Laligned_key
lw $tmp2,16($inp)
subu $tmp1,$zero,$tmp0
# ifdef MIPSEB
sllv $in0,$in0,$tmp0
srlv $tmp3,$in1,$tmp1
sllv $in1,$in1,$tmp0
or $in0,$in0,$tmp3
srlv $tmp3,$in2,$tmp1
sllv $in2,$in2,$tmp0
or $in1,$in1,$tmp3
srlv $tmp3,$in3,$tmp1
sllv $in3,$in3,$tmp0
or $in2,$in2,$tmp3
srlv $tmp2,$tmp2,$tmp1
or $in3,$in3,$tmp2
# else
srlv $in0,$in0,$tmp0
sllv $tmp3,$in1,$tmp1
srlv $in1,$in1,$tmp0
or $in0,$in0,$tmp3
sllv $tmp3,$in2,$tmp1
srlv $in2,$in2,$tmp0
or $in1,$in1,$tmp3
sllv $tmp3,$in3,$tmp1
srlv $in3,$in3,$tmp0
or $in2,$in2,$tmp3
sllv $tmp2,$tmp2,$tmp1
or $in3,$in3,$tmp2
# endif
.Laligned_key:
#else
lwl $in0,0+MSB($inp)
lwl $in1,4+MSB($inp)
lwl $in2,8+MSB($inp)
lwl $in3,12+MSB($inp)
lwr $in0,0+LSB($inp)
lwr $in1,4+LSB($inp)
lwr $in2,8+LSB($inp)
lwr $in3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
wsbh $in0,$in0 # byte swap
wsbh $in1,$in1
wsbh $in2,$in2
wsbh $in3,$in3
rotr $in0,$in0,16
rotr $in1,$in1,16
rotr $in2,$in2,16
rotr $in3,$in3,16
# else
srl $tmp0,$in0,24 # byte swap
srl $tmp1,$in0,8
andi $tmp2,$in0,0xFF00
sll $in0,$in0,24
andi $tmp1,0xFF00
sll $tmp2,$tmp2,8
or $in0,$tmp0
srl $tmp0,$in1,24
or $tmp1,$tmp2
srl $tmp2,$in1,8
or $in0,$tmp1
andi $tmp1,$in1,0xFF00
sll $in1,$in1,24
andi $tmp2,0xFF00
sll $tmp1,$tmp1,8
or $in1,$tmp0
srl $tmp0,$in2,24
or $tmp2,$tmp1
srl $tmp1,$in2,8
or $in1,$tmp2
andi $tmp2,$in2,0xFF00
sll $in2,$in2,24
andi $tmp1,0xFF00
sll $tmp2,$tmp2,8
or $in2,$tmp0
srl $tmp0,$in3,24
or $tmp1,$tmp2
srl $tmp2,$in3,8
or $in2,$tmp1
andi $tmp1,$in3,0xFF00
sll $in3,$in3,24
andi $tmp2,0xFF00
sll $tmp1,$tmp1,8
or $in3,$tmp0
or $tmp2,$tmp1
or $in3,$tmp2
# endif
#endif
lui $tmp0,0x0fff
ori $tmp0,0xffff # 0x0fffffff
and $in0,$in0,$tmp0
subu $tmp0,3 # 0x0ffffffc
and $in1,$in1,$tmp0
and $in2,$in2,$tmp0
and $in3,$in3,$tmp0
sw $in0,20($ctx)
sw $in1,24($ctx)
sw $in2,28($ctx)
sw $in3,32($ctx)
srl $tmp1,$in1,2
srl $tmp2,$in2,2
srl $tmp3,$in3,2
addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
addu $in2,$in2,$tmp2
addu $in3,$in3,$tmp3
sw $in1,36($ctx)
sw $in2,40($ctx)
sw $in3,44($ctx)
.Lno_key:
li $v0,0
jr $ra
.end poly1305_init
___
{
# 32-bit poly1305_blocks: register assignment and prologue.
#
# Value for the .mask directive (bit n set = register \$n is saved):
# $s4-$s11 (\$16-\$23) always, plus $s0-$s3 (\$12-\$15) on NUBI flavours.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
# Five hash limbs, four key limbs and the three pre-computed
# r + (r >> 2) values, all kept in callee-saved registers.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
# Accumulated input words; same registers as $in0-$in3.
my ($d0,$d1,$d2,$d3) =
($a4,$a5,$a6,$a7);
# $shr and $one deliberately share $t2: $shr is only read on the R6 code
# path and $one only on the R2 (non-R6) code path.
my $shr = $t2; # used on R6
my $one = $t2; # used on R2
# The prologue allocates 12 words of stack and saves $s11..$s4.  The
# .frame directive declares that same 12*4-byte frame so it agrees with
# the actual "subu $sp,$sp,4*12" and with the -4 offset given to .mask
# (the highest saved register, $s11, lives at 4*11($sp)).
$code.=<<___;
.globl poly1305_blocks
.align 5
.ent poly1305_blocks
poly1305_blocks:
.frame $sp,12*4,$ra
.mask $SAVED_REGS_MASK,-4
.set noreorder
subu $sp, $sp,4*12
sw $s11,4*11($sp)
sw $s10,4*10($sp)
sw $s9, 4*9($sp)
sw $s8, 4*8($sp)
sw $s7, 4*7($sp)
sw $s6, 4*6($sp)
sw $s5, 4*5($sp)
sw $s4, 4*4($sp)
___
# $s0-$s3 are only saved for NUBI flavours.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
sw $s3, 4*3($sp)
sw $s2, 4*2($sp)
sw $s1, 4*1($sp)
sw $s0, 4*0($sp)
___
# 32-bit poly1305_blocks: setup, main loop and first part of the epilogue.
#
# Setup: $len becomes a block count (len >> 4); when it is zero control
# jumps to .Labort, which only restores registers and returns.  $one is
# loaded with 1 (consumed by the "hi*1" maddu steps of the R2 path); on R6
# the same register is then reused as $shr, the bit offset of the
# unaligned input.  The hash is loaded from 0..16($ctx), the key from
# 20..32($ctx) and the r + (r >> 2) values from 36..44($ctx); $len
# finally becomes the end-of-buffer pointer.
# Loop (.Loop): load one 16-byte block (byte-swapped on big-endian
# builds), add it and the residue of the previous iteration's top limb to
# the hash, then multiply by the key.  Two multiply variants exist:
#  - MIPS32R2 (non-R6): multu/maddu accumulate in HI/LO;
#  - everything else: individual multu/mflo/mfhi products via the macros.
# NOTE(review): the second variant uses $a3 (the $padbit register) as a
# scratch register and reloads $padbit with 1 before looping, so on that
# path every block after the first is processed with padbit 1 regardless
# of the value passed in -- confirm that callers never pass padbit 0 with
# more than one block.
# After the loop the hash is stored back and $s11..$s4 are reloaded.
$code.=<<___;
.set reorder
srl $len,4 # number of complete blocks
li $one,1
beqz $len,.Labort
#if defined(_MIPS_ARCH_MIPS32R6)
andi $shr,$inp,3
subu $inp,$inp,$shr # align $inp
sll $shr,$shr,3 # byte to bit offset
#endif
lw $h0,0($ctx) # load hash value
lw $h1,4($ctx)
lw $h2,8($ctx)
lw $h3,12($ctx)
lw $h4,16($ctx)
lw $r0,20($ctx) # load key
lw $r1,24($ctx)
lw $r2,28($ctx)
lw $r3,32($ctx)
lw $rs1,36($ctx)
lw $rs2,40($ctx)
lw $rs3,44($ctx)
sll $len,4
addu $len,$len,$inp # end of buffer
b .Loop
.align 4
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6)
lw $d0,0($inp) # load input
lw $d1,4($inp)
lw $d2,8($inp)
lw $d3,12($inp)
beqz $shr,.Laligned_inp
lw $t0,16($inp)
subu $t1,$zero,$shr
# ifdef MIPSEB
sllv $d0,$d0,$shr
srlv $at,$d1,$t1
sllv $d1,$d1,$shr
or $d0,$d0,$at
srlv $at,$d2,$t1
sllv $d2,$d2,$shr
or $d1,$d1,$at
srlv $at,$d3,$t1
sllv $d3,$d3,$shr
or $d2,$d2,$at
srlv $t0,$t0,$t1
or $d3,$d3,$t0
# else
srlv $d0,$d0,$shr
sllv $at,$d1,$t1
srlv $d1,$d1,$shr
or $d0,$d0,$at
sllv $at,$d2,$t1
srlv $d2,$d2,$shr
or $d1,$d1,$at
sllv $at,$d3,$t1
srlv $d3,$d3,$shr
or $d2,$d2,$at
sllv $t0,$t0,$t1
or $d3,$d3,$t0
# endif
.Laligned_inp:
#else
lwl $d0,0+MSB($inp) # load input
lwl $d1,4+MSB($inp)
lwl $d2,8+MSB($inp)
lwl $d3,12+MSB($inp)
lwr $d0,0+LSB($inp)
lwr $d1,4+LSB($inp)
lwr $d2,8+LSB($inp)
lwr $d3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
wsbh $d0,$d0 # byte swap
wsbh $d1,$d1
wsbh $d2,$d2
wsbh $d3,$d3
rotr $d0,$d0,16
rotr $d1,$d1,16
rotr $d2,$d2,16
rotr $d3,$d3,16
# else
srl $at,$d0,24 # byte swap
srl $t0,$d0,8
andi $t1,$d0,0xFF00
sll $d0,$d0,24
andi $t0,0xFF00
sll $t1,$t1,8
or $d0,$at
srl $at,$d1,24
or $t0,$t1
srl $t1,$d1,8
or $d0,$t0
andi $t0,$d1,0xFF00
sll $d1,$d1,24
andi $t1,0xFF00
sll $t0,$t0,8
or $d1,$at
srl $at,$d2,24
or $t1,$t0
srl $t0,$d2,8
or $d1,$t1
andi $t1,$d2,0xFF00
sll $d2,$d2,24
andi $t0,0xFF00
sll $t1,$t1,8
or $d2,$at
srl $at,$d3,24
or $t0,$t1
srl $t1,$d3,8
or $d2,$t0
andi $t0,$d3,0xFF00
sll $d3,$d3,24
andi $t1,0xFF00
sll $t0,$t0,8
or $d3,$at
or $t1,$t0
or $d3,$t1
# endif
#endif
srl $t0,$h4,2 # modulo-scheduled reduction
andi $h4,$h4,3
sll $at,$t0,2
addu $d0,$d0,$h0 # accumulate input
addu $t0,$t0,$at
sltu $h0,$d0,$h0
addu $d0,$d0,$t0 # ... and residue
sltu $at,$d0,$t0
addu $d1,$d1,$h1
addu $h0,$h0,$at # carry
sltu $h1,$d1,$h1
addu $d1,$d1,$h0
sltu $h0,$d1,$h0
addu $d2,$d2,$h2
addu $h1,$h1,$h0 # carry
sltu $h2,$d2,$h2
addu $d2,$d2,$h1
sltu $h1,$d2,$h1
addu $d3,$d3,$h3
addu $h2,$h2,$h1 # carry
sltu $h3,$d3,$h3
addu $d3,$d3,$h2
#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
multu $r0,$d0 # d0*r0
sltu $h2,$d3,$h2
maddu $rs3,$d1 # d1*s3
addu $h3,$h3,$h2 # carry
maddu $rs2,$d2 # d2*s2
addu $h4,$h4,$padbit
maddu $rs1,$d3 # d3*s1
addu $h4,$h4,$h3
mfhi $at
mflo $h0
multu $r1,$d0 # d0*r1
maddu $r0,$d1 # d1*r0
maddu $rs3,$d2 # d2*s3
maddu $rs2,$d3 # d3*s2
maddu $rs1,$h4 # h4*s1
maddu $at,$one # hi*1
mfhi $at
mflo $h1
multu $r2,$d0 # d0*r2
maddu $r1,$d1 # d1*r1
maddu $r0,$d2 # d2*r0
maddu $rs3,$d3 # d3*s3
maddu $rs2,$h4 # h4*s2
maddu $at,$one # hi*1
mfhi $at
mflo $h2
mul $t0,$r0,$h4 # h4*r0
multu $r3,$d0 # d0*r3
maddu $r2,$d1 # d1*r2
maddu $r1,$d2 # d2*r1
maddu $r0,$d3 # d3*r0
maddu $rs3,$h4 # h4*s3
maddu $at,$one # hi*1
mfhi $at
mflo $h3
addiu $inp,$inp,16
addu $h4,$t0,$at
#else
multu ($r0,$d0) # d0*r0
mflo ($h0,$r0,$d0)
mfhi ($h1,$r0,$d0)
sltu $h2,$d3,$h2
addu $h3,$h3,$h2 # carry
multu ($rs3,$d1) # d1*s3
mflo ($at,$rs3,$d1)
mfhi ($t0,$rs3,$d1)
addu $h4,$h4,$padbit
addiu $inp,$inp,16
addu $h4,$h4,$h3
multu ($rs2,$d2) # d2*s2
mflo ($a3,$rs2,$d2)
mfhi ($t1,$rs2,$d2)
addu $h0,$h0,$at
addu $h1,$h1,$t0
multu ($rs1,$d3) # d3*s1
sltu $at,$h0,$at
addu $h1,$h1,$at
mflo ($at,$rs1,$d3)
mfhi ($t0,$rs1,$d3)
addu $h0,$h0,$a3
addu $h1,$h1,$t1
multu ($r1,$d0) # d0*r1
sltu $a3,$h0,$a3
addu $h1,$h1,$a3
mflo ($a3,$r1,$d0)
mfhi ($h2,$r1,$d0)
addu $h0,$h0,$at
addu $h1,$h1,$t0
multu ($r0,$d1) # d1*r0
sltu $at,$h0,$at
addu $h1,$h1,$at
mflo ($at,$r0,$d1)
mfhi ($t0,$r0,$d1)
addu $h1,$h1,$a3
sltu $a3,$h1,$a3
multu ($rs3,$d2) # d2*s3
addu $h2,$h2,$a3
mflo ($a3,$rs3,$d2)
mfhi ($t1,$rs3,$d2)
addu $h1,$h1,$at
addu $h2,$h2,$t0
multu ($rs2,$d3) # d3*s2
sltu $at,$h1,$at
addu $h2,$h2,$at
mflo ($at,$rs2,$d3)
mfhi ($t0,$rs2,$d3)
addu $h1,$h1,$a3
addu $h2,$h2,$t1
multu ($rs1,$h4) # h4*s1
sltu $a3,$h1,$a3
addu $h2,$h2,$a3
mflo ($a3,$rs1,$h4)
addu $h1,$h1,$at
addu $h2,$h2,$t0
multu ($r2,$d0) # d0*r2
sltu $at,$h1,$at
addu $h2,$h2,$at
mflo ($at,$r2,$d0)
mfhi ($h3,$r2,$d0)
addu $h1,$h1,$a3
sltu $a3,$h1,$a3
multu ($r1,$d1) # d1*r1
addu $h2,$h2,$a3
mflo ($a3,$r1,$d1)
mfhi ($t1,$r1,$d1)
addu $h2,$h2,$at
sltu $at,$h2,$at
multu ($r0,$d2) # d2*r0
addu $h3,$h3,$at
mflo ($at,$r0,$d2)
mfhi ($t0,$r0,$d2)
addu $h2,$h2,$a3
addu $h3,$h3,$t1
multu ($rs3,$d3) # d3*s3
sltu $a3,$h2,$a3
addu $h3,$h3,$a3
mflo ($a3,$rs3,$d3)
mfhi ($t1,$rs3,$d3)
addu $h2,$h2,$at
addu $h3,$h3,$t0
multu ($rs2,$h4) # h4*s2
sltu $at,$h2,$at
addu $h3,$h3,$at
mflo ($at,$rs2,$h4)
addu $h2,$h2,$a3
addu $h3,$h3,$t1
multu ($r3,$d0) # d0*r3
sltu $a3,$h2,$a3
addu $h3,$h3,$a3
mflo ($a3,$r3,$d0)
mfhi ($t1,$r3,$d0)
addu $h2,$h2,$at
sltu $at,$h2,$at
multu ($r2,$d1) # d1*r2
addu $h3,$h3,$at
mflo ($at,$r2,$d1)
mfhi ($t0,$r2,$d1)
addu $h3,$h3,$a3
sltu $a3,$h3,$a3
multu ($r0,$d3) # d3*r0
addu $t1,$t1,$a3
mflo ($a3,$r0,$d3)
mfhi ($d3,$r0,$d3)
addu $h3,$h3,$at
addu $t1,$t1,$t0
multu ($r1,$d2) # d2*r1
sltu $at,$h3,$at
addu $t1,$t1,$at
mflo ($at,$r1,$d2)
mfhi ($t0,$r1,$d2)
addu $h3,$h3,$a3
addu $t1,$t1,$d3
multu ($rs3,$h4) # h4*s3
sltu $a3,$h3,$a3
addu $t1,$t1,$a3
mflo ($a3,$rs3,$h4)
addu $h3,$h3,$at
addu $t1,$t1,$t0
multu ($r0,$h4) # h4*r0
sltu $at,$h3,$at
addu $t1,$t1,$at
mflo ($h4,$r0,$h4)
addu $h3,$h3,$a3
sltu $a3,$h3,$a3
addu $t1,$t1,$a3
addu $h4,$h4,$t1
li $padbit,1 # if we loop, padbit is 1
#endif
bne $inp,$len,.Loop
sw $h0,0($ctx) # store hash value
sw $h1,4($ctx)
sw $h2,8($ctx)
sw $h3,12($ctx)
sw $h4,16($ctx)
.set noreorder
.Labort:
lw $s11,4*11($sp)
lw $s10,4*10($sp)
lw $s9, 4*9($sp)
lw $s8, 4*8($sp)
lw $s7, 4*7($sp)
lw $s6, 4*6($sp)
lw $s5, 4*5($sp)
lw $s4, 4*4($sp)
___
# 32-bit poly1305_blocks epilogue, second part.
# $s0-$s3 are only restored for NUBI flavours (mirrors the prologue).
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
lw $s3, 4*3($sp)
lw $s2, 4*2($sp)
lw $s1, 4*1($sp)
lw $s0, 4*0($sp)
___
# Return.  The stack-pointer adjustment is emitted after "jr $ra" while
# ".set noreorder" is still in effect, i.e. it sits in the branch delay
# slot, and releases the 4*12 bytes allocated in the prologue.
$code.=<<___;
jr $ra
addu $sp,$sp,4*12
.end poly1305_blocks
___
}
{
# 32-bit poly1305_emit generator.
# Arguments: $ctx (state written by poly1305_blocks), $mac (16-byte
# output buffer) and $nonce (read as four 32-bit words).  $a3 is not an
# argument here and serves as the extra scratch register $tmp4.
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
# The generated function:
#  - loads the five hash limbs from 0..16($ctx); once they are loaded the
#    $ctx register itself is reused as a carry/scratch register;
#  - folds the bits above the low two of the top limb back into the low
#    limbs (final reduction);
#  - computes hash + 5 and, using a mask derived from the resulting
#    carry, selects between the two candidates without branching;
#  - adds the four nonce words with carry propagation;
#  - stores the result to $mac one byte at a time, least significant
#    byte of each word first.
# It finishes by emitting the identification string into .rdata.
$code.=<<___;
.align 5
.globl poly1305_emit
.ent poly1305_emit
poly1305_emit:
.frame $sp,0,$ra
.set reorder
lw $tmp4,16($ctx)
lw $tmp0,0($ctx)
lw $tmp1,4($ctx)
lw $tmp2,8($ctx)
lw $tmp3,12($ctx)
li $in0,-4 # final reduction
srl $ctx,$tmp4,2
and $in0,$in0,$tmp4
andi $tmp4,$tmp4,3
addu $ctx,$ctx,$in0
addu $tmp0,$tmp0,$ctx
sltu $ctx,$tmp0,$ctx
addiu $in0,$tmp0,5 # compare to modulus
addu $tmp1,$tmp1,$ctx
sltiu $in1,$in0,5
sltu $ctx,$tmp1,$ctx
addu $in1,$in1,$tmp1
addu $tmp2,$tmp2,$ctx
sltu $in2,$in1,$tmp1
sltu $ctx,$tmp2,$ctx
addu $in2,$in2,$tmp2
addu $tmp3,$tmp3,$ctx
sltu $in3,$in2,$tmp2
sltu $ctx,$tmp3,$ctx
addu $in3,$in3,$tmp3
addu $tmp4,$tmp4,$ctx
sltu $ctx,$in3,$tmp3
addu $ctx,$tmp4
srl $ctx,2 # see if it carried/borrowed
subu $ctx,$zero,$ctx
xor $in0,$tmp0
xor $in1,$tmp1
xor $in2,$tmp2
xor $in3,$tmp3
and $in0,$ctx
and $in1,$ctx
and $in2,$ctx
and $in3,$ctx
xor $in0,$tmp0
xor $in1,$tmp1
xor $in2,$tmp2
xor $in3,$tmp3
lw $tmp0,0($nonce) # load nonce
lw $tmp1,4($nonce)
lw $tmp2,8($nonce)
lw $tmp3,12($nonce)
addu $in0,$tmp0 # accumulate nonce
sltu $ctx,$in0,$tmp0
addu $in1,$tmp1
sltu $tmp1,$in1,$tmp1
addu $in1,$ctx
sltu $ctx,$in1,$ctx
addu $ctx,$tmp1
addu $in2,$tmp2
sltu $tmp2,$in2,$tmp2
addu $in2,$ctx
sltu $ctx,$in2,$ctx
addu $ctx,$tmp2
addu $in3,$tmp3
addu $in3,$ctx
srl $tmp0,$in0,8 # write mac value
srl $tmp1,$in0,16
srl $tmp2,$in0,24
sb $in0, 0($mac)
sb $tmp0,1($mac)
srl $tmp0,$in1,8
sb $tmp1,2($mac)
srl $tmp1,$in1,16
sb $tmp2,3($mac)
srl $tmp2,$in1,24
sb $in1, 4($mac)
sb $tmp0,5($mac)
srl $tmp0,$in2,8
sb $tmp1,6($mac)
srl $tmp1,$in2,16
sb $tmp2,7($mac)
srl $tmp2,$in2,24
sb $in2, 8($mac)
sb $tmp0,9($mac)
srl $tmp0,$in3,8
sb $tmp1,10($mac)
srl $tmp1,$in3,16
sb $tmp2,11($mac)
srl $tmp2,$in3,24
sb $in3, 12($mac)
sb $tmp0,13($mac)
sb $tmp1,14($mac)
sb $tmp2,15($mac)
jr $ra
.end poly1305_emit
.rdata
.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align 2
___
}
}}}
# Write out the generated assembly.  The last remaining command-line
# argument, when present and true, names the output file; otherwise the
# code goes to the STDOUT inherited from the caller.
$output = pop;
if ($output) {
    # Three-argument open takes the file name literally (no mode characters
    # or surrounding whitespace are interpreted), and a failure to open the
    # file is fatal instead of being ignored.
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
print $code;
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failed close must fail the build rather than leave a
# truncated assembly file behind.
close(STDOUT) or die "error closing STDOUT: $!";