#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================
# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	~5.5/+130%	(big-endian)
# Octeon II	2.50/+70%	(little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows to omit dependency chain at the
# end of inner loop and improve performance. Also optimize MIPS32R2
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
#
#		IALU/gcc
# R1x000	~9.8/?		(big-endian)
# Octeon II	3.65/+140%	(little-endian)
# MT7621/1004K	4.75/?		(little-endian)
#
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
# NOTE: this is a code generator; the Perl scalars below hold register
# *names* that are interpolated into the assembly heredocs that follow.
# CRYPTOGAMS scripts intentionally run without "use strict" — these are
# file-scoped package globals shared by both code paths.
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0.
# Following coding rules facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
#
#
######################################################################

# Flavour selects ABI/word size of the generated assembly.
$flavour = shift || "64";	# supported flavours are o32,n32,64,nubi32,nubi64

# NUBI returns in $a0, classic ABIs in $v0 (aliased to $t0 above).
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;

if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
#else
# define dmultu(rs,rt)	dmultu	rs,rt
# define mflo(rd,rs,rt)	mflo	rd
# define mfhi(rd,rs,rt)	mfhi	rd
#endif

#ifdef	__KERNEL__
# define poly1305_init   poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit   poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi	$tmp0,$inp,7		# $inp % 8
	dsubu	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
	ld	$in0,0($inp)
	ld	$in1,8($inp)
	beqz	$tmp0,.Laligned_key
	ld	$tmp2,16($inp)

	subu	$tmp1,$zero,$tmp0
# ifdef	MIPSEB
	dsllv	$in0,$in0,$tmp0
	dsrlv	$tmp3,$in1,$tmp1
	dsllv	$in1,$in1,$tmp0
	dsrlv	$tmp2,$tmp2,$tmp1
# else
	dsrlv	$in0,$in0,$tmp0
	dsllv	$tmp3,$in1,$tmp1
	dsrlv	$in1,$in1,$tmp0
	dsllv	$tmp2,$tmp2,$tmp1
# endif
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2
.Laligned_key:
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32		# 0x0000000100000000
	daddiu	$tmp0,-63		# 0x00000000ffffffc1
	dsll	$tmp0,28		# 0x0ffffffc10000000
	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ?
"0x0003f000" : "0x00030000"; my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); my ($shr,$shl) = ($s6,$s7); # used on R6 $code.=<<___; .align 5 .globl poly1305_blocks .ent poly1305_blocks poly1305_blocks: .set noreorder dsrl $len,4 # number of complete blocks bnez $len,poly1305_blocks_internal nop jr $ra nop .end poly1305_blocks .align 5 .ent poly1305_blocks_internal poly1305_blocks_internal: .set noreorder #if defined(_MIPS_ARCH_MIPS64R6) .frame $sp,8*8,$ra .mask $SAVED_REGS_MASK|0x000c0000,-8 dsubu $sp,8*8 sd $s7,56($sp) sd $s6,48($sp) #else .frame $sp,6*8,$ra .mask $SAVED_REGS_MASK,-8 dsubu $sp,6*8 #endif sd $s5,40($sp) sd $s4,32($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue sd $s3,24($sp) sd $s2,16($sp) sd $s1,8($sp) sd $s0,0($sp) ___ $code.=<<___; .set reorder #if defined(_MIPS_ARCH_MIPS64R6) andi $shr,$inp,7 dsubu $inp,$inp,$shr # align $inp sll $shr,$shr,3 # byte to bit offset subu $shl,$zero,$shr #endif ld $h0,0($ctx) # load hash value ld $h1,8($ctx) ld $h2,16($ctx) ld $r0,24($ctx) # load key ld $r1,32($ctx) ld $rs1,40($ctx) dsll $len,4 daddu $len,$inp # end of buffer b .Loop .align 4 .Loop: #if defined(_MIPS_ARCH_MIPS64R6) ld $in0,0($inp) # load input ld $in1,8($inp) beqz $shr,.Laligned_inp ld $tmp2,16($inp) # ifdef MIPSEB dsllv $in0,$in0,$shr dsrlv $tmp3,$in1,$shl dsllv $in1,$in1,$shr dsrlv $tmp2,$tmp2,$shl # else dsrlv $in0,$in0,$shr dsllv $tmp3,$in1,$shl dsrlv $in1,$in1,$shr dsllv $tmp2,$tmp2,$shl # endif or $in0,$in0,$tmp3 or $in1,$in1,$tmp2 .Laligned_inp: #else ldl $in0,0+MSB($inp) # load input ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) ldr $in1,8+LSB($inp) #endif daddiu $inp,16 #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) dsbh $in0,$in0 # byte swap dsbh $in1,$in1 dshd $in0,$in0 dshd $in1,$in1 # else ori $tmp0,$zero,0xFF dsll $tmp2,$tmp0,32 or $tmp0,$tmp2 # 0x000000FF000000FF and $tmp1,$in0,$tmp0 # byte swap and $tmp3,$in1,$tmp0 dsrl $tmp2,$in0,24 dsrl $tmp4,$in1,24 dsll $tmp1,24 dsll 
$tmp3,24 and $tmp2,$tmp0 and $tmp4,$tmp0 dsll $tmp0,8 # 0x0000FF000000FF00 or $tmp1,$tmp2 or $tmp3,$tmp4 and $tmp2,$in0,$tmp0 and $tmp4,$in1,$tmp0 dsrl $in0,8 dsrl $in1,8 dsll $tmp2,8 dsll $tmp4,8 and $in0,$tmp0 and $in1,$tmp0 or $tmp1,$tmp2 or $tmp3,$tmp4 or $in0,$tmp1 or $in1,$tmp3 dsrl $tmp1,$in0,32 dsrl $tmp3,$in1,32 dsll $in0,32 dsll $in1,32 or $in0,$tmp1 or $in1,$tmp3 # endif #endif dsrl $tmp1,$h2,2 # modulo-scheduled reduction andi $h2,$h2,3 dsll $tmp0,$tmp1,2 daddu $d0,$h0,$in0 # accumulate input daddu $tmp1,$tmp0 sltu $tmp0,$d0,$h0 daddu $d0,$d0,$tmp1 # ... and residue sltu $tmp1,$d0,$tmp1 daddu $d1,$h1,$in1 daddu $tmp0,$tmp1 sltu $tmp1,$d1,$h1 daddu $d1,$tmp0 dmultu ($r0,$d0) # h0*r0 daddu $d2,$h2,$padbit sltu $tmp0,$d1,$tmp0 mflo ($h0,$r0,$d0) mfhi ($h1,$r0,$d0) dmultu ($rs1,$d1) # h1*5*r1 daddu $d2,$tmp1 daddu $d2,$tmp0 mflo ($tmp0,$rs1,$d1) mfhi ($tmp1,$rs1,$d1) dmultu ($r1,$d0) # h0*r1 mflo ($tmp2,$r1,$d0) mfhi ($h2,$r1,$d0) daddu $h0,$tmp0 daddu $h1,$tmp1 sltu $tmp0,$h0,$tmp0 dmultu ($r0,$d1) # h1*r0 daddu $h1,$tmp0 daddu $h1,$tmp2 mflo ($tmp0,$r0,$d1) mfhi ($tmp1,$r0,$d1) dmultu ($rs1,$d2) # h2*5*r1 sltu $tmp2,$h1,$tmp2 daddu $h2,$tmp2 mflo ($tmp2,$rs1,$d2) dmultu ($r0,$d2) # h2*r0 daddu $h1,$tmp0 daddu $h2,$tmp1 mflo ($tmp3,$r0,$d2) sltu $tmp0,$h1,$tmp0 daddu $h2,$tmp0 daddu $h1,$tmp2 sltu $tmp2,$h1,$tmp2 daddu $h2,$tmp2 daddu $h2,$tmp3 bne $inp,$len,.Loop sd $h0,0($ctx) # store hash value sd $h1,8($ctx) sd $h2,16($ctx) .set noreorder #if defined(_MIPS_ARCH_MIPS64R6) ld $s7,56($sp) ld $s6,48($sp) #endif ld $s5,40($sp) # epilogue ld $s4,32($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue ld $s3,24($sp) ld $s2,16($sp) ld $s1,8($sp) ld $s0,0($sp) ___ $code.=<<___; jr $ra #if defined(_MIPS_ARCH_MIPS64R6) daddu $sp,8*8 #else daddu $sp,6*8 #endif .end poly1305_blocks_internal ___ } { my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); $code.=<<___; .align 5 .globl poly1305_emit .ent poly1305_emit poly1305_emit: .frame $sp,0,$ra .set 
reorder ld $tmp2,16($ctx) ld $tmp0,0($ctx) ld $tmp1,8($ctx) li $in0,-4 # final reduction dsrl $in1,$tmp2,2 and $in0,$tmp2 andi $tmp2,$tmp2,3 daddu $in0,$in1 daddu $tmp0,$tmp0,$in0 sltu $in1,$tmp0,$in0 daddiu $in0,$tmp0,5 # compare to modulus daddu $tmp1,$tmp1,$in1 sltiu $tmp3,$in0,5 sltu $tmp4,$tmp1,$in1 daddu $in1,$tmp1,$tmp3 daddu $tmp2,$tmp2,$tmp4 sltu $tmp3,$in1,$tmp3 daddu $tmp2,$tmp2,$tmp3 dsrl $tmp2,2 # see if it carried/borrowed dsubu $tmp2,$zero,$tmp2 xor $in0,$tmp0 xor $in1,$tmp1 and $in0,$tmp2 and $in1,$tmp2 xor $in0,$tmp0 xor $in1,$tmp1 lwu $tmp0,0($nonce) # load nonce lwu $tmp1,4($nonce) lwu $tmp2,8($nonce) lwu $tmp3,12($nonce) dsll $tmp1,32 dsll $tmp3,32 or $tmp0,$tmp1 or $tmp2,$tmp3 daddu $in0,$tmp0 # accumulate nonce daddu $in1,$tmp2 sltu $tmp0,$in0,$tmp0 daddu $in1,$tmp0 dsrl $tmp0,$in0,8 # write mac value dsrl $tmp1,$in0,16 dsrl $tmp2,$in0,24 sb $in0,0($mac) dsrl $tmp3,$in0,32 sb $tmp0,1($mac) dsrl $tmp0,$in0,40 sb $tmp1,2($mac) dsrl $tmp1,$in0,48 sb $tmp2,3($mac) dsrl $tmp2,$in0,56 sb $tmp3,4($mac) dsrl $tmp3,$in1,8 sb $tmp0,5($mac) dsrl $tmp0,$in1,16 sb $tmp1,6($mac) dsrl $tmp1,$in1,24 sb $tmp2,7($mac) sb $in1,8($mac) dsrl $tmp2,$in1,32 sb $tmp3,9($mac) dsrl $tmp3,$in1,40 sb $tmp0,10($mac) dsrl $tmp0,$in1,48 sb $tmp1,11($mac) dsrl $tmp1,$in1,56 sb $tmp2,12($mac) sb $tmp3,13($mac) sb $tmp0,14($mac) sb $tmp1,15($mac) jr $ra .end poly1305_emit .rdata .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" .align 2 ___ } }}} else {{{ ###################################################################### # 32-bit code path # my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); $code.=<<___; #if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ defined(_MIPS_ARCH_MIPS32R6)) \\ && !defined(_MIPS_ARCH_MIPS32R2) # define _MIPS_ARCH_MIPS32R2 #endif #if defined(_MIPS_ARCH_MIPS32R6) # define multu(rs,rt) # define mflo(rd,rs,rt) mulu rd,rs,rt # define 
mfhi(rd,rs,rt) muhu rd,rs,rt #else # define multu(rs,rt) multu rs,rt # define mflo(rd,rs,rt) mflo rd # define mfhi(rd,rs,rt) mfhi rd #endif #ifdef __KERNEL__ # define poly1305_init poly1305_init_mips # define poly1305_blocks poly1305_blocks_mips # define poly1305_emit poly1305_emit_mips #endif #if defined(__MIPSEB__) && !defined(MIPSEB) # define MIPSEB #endif #ifdef MIPSEB # define MSB 0 # define LSB 3 #else # define MSB 3 # define LSB 0 #endif .text .set noat .set noreorder .align 5 .globl poly1305_init .ent poly1305_init poly1305_init: .frame $sp,0,$ra .set reorder sw $zero,0($ctx) sw $zero,4($ctx) sw $zero,8($ctx) sw $zero,12($ctx) sw $zero,16($ctx) beqz $inp,.Lno_key #if defined(_MIPS_ARCH_MIPS32R6) andi $tmp0,$inp,3 # $inp % 4 subu $inp,$inp,$tmp0 # align $inp sll $tmp0,$tmp0,3 # byte to bit offset lw $in0,0($inp) lw $in1,4($inp) lw $in2,8($inp) lw $in3,12($inp) beqz $tmp0,.Laligned_key lw $tmp2,16($inp) subu $tmp1,$zero,$tmp0 # ifdef MIPSEB sllv $in0,$in0,$tmp0 srlv $tmp3,$in1,$tmp1 sllv $in1,$in1,$tmp0 or $in0,$in0,$tmp3 srlv $tmp3,$in2,$tmp1 sllv $in2,$in2,$tmp0 or $in1,$in1,$tmp3 srlv $tmp3,$in3,$tmp1 sllv $in3,$in3,$tmp0 or $in2,$in2,$tmp3 srlv $tmp2,$tmp2,$tmp1 or $in3,$in3,$tmp2 # else srlv $in0,$in0,$tmp0 sllv $tmp3,$in1,$tmp1 srlv $in1,$in1,$tmp0 or $in0,$in0,$tmp3 sllv $tmp3,$in2,$tmp1 srlv $in2,$in2,$tmp0 or $in1,$in1,$tmp3 sllv $tmp3,$in3,$tmp1 srlv $in3,$in3,$tmp0 or $in2,$in2,$tmp3 sllv $tmp2,$tmp2,$tmp1 or $in3,$in3,$tmp2 # endif .Laligned_key: #else lwl $in0,0+MSB($inp) lwl $in1,4+MSB($inp) lwl $in2,8+MSB($inp) lwl $in3,12+MSB($inp) lwr $in0,0+LSB($inp) lwr $in1,4+LSB($inp) lwr $in2,8+LSB($inp) lwr $in3,12+LSB($inp) #endif #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS32R2) wsbh $in0,$in0 # byte swap wsbh $in1,$in1 wsbh $in2,$in2 wsbh $in3,$in3 rotr $in0,$in0,16 rotr $in1,$in1,16 rotr $in2,$in2,16 rotr $in3,$in3,16 # else srl $tmp0,$in0,24 # byte swap srl $tmp1,$in0,8 andi $tmp2,$in0,0xFF00 sll $in0,$in0,24 andi $tmp1,0xFF00 sll $tmp2,$tmp2,8 or 
$in0,$tmp0 srl $tmp0,$in1,24 or $tmp1,$tmp2 srl $tmp2,$in1,8 or $in0,$tmp1 andi $tmp1,$in1,0xFF00 sll $in1,$in1,24 andi $tmp2,0xFF00 sll $tmp1,$tmp1,8 or $in1,$tmp0 srl $tmp0,$in2,24 or $tmp2,$tmp1 srl $tmp1,$in2,8 or $in1,$tmp2 andi $tmp2,$in2,0xFF00 sll $in2,$in2,24 andi $tmp1,0xFF00 sll $tmp2,$tmp2,8 or $in2,$tmp0 srl $tmp0,$in3,24 or $tmp1,$tmp2 srl $tmp2,$in3,8 or $in2,$tmp1 andi $tmp1,$in3,0xFF00 sll $in3,$in3,24 andi $tmp2,0xFF00 sll $tmp1,$tmp1,8 or $in3,$tmp0 or $tmp2,$tmp1 or $in3,$tmp2 # endif #endif lui $tmp0,0x0fff ori $tmp0,0xffff # 0x0fffffff and $in0,$in0,$tmp0 subu $tmp0,3 # 0x0ffffffc and $in1,$in1,$tmp0 and $in2,$in2,$tmp0 and $in3,$in3,$tmp0 sw $in0,20($ctx) sw $in1,24($ctx) sw $in2,28($ctx) sw $in3,32($ctx) srl $tmp1,$in1,2 srl $tmp2,$in2,2 srl $tmp3,$in3,2 addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) addu $in2,$in2,$tmp2 addu $in3,$in3,$tmp3 sw $in1,36($ctx) sw $in2,40($ctx) sw $in3,44($ctx) .Lno_key: li $v0,0 jr $ra .end poly1305_init ___ { my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); my ($d0,$d1,$d2,$d3) = ($a4,$a5,$a6,$a7); my $shr = $t2; # used on R6 my $one = $t2; # used on R2 $code.=<<___; .globl poly1305_blocks .align 5 .ent poly1305_blocks poly1305_blocks: .frame $sp,16*4,$ra .mask $SAVED_REGS_MASK,-4 .set noreorder subu $sp, $sp,4*12 sw $s11,4*11($sp) sw $s10,4*10($sp) sw $s9, 4*9($sp) sw $s8, 4*8($sp) sw $s7, 4*7($sp) sw $s6, 4*6($sp) sw $s5, 4*5($sp) sw $s4, 4*4($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue sw $s3, 4*3($sp) sw $s2, 4*2($sp) sw $s1, 4*1($sp) sw $s0, 4*0($sp) ___ $code.=<<___; .set reorder srl $len,4 # number of complete blocks li $one,1 beqz $len,.Labort #if defined(_MIPS_ARCH_MIPS32R6) andi $shr,$inp,3 subu $inp,$inp,$shr # align $inp sll $shr,$shr,3 # byte to bit offset #endif lw $h0,0($ctx) # load hash value lw $h1,4($ctx) lw $h2,8($ctx) lw $h3,12($ctx) lw $h4,16($ctx) lw $r0,20($ctx) # load key lw $r1,24($ctx) lw $r2,28($ctx) lw $r3,32($ctx) lw $rs1,36($ctx) lw $rs2,40($ctx) lw $rs3,44($ctx) sll $len,4 addu $len,$len,$inp # end of buffer b .Loop .align 4 .Loop: #if defined(_MIPS_ARCH_MIPS32R6) lw $d0,0($inp) # load input lw $d1,4($inp) lw $d2,8($inp) lw $d3,12($inp) beqz $shr,.Laligned_inp lw $t0,16($inp) subu $t1,$zero,$shr # ifdef MIPSEB sllv $d0,$d0,$shr srlv $at,$d1,$t1 sllv $d1,$d1,$shr or $d0,$d0,$at srlv $at,$d2,$t1 sllv $d2,$d2,$shr or $d1,$d1,$at srlv $at,$d3,$t1 sllv $d3,$d3,$shr or $d2,$d2,$at srlv $t0,$t0,$t1 or $d3,$d3,$t0 # else srlv $d0,$d0,$shr sllv $at,$d1,$t1 srlv $d1,$d1,$shr or $d0,$d0,$at sllv $at,$d2,$t1 srlv $d2,$d2,$shr or $d1,$d1,$at sllv $at,$d3,$t1 srlv $d3,$d3,$shr or $d2,$d2,$at sllv $t0,$t0,$t1 or $d3,$d3,$t0 # endif .Laligned_inp: #else lwl $d0,0+MSB($inp) # load input lwl $d1,4+MSB($inp) lwl $d2,8+MSB($inp) lwl $d3,12+MSB($inp) lwr $d0,0+LSB($inp) lwr $d1,4+LSB($inp) lwr $d2,8+LSB($inp) lwr $d3,12+LSB($inp) 
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh	$d0,$d0			# byte swap
	wsbh	$d1,$d1
	wsbh	$d2,$d2
	wsbh	$d3,$d3
	rotr	$d0,$d0,16
	rotr	$d1,$d1,16
	rotr	$d2,$d2,16
	rotr	$d3,$d3,16
# else
	srl	$at,$d0,24		# byte swap
	srl	$t0,$d0,8
	andi	$t1,$d0,0xFF00
	sll	$d0,$d0,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d0,$at
	srl	$at,$d1,24
	or	$t0,$t1
	srl	$t1,$d1,8
	or	$d0,$t0
	andi	$t0,$d1,0xFF00
	sll	$d1,$d1,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d1,$at
	srl	$at,$d2,24
	or	$t1,$t0
	srl	$t0,$d2,8
	or	$d1,$t1
	andi	$t1,$d2,0xFF00
	sll	$d2,$d2,24
	andi	$t0,0xFF00
	sll	$t1,$t1,8
	or	$d2,$at
	srl	$at,$d3,24
	or	$t0,$t1
	srl	$t1,$d3,8
	or	$d2,$t0
	andi	$t0,$d3,0xFF00
	sll	$d3,$d3,24
	andi	$t1,0xFF00
	sll	$t0,$t0,8
	or	$d3,$at
	or	$t1,$t0
	or	$d3,$t1
# endif
#endif
	srl	$t0,$h4,2		# modulo-scheduled reduction
	andi	$h4,$h4,3
	sll	$at,$t0,2

	addu	$d0,$d0,$h0		# accumulate input
	addu	$t0,$t0,$at
	sltu	$h0,$d0,$h0
	addu	$d0,$d0,$t0		# ... and residue
	sltu	$at,$d0,$t0

	addu	$d1,$d1,$h1
	addu	$h0,$h0,$at		# carry
	sltu	$h1,$d1,$h1
	addu	$d1,$d1,$h0
	sltu	$h0,$d1,$h0

	addu	$d2,$d2,$h2
	addu	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addu	$d2,$d2,$h1
	sltu	$h1,$d2,$h1

	addu	$d3,$d3,$h3
	addu	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addu	$d3,$d3,$h2

#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
	multu	$r0,$d0			# d0*r0
	sltu	$h2,$d3,$h2
	maddu	$rs3,$d1		# d1*s3
	addu	$h3,$h3,$h2		# carry
	maddu	$rs2,$d2		# d2*s2
	addu	$h4,$h4,$padbit
	maddu	$rs1,$d3		# d3*s1
	addu	$h4,$h4,$h3
	mfhi	$at
	mflo	$h0

	multu	$r1,$d0			# d0*r1
	maddu	$r0,$d1			# d1*r0
	maddu	$rs3,$d2		# d2*s3
	maddu	$rs2,$d3		# d3*s2
	maddu	$rs1,$h4		# h4*s1
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h1

	multu	$r2,$d0			# d0*r2
	maddu	$r1,$d1			# d1*r1
	maddu	$r0,$d2			# d2*r0
	maddu	$rs3,$d3		# d3*s3
	maddu	$rs2,$h4		# h4*s2
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h2

	mul	$t0,$r0,$h4		# h4*r0

	multu	$r3,$d0			# d0*r3
	maddu	$r2,$d1			# d1*r2
	maddu	$r1,$d2			# d2*r1
	maddu	$r0,$d3			# d3*r0
	maddu	$rs3,$h4		# h4*s3
	maddu	$at,$one		# hi*1
	mfhi	$at
	mflo	$h3

	addiu	$inp,$inp,16

	addu	$h4,$t0,$at
#else
	multu	($r0,$d0)		# d0*r0
	mflo	($h0,$r0,$d0)
	mfhi	($h1,$r0,$d0)

	sltu	$h2,$d3,$h2
	addu	$h3,$h3,$h2		# carry

	multu	($rs3,$d1)		# d1*s3
	mflo	($at,$rs3,$d1)
	mfhi	($t0,$rs3,$d1)

	addu	$h4,$h4,$padbit
	addiu	$inp,$inp,16
	addu	$h4,$h4,$h3

	multu	($rs2,$d2)		# d2*s2
	mflo	($a3,$rs2,$d2)
	mfhi	($t1,$rs2,$d2)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($rs1,$d3)		# d3*s1
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$rs1,$d3)
	mfhi	($t0,$rs1,$d3)
	addu	$h0,$h0,$a3
	addu	$h1,$h1,$t1
	multu	($r1,$d0)		# d0*r1
	sltu	$a3,$h0,$a3
	addu	$h1,$h1,$a3

	mflo	($a3,$r1,$d0)
	mfhi	($h2,$r1,$d0)
	addu	$h0,$h0,$at
	addu	$h1,$h1,$t0
	multu	($r0,$d1)		# d1*r0
	sltu	$at,$h0,$at
	addu	$h1,$h1,$at

	mflo	($at,$r0,$d1)
	mfhi	($t0,$r0,$d1)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($rs3,$d2)		# d2*s3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs3,$d2)
	mfhi	($t1,$rs3,$d2)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($rs2,$d3)		# d3*s2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at

	mflo	($at,$rs2,$d3)
	mfhi	($t0,$rs2,$d3)
	addu	$h1,$h1,$a3
	addu	$h2,$h2,$t1
	multu	($rs1,$h4)		# h4*s1
	sltu	$a3,$h1,$a3
	addu	$h2,$h2,$a3

	mflo	($a3,$rs1,$h4)
	addu	$h1,$h1,$at
	addu	$h2,$h2,$t0
	multu	($r2,$d0)		# d0*r2
	sltu	$at,$h1,$at
	addu	$h2,$h2,$at

	mflo	($at,$r2,$d0)
	mfhi	($h3,$r2,$d0)
	addu	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	multu	($r1,$d1)		# d1*r1
	addu	$h2,$h2,$a3

	mflo	($a3,$r1,$d1)
	mfhi	($t1,$r1,$d1)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r0,$d2)		# d2*r0
	addu	$h3,$h3,$at

	mflo	($at,$r0,$d2)
	mfhi	($t0,$r0,$d2)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($rs3,$d3)		# d3*s3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3

	mflo	($a3,$rs3,$d3)
	mfhi	($t1,$rs3,$d3)
	addu	$h2,$h2,$at
	addu	$h3,$h3,$t0
	multu	($rs2,$h4)		# h4*s2
	sltu	$at,$h2,$at
	addu	$h3,$h3,$at

	mflo	($at,$rs2,$h4)
	addu	$h2,$h2,$a3
	addu	$h3,$h3,$t1
	multu	($r3,$d0)		# d0*r3
	sltu	$a3,$h2,$a3
	addu	$h3,$h3,$a3

	mflo	($a3,$r3,$d0)
	mfhi	($t1,$r3,$d0)
	addu	$h2,$h2,$at
	sltu	$at,$h2,$at
	multu	($r2,$d1)		# d1*r2
	addu	$h3,$h3,$at

	mflo	($at,$r2,$d1)
	mfhi	($t0,$r2,$d1)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	multu	($r0,$d3)		# d3*r0
	addu	$t1,$t1,$a3

	mflo	($a3,$r0,$d3)
	mfhi	($d3,$r0,$d3)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r1,$d2)		# d2*r1
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at

	mflo	($at,$r1,$d2)
	mfhi	($t0,$r1,$d2)
	addu	$h3,$h3,$a3
	addu	$t1,$t1,$d3
	multu	($rs3,$h4)		# h4*s3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3

	mflo	($a3,$rs3,$h4)
	addu	$h3,$h3,$at
	addu	$t1,$t1,$t0
	multu	($r0,$h4)		# h4*r0
	sltu	$at,$h3,$at
	addu	$t1,$t1,$at

	mflo	($h4,$r0,$h4)
	addu	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addu	$t1,$t1,$a3
	addu	$h4,$h4,$t1

	li	$padbit,1		# if we loop, padbit is 1
#endif
	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	.set	noreorder
.Labort:
	lw	$s11,4*11($sp)
	lw	$s10,4*10($sp)
	lw	$s9, 4*9($sp)
	lw	$s8, 4*8($sp)
	lw	$s7, 4*7($sp)
	lw	$s6, 4*6($sp)
	lw	$s5, 4*5($sp)
	lw	$s4, 4*4($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	lw	$s3, 4*3($sp)
	lw	$s2, 4*2($sp)
	lw	$s1, 4*1($sp)
	lw	$s0, 4*0($sp)
___
$code.=<<___;
	jr	$ra
	addu	$sp,$sp,4*12
.end	poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	li	$in0,-4			# final reduction
	srl	$ctx,$tmp4,2
	and	$in0,$in0,$tmp4
	andi	$tmp4,$tmp4,3
	addu	$ctx,$ctx,$in0

	addu	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiu	$in0,$tmp0,5		# compare to modulus
	addu	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addu	$in1,$in1,$tmp1
	addu	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addu	$in2,$in2,$tmp2
	addu	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addu	$in3,$in3,$tmp3
	addu	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3

	addu	$ctx,$tmp4
	srl	$ctx,2			# see if it carried/borrowed
	subu	$ctx,$zero,$ctx

	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3
	and	$in0,$ctx
	and	$in1,$ctx
	and	$in2,$ctx
	and	$in3,$ctx
	xor	$in0,$tmp0
	xor	$in1,$tmp1
	xor	$in2,$tmp2
	xor	$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addu	$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addu	$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addu	$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addu	$ctx,$tmp1

	addu	$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addu	$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addu	$ctx,$tmp2

	addu	$in3,$tmp3
	addu	$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
.align	2
___
}
}}}

# Emit the generated assembly.  If an output file name was supplied as the
# last command-line argument, redirect STDOUT to it; otherwise print to the
# existing STDOUT.  Three-arg open avoids mode injection via the file name,
# and the close is checked because buffered write errors only surface here.
if ($output = pop @ARGV) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";