From 2e99d194d4036bc378cc757cbd5f841a6d7dc43b Mon Sep 17 00:00:00 2001
From: René van Dorst
Date: Tue, 25 Sep 2018 14:04:35 +0200
Subject: chacha20-mips32r2: reduce stack and branches in loop, refactor jumptable handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: René van Dorst
---
 src/crypto/zinc/chacha20/chacha20-mips.S | 173 +++++++++++++++++--------------
 1 file changed, 94 insertions(+), 79 deletions(-)

diff --git a/src/crypto/zinc/chacha20/chacha20-mips.S b/src/crypto/zinc/chacha20/chacha20-mips.S
index 8796da3..031ee5e 100644
--- a/src/crypto/zinc/chacha20/chacha20-mips.S
+++ b/src/crypto/zinc/chacha20/chacha20-mips.S
@@ -5,9 +5,8 @@
  */
 
 #define MASK_U32 0x3c
-#define MASK_BYTES 0x03
 #define CHACHA20_BLOCK_SIZE 64
-#define STACK_SIZE 40
+#define STACK_SIZE 32
 
 #define X0 $t0
 #define X1 $t1
@@ -19,7 +18,7 @@
 #define X7 $t7
 #define X8 $t8
 #define X9 $t9
-#define X10 $s7
+#define X10 $v1
 #define X11 $s6
 #define X12 $s5
 #define X13 $s4
@@ -49,10 +48,9 @@
  * They are used to handling the last bytes which are not multiple of 4.
 */
 #define SAVED_X X15
-#define SAVED_CA $fp
+#define SAVED_CA $s7
 
-#define PTR_LAST_ROUND $v1
-#define IS_UNALIGNED $fp
+#define IS_UNALIGNED $s7
 
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 #define MSB 0
@@ -212,12 +210,9 @@ chacha20_mips:
	/* Return bytes = 0. */
	beqz	BYTES, .Lchacha20_mips_end
 
-	/* Calculate PTR_LAST_ROUND */
-	addiu	PTR_LAST_ROUND, BYTES, -1
-	ins	PTR_LAST_ROUND, $zero, 0, 6
-	addu	PTR_LAST_ROUND, OUT
+	lw	NONCE_0, 48(STATE)
 
-	/* Save s0-s7, fp */
+	/* Save s0-s7 */
	sw	$s0, 0($sp)
	sw	$s1, 4($sp)
	sw	$s2, 8($sp)
@@ -226,9 +221,6 @@ chacha20_mips:
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)
-	sw	$fp, 32($sp)
-
-	lw	NONCE_0, 48(STATE)
 
	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3
 
-	andi	BYTES, (CHACHA20_BLOCK_SIZE-1)
+	/* Set number of rounds */
+	li	$at, 20
 
	b	.Lchacha20_rounds_start
 
@@ -266,7 +259,6 @@
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)
 
-	li	$at, 20
.Loop_chacha20_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
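The hunk below carries the main restructuring. Where the old code precomputed PTR_LAST_ROUND and compared output pointers on every block, BYTES itself now goes negative on the final, partial block, so one bltz detects the tail and one bgtz decides whether to loop. A rough C model of that control flow, with illustrative names only (model_loop is not from this patch):

    #include <stdio.h>

    #define CHACHA20_BLOCK_SIZE 64

    static void model_loop(long bytes)
    {
        for (;;) {
            /* ... the 20 ChaCha rounds run here ... */
            bytes -= CHACHA20_BLOCK_SIZE;    /* addiu BYTES, -(CHACHA20_BLOCK_SIZE) */
            if (bytes < 0) {                 /* bltz: only a partial block remains */
                printf("partial block: %ld tail bytes\n",
                       bytes + CHACHA20_BLOCK_SIZE);
                return;
            }
            /* FOR_EACH_WORD_REV(STORE_*): store one full 64-byte block */
            printf("full block\n");
            if (bytes > 0)                   /* bgtz: more input, loop again */
                continue;
            return;                          /* bytes == 0: input was block-aligned */
        }
    }

    int main(void)
    {
        model_loop(150);    /* prints two full blocks, then a 22-byte tail */
        return 0;
    }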
@@ -279,110 +271,107 @@
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha20_xor_rounds
 
-	andi	$at, BYTES, MASK_U32
+	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)
 
	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha20_unaligned
 
-	/* Load upper half of jump table addr */
-	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+	/* Set number of rounds here to fill delay slot. */
+	li	$at, 20
 
-	/* Last round? No, do a full block. */
-	bne	OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_16_b
+	/* BYTES < 0, it has no full block. */
+	bltz	BYTES, .Lchacha20_mips_no_full_block_aligned
 
-	/* Calculate lower half jump table offset */
-	ins	T0, $at, 1, 6
+	FOR_EACH_WORD_REV(STORE_ALIGNED)
 
-	/* Full block? Jump */
-	beqz	BYTES, .Lchacha20_mips_xor_aligned_16_b
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha20_rounds
 
-	/* Add STATE with offset */
-	addu	T1, STATE, $at
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
 
-	/* Add lower half jump table addr */
-	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+	/* BYTES < 0? Handle last bytes */
+	bltz	BYTES, .Lchacha20_mips_xor_bytes
 
-	/* Read value from STATE */
-	lw	SAVED_CA, 0(T1)
+.Lchacha20_mips_xor_done:
+	/* Restore used registers */
+	lw	$s0, 0($sp)
+	lw	$s1, 4($sp)
+	lw	$s2, 8($sp)
+	lw	$s3, 12($sp)
+	lw	$s4, 16($sp)
+	lw	$s5, 20($sp)
+	lw	$s6, 24($sp)
+	lw	$s7, 28($sp)
 
-	jr	T0
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
 
-.Loop_chacha20_unaligned:
-	/* Load upper half of jump table addr */
-	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+.Lchacha20_mips_end:
+	addiu	$sp, STACK_SIZE
+	jr	$ra
 
-	/* Last round? No, do a full block. */
-	bne	OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_16_b
+.Lchacha20_mips_no_full_block_aligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
 
	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6
 
-	/* Full block? Jump */
-	beqz	BYTES, .Lchacha20_mips_xor_unaligned_16_b
-
-	/* Add STATE with offset */
+	/* Add offset to STATE */
	addu	T1, STATE, $at
 
	/* Add lower half jump table addr */
-	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
 
	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)
 
-	jr	T0
+	/* Store remaining byte counter as negative value */
+	subu	BYTES, $at, BYTES
 
-/* Aligned code path */
-.align 4
-	FOR_EACH_WORD_REV(STORE_ALIGNED)
-
-	bne	OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+	jr	T0
 
-	/* Increase NONCE_0, return NONCE_0 value */
-	addiu	NONCE_0, 1
-	bne	$at, BYTES, .Lchacha20_mips_xor_bytes
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_ALIGNED)
 
-.Lchacha20_mips_xor_done:
-	/* Write NONCE_0 back to right location in state */
-	sw	NONCE_0, 48(STATE)
-	/* Restore used registers */
-	lw	$s0, 0($sp)
-	lw	$s1, 4($sp)
-	lw	$s2, 8($sp)
-	lw	$s3, 12($sp)
-	lw	$s4, 16($sp)
-	lw	$s5, 20($sp)
-	lw	$s6, 24($sp)
-	lw	$s7, 28($sp)
-	lw	$fp, 32($sp)
-.Lchacha20_mips_end:
-	addiu	$sp, STACK_SIZE
-	jr	$ra
 
-	/* Jump table */
-	FOR_EACH_WORD(JMPTBL_ALIGNED)
+.Loop_chacha20_unaligned:
+	/* Set number of rounds here to fill delay slot. */
+	li	$at, 20
 
-	/* Unaligned code path */
+	/* BYTES < 0, it has no full block. */
+	bltz	BYTES, .Lchacha20_mips_no_full_block_unaligned
 
	FOR_EACH_WORD_REV(STORE_UNALIGNED)
 
-	bne	OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha20_rounds
 
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+	.set noreorder
	/* Fall through to byte handling */
-	.set noreorder
-	beq	$at, BYTES, .Lchacha20_mips_xor_done
-	/* Empty delayslot, increase NONCE_0, return NONCE_0 value */
+	bgez	BYTES, .Lchacha20_mips_xor_done
.Lchacha20_mips_xor_unaligned_0_b:
.Lchacha20_mips_xor_aligned_0_b:
+	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
-	.set reorder
+	.set reorder
 
.Lchacha20_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
-	andi	$at, BYTES, 2
+	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
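Both no_full_block paths dispatch into their jump table the same way: $at = BYTES & MASK_U32 is the tail length rounded down to a whole word, and the shift by one in `ins T0, $at, 1, 6` implies eight bytes of handler code per stored word, spliced into the %hi/%lo-assembled table base; `subu BYTES, $at, BYTES` then leaves the last 0-3 bytes as a negative count. A self-contained C sketch of that arithmetic (values and names are illustrative, not from the kernel source):

    #include <stdint.h>
    #include <stdio.h>

    #define MASK_U32 0x3c

    int main(void)
    {
        uint32_t bytes = 23;                  /* tail of the last block, < 64     */
        uint32_t at = bytes & MASK_U32;       /* andi $at, BYTES, MASK_U32        */
        uint32_t entry = at << 1;             /* ins T0, $at, 1, 6: 8 bytes of    */
                                              /* handler code per 4 bytes of data */
        int32_t tail = (int32_t)at - (int32_t)bytes; /* subu BYTES, $at, BYTES    */

        printf("full words: %u, jump-table byte offset: %u\n", at / 4, entry);
        printf("tail bytes, stored as a negative count: %d\n", tail);
        return 0;
    }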
@@ -390,7 +379,7 @@
	beqz	$at, .Lchacha20_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
-	andi	$at, BYTES, 1
+	addiu	$at, BYTES, 2
	ROTx SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
@@ -402,7 +391,33 @@
	beqz	$at, .Lchacha20_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha20_mips_xor_done
 
-.Lchacha20_mips_jmptbl_unaligned:
+.Lchacha20_mips_no_full_block_unaligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining byte counter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha20_mips
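Keeping the tail length negative in BYTES is what lets the byte handling above drop the old mask constants: after the first byte, $at = BYTES + 1 is zero exactly when one byte remained; after the second, $at = BYTES + 2 is zero when two remained. A minimal C model of those termination tests (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        for (int tail = 1; tail <= 3; tail++) {
            int bytes = -tail;           /* negative count left by subu */
            int last;

            if (bytes + 1 == 0)          /* addiu $at, BYTES, 1; beqz $at */
                last = 1;
            else if (bytes + 2 == 0)     /* addiu $at, BYTES, 2; beqz $at */
                last = 2;
            else                         /* third byte, then b ..._xor_done */
                last = 3;
            printf("tail of %d byte(s): stop after byte %d\n", tail, last);
        }
        return 0;
    }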