aboutsummaryrefslogblamecommitdiffstatshomepage
path: root/src/crypto/zinc/chacha20/chacha20-mips.S
blob: 031ee5e794df750e7026f2855d307c8bc115804e (plain) (tree)
1
2
3
4
5
6
7
8
9

                                             



                                                                                        
                                    
                                  
                                  








                   

                   
                   




                   
                                                                                



                      

                     



                           












                                                                        
                           
 
                           
















                                          



























































                                                             
                 



                                          
                
                                     
                 








                                                           
                 


                                          
                
                                     
                 


                                       





                                                                           

                                         
                            
                                                          
                        
                                                   
                
                                                    
                 
                       
 

                                           
                            
                                                            
                        
                                                   
                
                                                    
                 
                       















                                                       



                     
              
                                    
 

                                
                               
                                          
 
                                  
 
                        







                            
 
                                       
                                                   
           

                                     
 

                                  

                                       






                                        
                        












                              

                            


                              
 
                          
                       







                                                       
                                              
 
                                             
 

                                                      
 

                                                       
 

                                                            
 
                                        
 

                                            
 

                                                
 

                                                
 









                                    
 

                                                           
 


                               
 








                                                         
 

                                                    
 
                                 


                                            
                                                         



                                   

                                                           
 
                  
 

                                     
 
 


                                                       
 

                                                              
 
                                          
 

                                            
 



                                                           
                                           
                                               

                                  
                                                
                          
                    

                          
                       
                        

                         
                             


                            
                          
                                             

                         
                             

                           
                          
                                             



                           
                          
                                        
 


























                                                           

                                       

                  
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

/* Mask selecting bits 2..5 of a byte count: applied to the length of a
 * partial block it yields the size, in bytes, of the whole 32-bit words.
 */
#define MASK_U32		0x3c
/* Number of bytes handled per loop iteration (16 words of 4 bytes). */
#define CHACHA20_BLOCK_SIZE	64
/* Stack frame size: eight 4-byte slots, used to save $s0-$s7. */
#define STACK_SIZE		32

/* Register allocation for the working copy of the sixteen state words.
 * X0-X10 use registers that are not saved by this function ($t0-$t9, $v1);
 * X11-X15 use $s6-$s2, which the function saves on entry and reloads on exit.
 */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Scratch registers T0/T1 live in $s1/$s0. Both are reloaded from the stack
 * on exit, so no clear data is left behind in them.
 */
#define T0	$s1
#define T1	$s0
/* Build a register alias from an index by token pasting: X(3) -> X3. */
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments.
 * STATE: pointer to the sixteen 32-bit state words (read at offsets 0..60;
 *        the word at offset 48 is written back on exit).
 * OUT:   destination buffer.
 * IN:    source buffer.
 * BYTES: number of bytes to process.
 */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* NONCE_0 is the running copy of the state word at offset 48.
 * It is kept in a register and incremented once per processed block, so the
 * value in memory is left alone while the main loop runs; the updated value
 * is stored back to STATE when the function finishes.
 */
#define NONCE_0		$v0

/* SAVED_CA and SAVED_X are only used for a final partial block.
 * SAVED_CA is loaded with the state word that belongs to the first
 * incomplete output word right before dispatching through the jump table;
 * the selected jump table entry then computes SAVED_X = Xn + SAVED_CA
 * (Xn + NONCE_0 for word 12). SAVED_X provides the keystream for the
 * trailing bytes that do not fill a whole word.
 * Both use registers which are reloaded on exit, so we don't leak clear data.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Non-zero when IN or OUT is not 4-byte aligned. Shares $s7 with SAVED_CA,
 * which is only loaded once the final partial block is being handled.
 */
#define IS_UNALIGNED	$s7

/* Endianness helpers.
 * MSB/LSB:        byte offsets added to a word address for the lwl/swl (MSB)
 *                 and lwr/swr (LSB) halves of an unaligned word access.
 * ROTx:           rotate used in the tail code to move the next keystream
 *                 byte of SAVED_X into the low byte before it is stored.
 * ROTR(n):        extra rotate applied to SAVED_X before the first tail byte
 *                 on big-endian; expands to nothing on little-endian.
 * CPU_TO_LE32(n): byte-swaps register n on big-endian (swap the bytes within
 *                 each halfword, then rotate by 16); expands to nothing on
 *                 little-endian.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

/* Expand x(n) once for every state word index, in ascending order 0..15. */
#define FOR_EACH_WORD(x) \
	x( 0); x( 1); x( 2); x( 3); \
	x( 4); x( 5); x( 6); x( 7); \
	x( 8); x( 9); x(10); x(11); \
	x(12); x(13); x(14); x(15);

/* Expand x(n) once for every state word index, in descending order 15..0. */
#define FOR_EACH_WORD_REV(x) \
	x(15); x(14); x(13); x(12); \
	x(11); x(10); x( 9); x( 8); \
	x( 7); x( 6); x( 5); x( 4); \
	x( 3); x( 2); x( 1); x( 0);

/* PLUS_ONE(x) maps a word index 0..15 to the literal token x + 1.
 * A lookup table is used instead of arithmetic because the result is pasted
 * into label names by the STORE_* macros, which needs a single token.
 */
#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
/* Paste three tokens together. The extra level of indirection makes sure
 * macro arguments such as PLUS_ONE(x) are expanded before they are pasted.
 */
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/* STORE_UNALIGNED(x): produce output word x when IN and/or OUT is not
 * 4-byte aligned.
 *
 * Defines the label .Lchacha20_mips_xor_unaligned_<x+1>_b in front of the
 * code, so when the macro is expanded for x = 15..0 a jump to label n
 * processes words n-1 down to 0.
 *
 * Steps: load state word x into T0 (skipped for word 12, whose value is
 * taken from NONCE_0), load input word x into T1 with lwl/lwr, add the state
 * word to Xx, convert to little-endian, XOR with the input word and store the
 * result to OUT with swl/swr.
 * Clobbers T0, T1 and Xx.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/* STORE_ALIGNED(x): produce output word x when both IN and OUT are 4-byte
 * aligned.
 *
 * Defines the label .Lchacha20_mips_xor_aligned_<x+1>_b in front of the
 * code, so when the macro is expanded for x = 15..0 a jump to label n
 * processes words n-1 down to 0.
 *
 * Steps: load state word x into T0 (skipped for word 12, whose value is
 * taken from NONCE_0), load input word x into T1, add the state word to Xx,
 * convert to little-endian, XOR with the input word and store it to OUT.
 * Clobbers T0, T1 and Xx.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table entry for the aligned path.
 * Used to set up and handle a final partial block: entry x is selected when
 * the partial block contains x whole words.
 * The entry branches to the store code for words x-1..0 and, in the branch
 * delay slot, computes SAVED_X = Xx + SAVED_CA (Xx + NONCE_0 for word 12),
 * i.e. the keystream word covering the trailing bytes.
 * X15 is free to hold that value (SAVED_X is X15).
 * Every jump table entry must be equal in size (one branch plus one delay
 * slot instruction), because the dispatcher indexes the table with a fixed
 * stride.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Jump table entry for the unaligned path.
 * Entry x is selected when the final partial block contains x whole words.
 * It branches to the unaligned store code for words x-1..0 and, in the
 * branch delay slot, computes SAVED_X = Xx + SAVED_CA (Xx + NONCE_0 for
 * word 12) for the trailing bytes.
 * Every entry must have the same size (branch plus delay slot instruction).
 */
#define JMPTBL_UNALIGNED(x) \
.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* AXR: four add / xor / rotate-left steps on the working state, written so
 * that the four lanes are interleaved instruction by instruction:
 *
 *   X(A) += X(K);  X(V) ^= X(A);  X(V) = rotl(X(V), S)
 *   X(B) += X(L);  X(W) ^= X(B);  X(W) = rotl(X(W), S)
 *   X(C) += X(M);  X(Y) ^= X(C);  X(Y) = rotl(X(Y), S)
 *   X(D) += X(N);  X(Z) ^= X(D);  X(Z) = rotl(X(Z), S)
 *
 * All arguments are state word indices except S, the rotate amount.
 * Destination registers are spelled out explicitly (three-operand form).
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(A), X(K); \
	addu	X(B), X(B), X(L); \
	addu	X(C), X(C), X(M); \
	addu	X(D), X(D), X(N); \
	xor	X(V), X(V), X(A); \
	xor	X(W), X(W), X(B); \
	xor	X(Y), X(Y), X(C); \
	xor	X(Z), X(Z), X(D); \
	rotl	X(V), X(V), S; \
	rotl	X(W), X(W), S; \
	rotl	X(Y), X(Y), S; \
	rotl	X(Z), X(Z), S;

/*
 * chacha20_mips - XOR a byte stream with the keystream generated from STATE.
 *
 * Inputs:
 *   STATE ($a0)  pointer to sixteen 32-bit state words. Words at offsets
 *                0..44 and 52..60 are only read; the word at offset 48 is
 *                read on entry and its advanced value is stored back on exit.
 *   OUT   ($a1)  destination buffer.
 *   IN    ($a2)  source buffer.
 *   BYTES ($a3)  number of bytes to process; 0 returns immediately.
 *
 * Every loop iteration loads a working copy of the state, runs 20 rounds on
 * it, adds the original state words back and XORs the result with up to
 * CHACHA20_BLOCK_SIZE input bytes. The word at offset 48 (NONCE_0) is
 * incremented once per block, including a final partial block.
 * Separate store paths exist for aligned and unaligned IN/OUT; a final
 * partial block is dispatched through a jump table to store only its whole
 * words, and up to three trailing bytes are then handled one by one.
 *
 * Register use: $s0-$s7 are saved on the stack and restored before return.
 * $at is used explicitly as round counter and as tail offset (hence
 * ".set noat"); $v0, $v1 and $t0-$t9 are overwritten.
 *
 * NOTE(review): the matching C prototype is not visible in this file -
 * confirm the argument order against the caller.
 */
.text
.set	reorder
.set	noat
.globl	chacha20_mips
.ent	chacha20_mips
chacha20_mips:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Nothing to do for BYTES == 0: leave before any $s register is used. */
	beqz	BYTES, .Lchacha20_mips_end

	/* Keep the state word at offset 48 in a register for the whole run. */
	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	/* $at counts the remaining rounds; it is decremented by 2 per pass. */
	li	$at, 20

	b	.Lchacha20_rounds_start

.align 4
.Loop_chacha20_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha20_rounds_start:
	/* Load a fresh working copy of the state into X0..X15. */
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	/* Word 12 comes from the register copy, not from memory. */
	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha20_xor_rounds:
	/* One pass performs two rounds: the first four AXR steps work on the
	 * word groups (0-3, 4-7, 8-11, 12-15) as they are, the last four on
	 * rotated groupings of the same words.
	 */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha20_xor_rounds

	/* Account for this block. From here on the sign of BYTES tells what is
	 * left: > 0 more data follows, == 0 this is the last full block,
	 * < 0 only a partial block remains.
	 */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha20_unaligned

	/* Reload the round counter for the next block here, so it can fill a
	 * delay slot.
	 */
	li	$at, 20

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha20_mips_no_full_block_aligned

	/* Full block: add the state, XOR with the input and store words 15..0.
	 * The labels defined inside this expansion are the entry points used
	 * by the aligned jump table.
	 */
	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha20_rounds

	/* Count the block that was just processed.
	 * Place this here to fill delay slot
	 */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes.
	 * This happens after entering the store code through the jump table
	 * with trailing bytes left over.
	 */
	bltz	BYTES, .Lchacha20_mips_xor_bytes

.Lchacha20_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha20_mips_end:
	/* Release the stack frame and return. */
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha20_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* $at = size in bytes of the whole words in the partial block */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset: insert $at shifted left by
	 * one, i.e. 8 bytes of jump table per whole word.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE: T1 points at the state word that belongs to the
	 * first incomplete output word.
	 */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value:
	 * BYTES = -(number of trailing bytes), in the range -3..0.
	 */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha20_unaligned:
	/* Reload the round counter for the next block here, so it can fill a
	 * delay slot.
	 */
	li	$at, 20

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha20_mips_no_full_block_unaligned

	/* Full block: same as the aligned case, but using lwl/lwr and swl/swr.
	 * The labels defined inside this expansion are the entry points used
	 * by the unaligned jump table.
	 */
	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha20_rounds

	/* Write NONCE_0 back to right location in state.
	 * NOTE(review): this stores the value from before the increment below;
	 * every path from here ends at .Lchacha20_mips_xor_done, which stores
	 * the incremented value to the same location, so this store looks
	 * redundant.
	 */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* BYTES == 0: done. Otherwise fall through to byte handling. */
	bgez	BYTES, .Lchacha20_mips_xor_done
.Lchacha20_mips_xor_unaligned_0_b:
.Lchacha20_mips_xor_aligned_0_b:
	/* Delay slot of the branch above, and also the jump table target for a
	 * partial block without any whole word: count the block.
	 */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha20_mips_xor_bytes:
	/* Here $at is the byte offset of the first trailing byte inside the
	 * block, BYTES is minus the number of trailing bytes (-1..-3) and
	 * SAVED_X holds the keystream word covering them.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	/* Exactly one trailing byte? Then we are done. */
	beqz	$at, .Lchacha20_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	/* Exactly two trailing bytes? Then we are done. */
	beqz	$at, .Lchacha20_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha20_mips_xor_done

.Lchacha20_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* $at = size in bytes of the whole words in the partial block */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset: insert $at shifted left by
	 * one, i.e. 8 bytes of jump table per whole word.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE: T1 points at the state word that belongs to the
	 * first incomplete output word.
	 */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value:
	 * BYTES = -(number of trailing bytes), in the range -3..0.
	 */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha20_mips
.set at