author     Jason A. Donenfeld <Jason@zx2c4.com>    2018-09-23 17:34:34 +0200
committer  Jason A. Donenfeld <Jason@zx2c4.com>    2018-09-23 17:34:39 +0200
commit     bcec13ea5a01f88e4765ef22636b356df0938fa3 (patch)
tree       90ec997ba151185a450f6897f35650ab83358e0f /src
parent     crypto-arm: rework KERNEL_MODE_NEON handling again (diff)
chacha20-arm: switch to entirely Andy Polyakov's implementation
Diffstat (limited to 'src')
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm-glue.h |   25
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm.S      | 1063
2 files changed, 287 insertions, 801 deletions
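Both the scalar routine removed below and the CRYPTOGAMS code that replaces it compute the standard ChaCha quarter round, but avoid explicit rotate instructions by folding each rotation into the ror-shifted operand of a later add/eor (the removed comments track the deferred amounts as brot/drot; Polyakov's code calls it a "twist"). As a point of reference only, here is a minimal plain-C sketch of the quarter round that the assembly implements; the helper names are illustrative and do not appear anywhere in the tree.

    #include <stdint.h>

    /* Rotate a 32-bit word left by n bits (1 <= n <= 31). */
    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One ChaCha quarter round over state words a, b, c, d.  The ARM code in
     * the diff performs the same adds and xors, but instead of rotating d and
     * b in place it leaves them with the "wrong" rotation and fixes them up
     * through the ror #N operand of the next instruction that consumes them. */
    static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
    {
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 8);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 7);
    }

A double round applies this first to the four columns and then to the four diagonals of the 4x4 state, which is what the removed _doubleround macro and the .Loop body in the new code both unroll.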
diff --git a/src/crypto/zinc/chacha20/chacha20-arm-glue.h b/src/crypto/zinc/chacha20/chacha20-arm-glue.h
index 1f1add0..2fd4348 100644
--- a/src/crypto/zinc/chacha20/chacha20-arm-glue.h
+++ b/src/crypto/zinc/chacha20/chacha20-arm-glue.h
@@ -12,9 +12,6 @@
asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
const u32 key[8], const u32 counter[4]);
-#if defined(CONFIG_ARM)
-asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
-#endif
#if defined(CONFIG_KERNEL_MODE_NEON)
asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
const u32 key[8], const u32 counter[4]);
@@ -60,27 +57,5 @@ static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
const u8 *key, simd_context_t *simd_context)
{
-#if defined(CONFIG_ARM)
- u32 x[] = { CHACHA20_CONSTANT_EXPA,
- CHACHA20_CONSTANT_ND_3,
- CHACHA20_CONSTANT_2_BY,
- CHACHA20_CONSTANT_TE_K,
- get_unaligned_le32(key + 0),
- get_unaligned_le32(key + 4),
- get_unaligned_le32(key + 8),
- get_unaligned_le32(key + 12),
- get_unaligned_le32(key + 16),
- get_unaligned_le32(key + 20),
- get_unaligned_le32(key + 24),
- get_unaligned_le32(key + 28),
- get_unaligned_le32(nonce + 0),
- get_unaligned_le32(nonce + 4),
- get_unaligned_le32(nonce + 8),
- get_unaligned_le32(nonce + 12)
- };
- hchacha20_arm(x, (u32 *)derived_key);
- return true;
-#else
return false;
-#endif
}
diff --git a/src/crypto/zinc/chacha20/chacha20-arm.S b/src/crypto/zinc/chacha20/chacha20-arm.S
index 7ac2e26..da0d25d 100644
--- a/src/crypto/zinc/chacha20/chacha20-arm.S
+++ b/src/crypto/zinc/chacha20/chacha20-arm.S
@@ -1,476 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
- * Copyright (C) 2018 Google, Inc.
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-/*
- * The following scalar routine was written by Eric Biggers.
- *
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used. So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
- * 'ldrd' and one 'strd' instruction per round.
*
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions. This is faster than using explicit rotate
- * instructions. To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount. The rotation amount is then fixed up just in time
- * when the values are used. 'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
- // ChaCha state registers
- X0 .req r0
- X1 .req r1
- X2 .req r2
- X3 .req r3
- X4 .req r4
- X5 .req r5
- X6 .req r6
- X7 .req r7
- X8_X10 .req r8 // shared by x8 and x10
- X9_X11 .req r9 // shared by x9 and x11
- X12 .req r10
- X13 .req r11
- X14 .req r12
- X15 .req r14
-
-.Lexpand_32byte_k:
- // "expand 32-byte k"
- .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
-
-#ifdef __thumb2__
-# define adrl adr
-#endif
-
-.macro __rev out, in, t0, t1, t2
-.if __LINUX_ARM_ARCH__ >= 6
- rev \out, \in
-.else
- lsl \t0, \in, #24
- and \t1, \in, #0xff00
- and \t2, \in, #0xff0000
- orr \out, \t0, \in, lsr #24
- orr \out, \out, \t1, lsl #8
- orr \out, \out, \t2, lsr #8
-.endif
-.endm
-
-.macro _le32_bswap x, t0, t1, t2
-#ifdef __ARMEB__
- __rev \x, \x, \t0, \t1, \t2
-#endif
-.endm
-
-.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
- _le32_bswap \a, \t0, \t1, \t2
- _le32_bswap \b, \t0, \t1, \t2
- _le32_bswap \c, \t0, \t1, \t2
- _le32_bswap \d, \t0, \t1, \t2
-.endm
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
-
- // a += b; d ^= a; d = rol(d, 16);
- add \a1, \a1, \b1, ror #brot
- add \a2, \a2, \b2, ror #brot
- eor \d1, \a1, \d1, ror #drot
- eor \d2, \a2, \d2, ror #drot
- // drot == 32 - 16 == 16
-
- // c += d; b ^= c; b = rol(b, 12);
- add \c1, \c1, \d1, ror #16
- add \c2, \c2, \d2, ror #16
- eor \b1, \c1, \b1, ror #brot
- eor \b2, \c2, \b2, ror #brot
- // brot == 32 - 12 == 20
-
- // a += b; d ^= a; d = rol(d, 8);
- add \a1, \a1, \b1, ror #20
- add \a2, \a2, \b2, ror #20
- eor \d1, \a1, \d1, ror #16
- eor \d2, \a2, \d2, ror #16
- // drot == 32 - 8 == 24
-
- // c += d; b ^= c; b = rol(b, 7);
- add \c1, \c1, \d1, ror #24
- add \c2, \c2, \d2, ror #24
- eor \b1, \c1, \b1, ror #20
- eor \b2, \c2, \b2, ror #20
- // brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
- // column round
-
- // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
- _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
-
- // save (x8, x9); restore (x10, x11)
- __strd X8_X10, X9_X11, sp, 0
- __ldrd X8_X10, X9_X11, sp, 8
-
- // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
- _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
-
- .set brot, 25
- .set drot, 24
-
- // diagonal round
-
- // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
- _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
-
- // save (x10, x11); restore (x8, x9)
- __strd X8_X10, X9_X11, sp, 8
- __ldrd X8_X10, X9_X11, sp, 0
-
- // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
- _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute nrounds
- .set brot, 0
- .set drot, 0
- .rept \nrounds / 2
- _doubleround
- .endr
-.endm
-
-.macro _chacha nrounds
-
-.Lnext_block\@:
- // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
-
- // Do the core ChaCha permutation to update x0-x15.
- _chacha_permute \nrounds
-
- add sp, #8
- // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
- push {X8_X10, X9_X11, X12, X13, X14, X15}
-
- // Load (OUT, IN, LEN).
- ldr r14, [sp, #96]
- ldr r12, [sp, #100]
- ldr r11, [sp, #104]
-
- orr r10, r14, r12
-
- // Use slow path if fewer than 64 bytes remain.
- cmp r11, #64
- blt .Lxor_slowpath\@
-
- // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
- // ARMv6+, since ldmia and stmia (used below) still require alignment.
- tst r10, #3
- bne .Lxor_slowpath\@
-
- // Fast path: XOR 64 bytes of aligned data.
-
- // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // x0-x3
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
- ldmia r12!, {r8-r11}
- eor X0, X0, r8
- eor X1, X1, r9
- eor X2, X2, r10
- eor X3, X3, r11
- stmia r14!, {X0-X3}
-
- // x4-x7
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- ldmia r12!, {X0-X3}
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
- eor X4, X4, X0
- eor X5, X5, X1
- eor X6, X6, X2
- eor X7, X7, X3
- stmia r14!, {X4-X7}
-
- // x8-x15
- pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
- ldmia r12!, {r8-r11}
- eor r0, r0, r8 // x8
- eor r1, r1, r9 // x9
- eor r6, r6, r10 // x10
- eor r7, r7, r11 // x11
- stmia r14!, {r0,r1,r6,r7}
- ldmia r12!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
- ldr r9, [sp, #72] // load LEN
- eor r2, r2, r0 // x12
- eor r3, r3, r1 // x13
- eor r4, r4, r6 // x14
- eor r5, r5, r7 // x15
- subs r9, #64 // decrement and check LEN
- stmia r14!, {r2-r5}
-
- beq .Ldone\@
-
-.Lprepare_for_next_block\@:
-
- // Stack: x0-x15 OUT IN LEN
-
- // Increment block counter (x12)
- add r8, #1
-
- // Store updated (OUT, IN, LEN)
- str r14, [sp, #64]
- str r12, [sp, #68]
- str r9, [sp, #72]
-
- mov r14, sp
-
- // Store updated block counter (x12)
- str r8, [sp, #48]
-
- sub sp, #16
-
- // Reload state and do next block
- ldmia r14!, {r0-r11} // load x0-x11
- __strd r10, r11, sp, 8 // store x10-x11 before state
- ldmia r14, {r10-r12,r14} // load x12-x15
- b .Lnext_block\@
-
-.Lxor_slowpath\@:
- // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
- // We handle it by storing the 64 bytes of keystream to the stack, then
- // XOR-ing the needed portion with the data.
-
- // Allocate keystream buffer
- sub sp, #64
- mov r14, sp
-
- // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Save keystream for x0-x3
- __ldrd r8, r9, sp, 96
- __ldrd r10, r11, sp, 104
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
- stmia r14!, {X0-X3}
-
- // Save keystream for x4-x7
- __ldrd r8, r9, sp, 112
- __ldrd r10, r11, sp, 120
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
- add r8, sp, #64
- stmia r14!, {X4-X7}
-
- // Save keystream for x8-x15
- ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 128
- __ldrd r10, r11, sp, 136
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
- stmia r14!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 144
- __ldrd r10, r11, sp, 152
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
- stmia r14, {r2-r5}
-
- // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
- // Registers: r8 is block counter, r12 is IN.
-
- ldr r9, [sp, #168] // LEN
- ldr r14, [sp, #160] // OUT
- cmp r9, #64
- mov r0, sp
- movle r1, r9
- movgt r1, #64
- // r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
- orr r2, r12, r14
- tst r2, #3 // IN or OUT misaligned?
- bne .Lxor_next_byte\@
-.endif
-
- // XOR a word at a time
-.rept 16
- subs r1, #4
- blt .Lxor_words_done\@
- ldr r2, [r12], #4
- ldr r3, [r0], #4
- eor r2, r2, r3
- str r2, [r14], #4
-.endr
- b .Lxor_slowpath_done\@
-.Lxor_words_done\@:
- ands r1, r1, #3
- beq .Lxor_slowpath_done\@
-
- // XOR a byte at a time
-.Lxor_next_byte\@:
- ldrb r2, [r12], #1
- ldrb r3, [r0], #1
- eor r2, r2, r3
- strb r2, [r14], #1
- subs r1, #1
- bne .Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
- subs r9, #64
- add sp, #96
- bgt .Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm // _chacha
-
-/*
- * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
- * const u32 iv[4]);
- */
-ENTRY(chacha20_arm)
- cmp r2, #0 // len == 0?
- bxeq lr
-
- push {r0-r2,r4-r11,lr}
-
- // Push state x0-x15 onto stack.
- // Also store an extra copy of x10-x11 just before the state.
-
- ldr r4, [sp, #48] // iv
- mov r0, sp
- sub sp, #80
-
- // iv: x12-x15
- ldm r4, {X12,X13,X14,X15}
- stmdb r0!, {X12,X13,X14,X15}
-
- // key: x4-x11
- __ldrd X8_X10, X9_X11, r3, 24
- __strd X8_X10, X9_X11, sp, 8
- stmdb r0!, {X8_X10, X9_X11}
- ldm r3, {X4-X9_X11}
- stmdb r0!, {X4-X9_X11}
-
- // constants: x0-x3
- adrl X3, .Lexpand_32byte_k
- ldm X3, {X0-X3}
- __strd X0, X1, sp, 16
- __strd X2, X3, sp, 24
-
- _chacha 20
-
- add sp, #76
- pop {r4-r11, pc}
-ENDPROC(chacha20_arm)
-
-/*
- * void hchacha20_arm(const u32 state[16], u32 out[8]);
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
-ENTRY(hchacha20_arm)
- push {r1,r4-r11,lr}
-
- mov r14, r0
- ldmia r14!, {r0-r11} // load x0-x11
- push {r10-r11} // store x10-x11 to stack
- ldm r14, {r10-r12,r14} // load x12-x15
- sub sp, #8
-
- _chacha_permute 20
-
- // Skip over (unused0-unused1, x10-x11)
- add sp, #16
-
- // Fix up rotations of x12-x15
- ror X12, X12, #drot
- ror X13, X13, #drot
- pop {r4} // load 'out'
- ror X14, X14, #drot
- ror X15, X15, #drot
-
- // Store (x0-x3,x12-x15) to 'out' after byte swapping
- _le32_bswap_4x X0, X1, X2, X3, X4, X5, X6
- _le32_bswap_4x X12, X13, X14, X15, X4, X5, X6
- stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
- pop {r4-r11,pc}
-ENDPROC(hchacha20_arm)
-
-#ifdef CONFIG_KERNEL_MODE_NEON
-/*
- * This following NEON routine was ported from Andy Polyakov's implementation
- * from CRYPTOGAMS. It begins with parts of the CRYPTOGAMS scalar routine,
- * since certain NEON code paths actually branch to it.
- */
+#include <linux/linkage.h>
.text
#if defined(__thumb2__) || defined(__clang__)
@@ -491,9 +27,18 @@ ENDPROC(hchacha20_arm)
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
.Lone:
.long 1,0,0,0
-.word -1
+.word -1
.align 5
+ENTRY(chacha20_arm)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_arm
.Lshort:
ldmia r12,{r4-r7} @ load counter and nonce
sub sp,sp,#4*(16) @ off-load area
@@ -519,9 +64,17 @@ ENDPROC(hchacha20_arm)
str r14, [sp,#4*(32+0)] @ save out
.Loop_outer_enter:
ldr r11, [sp,#4*(15)]
+ mov r4,r4,ror#19 @ twist b[0..3]
ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ mov r5,r5,ror#19
ldr r10, [sp,#4*(13)]
+ mov r6,r6,ror#19
ldr r14,[sp,#4*(14)]
+ mov r7,r7,ror#19
+ mov r11,r11,ror#8 @ twist d[0..3]
+ mov r12,r12,ror#8
+ mov r10,r10,ror#8
+ mov r14,r14,ror#8
str r11, [sp,#4*(16+15)]
mov r11,#10
b .Loop
@@ -529,114 +82,82 @@ ENDPROC(hchacha20_arm)
.align 4
.Loop:
subs r11,r11,#1
- add r0,r0,r4
- mov r12,r12,ror#16
- add r1,r1,r5
- mov r10,r10,ror#16
- eor r12,r12,r0,ror#16
- eor r10,r10,r1,ror#16
- add r8,r8,r12
- mov r4,r4,ror#20
- add r9,r9,r10
- mov r5,r5,ror#20
- eor r4,r4,r8,ror#20
- eor r5,r5,r9,ror#20
- add r0,r0,r4
- mov r12,r12,ror#24
- add r1,r1,r5
- mov r10,r10,ror#24
- eor r12,r12,r0,ror#24
- eor r10,r10,r1,ror#24
- add r8,r8,r12
- mov r4,r4,ror#25
- add r9,r9,r10
- mov r5,r5,ror#25
+ add r0,r0,r4,ror#13
+ add r1,r1,r5,ror#13
+ eor r12,r0,r12,ror#24
+ eor r10,r1,r10,ror#24
+ add r8,r8,r12,ror#16
+ add r9,r9,r10,ror#16
+ eor r4,r8,r4,ror#13
+ eor r5,r9,r5,ror#13
+ add r0,r0,r4,ror#20
+ add r1,r1,r5,ror#20
+ eor r12,r0,r12,ror#16
+ eor r10,r1,r10,ror#16
+ add r8,r8,r12,ror#24
str r10,[sp,#4*(16+13)]
+ add r9,r9,r10,ror#24
ldr r10,[sp,#4*(16+15)]
- eor r4,r4,r8,ror#25
- eor r5,r5,r9,ror#25
str r8,[sp,#4*(16+8)]
- ldr r8,[sp,#4*(16+10)]
- add r2,r2,r6
- mov r14,r14,ror#16
+ eor r4,r4,r8,ror#12
str r9,[sp,#4*(16+9)]
+ eor r5,r5,r9,ror#12
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6,ror#13
ldr r9,[sp,#4*(16+11)]
- add r3,r3,r7
- mov r10,r10,ror#16
- eor r14,r14,r2,ror#16
- eor r10,r10,r3,ror#16
- add r8,r8,r14
- mov r6,r6,ror#20
- add r9,r9,r10
- mov r7,r7,ror#20
- eor r6,r6,r8,ror#20
- eor r7,r7,r9,ror#20
- add r2,r2,r6
- mov r14,r14,ror#24
- add r3,r3,r7
- mov r10,r10,ror#24
- eor r14,r14,r2,ror#24
- eor r10,r10,r3,ror#24
- add r8,r8,r14
- mov r6,r6,ror#25
- add r9,r9,r10
- mov r7,r7,ror#25
- eor r6,r6,r8,ror#25
- eor r7,r7,r9,ror#25
- add r0,r0,r5
- mov r10,r10,ror#16
- add r1,r1,r6
- mov r12,r12,ror#16
- eor r10,r10,r0,ror#16
- eor r12,r12,r1,ror#16
- add r8,r8,r10
- mov r5,r5,ror#20
- add r9,r9,r12
- mov r6,r6,ror#20
- eor r5,r5,r8,ror#20
- eor r6,r6,r9,ror#20
- add r0,r0,r5
- mov r10,r10,ror#24
- add r1,r1,r6
- mov r12,r12,ror#24
- eor r10,r10,r0,ror#24
- eor r12,r12,r1,ror#24
- add r8,r8,r10
- mov r5,r5,ror#25
+ add r3,r3,r7,ror#13
+ eor r14,r2,r14,ror#24
+ eor r10,r3,r10,ror#24
+ add r8,r8,r14,ror#16
+ add r9,r9,r10,ror#16
+ eor r6,r8,r6,ror#13
+ eor r7,r9,r7,ror#13
+ add r2,r2,r6,ror#20
+ add r3,r3,r7,ror#20
+ eor r14,r2,r14,ror#16
+ eor r10,r3,r10,ror#16
+ add r8,r8,r14,ror#24
+ add r9,r9,r10,ror#24
+ eor r6,r6,r8,ror#12
+ eor r7,r7,r9,ror#12
+ add r0,r0,r5,ror#13
+ add r1,r1,r6,ror#13
+ eor r10,r0,r10,ror#24
+ eor r12,r1,r12,ror#24
+ add r8,r8,r10,ror#16
+ add r9,r9,r12,ror#16
+ eor r5,r8,r5,ror#13
+ eor r6,r9,r6,ror#13
+ add r0,r0,r5,ror#20
+ add r1,r1,r6,ror#20
+ eor r10,r0,r10,ror#16
+ eor r12,r1,r12,ror#16
str r10,[sp,#4*(16+15)]
+ add r8,r8,r10,ror#24
ldr r10,[sp,#4*(16+13)]
- add r9,r9,r12
- mov r6,r6,ror#25
- eor r5,r5,r8,ror#25
- eor r6,r6,r9,ror#25
+ add r9,r9,r12,ror#24
str r8,[sp,#4*(16+10)]
- ldr r8,[sp,#4*(16+8)]
- add r2,r2,r7
- mov r10,r10,ror#16
+ eor r5,r5,r8,ror#12
str r9,[sp,#4*(16+11)]
+ eor r6,r6,r9,ror#12
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7,ror#13
ldr r9,[sp,#4*(16+9)]
- add r3,r3,r4
- mov r14,r14,ror#16
- eor r10,r10,r2,ror#16
- eor r14,r14,r3,ror#16
- add r8,r8,r10
- mov r7,r7,ror#20
- add r9,r9,r14
- mov r4,r4,ror#20
- eor r7,r7,r8,ror#20
- eor r4,r4,r9,ror#20
- add r2,r2,r7
- mov r10,r10,ror#24
- add r3,r3,r4
- mov r14,r14,ror#24
- eor r10,r10,r2,ror#24
- eor r14,r14,r3,ror#24
- add r8,r8,r10
- mov r7,r7,ror#25
- add r9,r9,r14
- mov r4,r4,ror#25
- eor r7,r7,r8,ror#25
- eor r4,r4,r9,ror#25
+ add r3,r3,r4,ror#13
+ eor r10,r2,r10,ror#24
+ eor r14,r3,r14,ror#24
+ add r8,r8,r10,ror#16
+ add r9,r9,r14,ror#16
+ eor r7,r8,r7,ror#13
+ eor r4,r9,r4,ror#13
+ add r2,r2,r7,ror#20
+ add r3,r3,r4,ror#20
+ eor r10,r2,r10,ror#16
+ eor r14,r3,r14,ror#16
+ add r8,r8,r10,ror#24
+ add r9,r9,r14,ror#24
+ eor r7,r7,r8,ror#12
+ eor r4,r4,r9,ror#12
bne .Loop
ldr r11,[sp,#4*(32+2)] @ load len
@@ -651,7 +172,7 @@ ENDPROC(hchacha20_arm)
@ rx and second half at sp+4*(16+8)
cmp r11,#64 @ done yet?
-#ifdef __thumb2__
+#ifdef __thumb2__
itete lo
#endif
addlo r12,sp,#4*(0) @ shortcut or ...
@@ -676,7 +197,7 @@ ENDPROC(hchacha20_arm)
add r0,r0,r8 @ accumulate key material
add r1,r1,r9
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhs r8,[r12],#16 @ load input
@@ -684,7 +205,7 @@ ENDPROC(hchacha20_arm)
add r2,r2,r10
add r3,r3,r11
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhs r10,[r12,#-8]
@@ -695,14 +216,14 @@ ENDPROC(hchacha20_arm)
rev r2,r2
rev r3,r3
#endif
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r0,r0,r8 @ xor with input
eorhs r1,r1,r9
add r8,sp,#4*(4)
str r0,[r14],#16 @ store output
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r2,r2,r10
@@ -712,16 +233,16 @@ ENDPROC(hchacha20_arm)
str r2,[r14,#-8]
str r3,[r14,#-4]
- add r4,r4,r8 @ accumulate key material
- add r5,r5,r9
-#ifdef __thumb2__
+ add r4,r8,r4,ror#13 @ accumulate key material
+ add r5,r9,r5,ror#13
+#ifdef __thumb2__
itt hs
#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
- add r6,r6,r10
- add r7,r7,r11
-#ifdef __thumb2__
+ add r6,r10,r6,ror#13
+ add r7,r11,r7,ror#13
+#ifdef __thumb2__
itt hs
#endif
ldrhs r10,[r12,#-8]
@@ -732,14 +253,14 @@ ENDPROC(hchacha20_arm)
rev r6,r6
rev r7,r7
#endif
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r4,r4,r8
eorhs r5,r5,r9
add r8,sp,#4*(8)
str r4,[r14],#16 @ store output
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r6,r6,r10
@@ -754,19 +275,19 @@ ENDPROC(hchacha20_arm)
add r0,r0,r8 @ accumulate key material
add r1,r1,r9
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hi
#endif
strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
add r2,r2,r10
add r3,r3,r11
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhs r10,[r12,#-8]
@@ -777,14 +298,14 @@ ENDPROC(hchacha20_arm)
rev r2,r2
rev r3,r3
#endif
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r0,r0,r8
eorhs r1,r1,r9
add r8,sp,#4*(12)
str r0,[r14],#16 @ store output
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r2,r2,r10
@@ -794,21 +315,21 @@ ENDPROC(hchacha20_arm)
str r2,[r14,#-8]
str r3,[r14,#-4]
- add r4,r4,r8 @ accumulate key material
- add r5,r5,r9
-#ifdef __thumb2__
+ add r4,r8,r4,ror#24 @ accumulate key material
+ add r5,r9,r5,ror#24
+#ifdef __thumb2__
itt hi
#endif
addhi r8,r8,#1 @ next counter value
strhi r8,[sp,#4*(12)] @ save next counter value
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
- add r6,r6,r10
- add r7,r7,r11
-#ifdef __thumb2__
+ add r6,r10,r6,ror#24
+ add r7,r11,r7,ror#24
+#ifdef __thumb2__
itt hs
#endif
ldrhs r10,[r12,#-8]
@@ -819,23 +340,23 @@ ENDPROC(hchacha20_arm)
rev r6,r6
rev r7,r7
#endif
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r4,r4,r8
eorhs r5,r5,r9
-#ifdef __thumb2__
+#ifdef __thumb2__
it ne
#endif
ldrne r8,[sp,#4*(32+2)] @ re-load len
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
eorhs r6,r6,r10
eorhs r7,r7,r11
str r4,[r14],#16 @ store output
str r5,[r14,#-12]
-#ifdef __thumb2__
+#ifdef __thumb2__
it hs
#endif
subhs r11,r8,#64 @ len-=64
@@ -854,10 +375,10 @@ ENDPROC(hchacha20_arm)
#endif
#if __LINUX_ARM_ARCH__ < 7
ldr r11,[sp,#4*(3)]
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
- add r2,r2,r10
-#ifdef __thumb2__
+ add r0,r8,r0 @ accumulate key material
+ add r1,r9,r1
+ add r2,r10,r2
+#ifdef __thumb2__
itete lo
#endif
eorlo r8,r8,r8 @ zero or ...
@@ -865,8 +386,8 @@ ENDPROC(hchacha20_arm)
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
- add r3,r3,r11
-#ifdef __thumb2__
+ add r3,r11,r3
+#ifdef __thumb2__
itete lo
#endif
eorlo r10,r10,r10
@@ -876,7 +397,7 @@ ENDPROC(hchacha20_arm)
eor r0,r8,r0 @ xor with input (or zero)
eor r1,r9,r1
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-15] @ load more input
@@ -884,7 +405,7 @@ ENDPROC(hchacha20_arm)
eor r2,r10,r2
strb r0,[r14],#16 @ store output
eor r3,r11,r3
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-7]
@@ -893,7 +414,7 @@ ENDPROC(hchacha20_arm)
eor r0,r8,r0,lsr#8
strb r2,[r14,#-8]
eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-14] @ load more input
@@ -902,7 +423,7 @@ ENDPROC(hchacha20_arm)
eor r2,r10,r2,lsr#8
strb r0,[r14,#-15]
eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-6]
@@ -911,7 +432,7 @@ ENDPROC(hchacha20_arm)
eor r0,r8,r0,lsr#8
strb r2,[r14,#-7]
eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-13] @ load more input
@@ -920,7 +441,7 @@ ENDPROC(hchacha20_arm)
eor r2,r10,r2,lsr#8
strb r0,[r14,#-14]
eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-5]
@@ -939,10 +460,10 @@ ENDPROC(hchacha20_arm)
add r8,sp,#4*(4+0)
ldmia r8,{r8-r11} @ load key material
add r0,sp,#4*(16+8)
- add r4,r4,r8 @ accumulate key material
- add r5,r5,r9
- add r6,r6,r10
-#ifdef __thumb2__
+ add r4,r8,r4,ror#13 @ accumulate key material
+ add r5,r9,r5,ror#13
+ add r6,r10,r6,ror#13
+#ifdef __thumb2__
itete lo
#endif
eorlo r8,r8,r8 @ zero or ...
@@ -950,8 +471,8 @@ ENDPROC(hchacha20_arm)
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
- add r7,r7,r11
-#ifdef __thumb2__
+ add r7,r11,r7,ror#13
+#ifdef __thumb2__
itete lo
#endif
eorlo r10,r10,r10
@@ -961,7 +482,7 @@ ENDPROC(hchacha20_arm)
eor r4,r8,r4 @ xor with input (or zero)
eor r5,r9,r5
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-15] @ load more input
@@ -969,7 +490,7 @@ ENDPROC(hchacha20_arm)
eor r6,r10,r6
strb r4,[r14],#16 @ store output
eor r7,r11,r7
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-7]
@@ -978,7 +499,7 @@ ENDPROC(hchacha20_arm)
eor r4,r8,r4,lsr#8
strb r6,[r14,#-8]
eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-14] @ load more input
@@ -987,7 +508,7 @@ ENDPROC(hchacha20_arm)
eor r6,r10,r6,lsr#8
strb r4,[r14,#-15]
eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-6]
@@ -996,7 +517,7 @@ ENDPROC(hchacha20_arm)
eor r4,r8,r4,lsr#8
strb r6,[r14,#-7]
eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-13] @ load more input
@@ -1005,7 +526,7 @@ ENDPROC(hchacha20_arm)
eor r6,r10,r6,lsr#8
strb r4,[r14,#-14]
eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-5]
@@ -1024,15 +545,15 @@ ENDPROC(hchacha20_arm)
add r8,sp,#4*(4+4)
ldmia r8,{r8-r11} @ load key material
ldmia r0,{r0-r7} @ load second half
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hi
#endif
strhi r10,[sp,#4*(16+10)] @ copy "rx"
strhi r11,[sp,#4*(16+11)] @ copy "rx"
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
- add r2,r2,r10
-#ifdef __thumb2__
+ add r0,r8,r0 @ accumulate key material
+ add r1,r9,r1
+ add r2,r10,r2
+#ifdef __thumb2__
itete lo
#endif
eorlo r8,r8,r8 @ zero or ...
@@ -1040,8 +561,8 @@ ENDPROC(hchacha20_arm)
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
- add r3,r3,r11
-#ifdef __thumb2__
+ add r3,r11,r3
+#ifdef __thumb2__
itete lo
#endif
eorlo r10,r10,r10
@@ -1051,7 +572,7 @@ ENDPROC(hchacha20_arm)
eor r0,r8,r0 @ xor with input (or zero)
eor r1,r9,r1
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-15] @ load more input
@@ -1059,7 +580,7 @@ ENDPROC(hchacha20_arm)
eor r2,r10,r2
strb r0,[r14],#16 @ store output
eor r3,r11,r3
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-7]
@@ -1068,7 +589,7 @@ ENDPROC(hchacha20_arm)
eor r0,r8,r0,lsr#8
strb r2,[r14,#-8]
eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-14] @ load more input
@@ -1077,7 +598,7 @@ ENDPROC(hchacha20_arm)
eor r2,r10,r2,lsr#8
strb r0,[r14,#-15]
eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-6]
@@ -1086,7 +607,7 @@ ENDPROC(hchacha20_arm)
eor r0,r8,r0,lsr#8
strb r2,[r14,#-7]
eor r1,r9,r1,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-13] @ load more input
@@ -1095,7 +616,7 @@ ENDPROC(hchacha20_arm)
eor r2,r10,r2,lsr#8
strb r0,[r14,#-14]
eor r3,r11,r3,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-5]
@@ -1113,15 +634,15 @@ ENDPROC(hchacha20_arm)
strb r3,[r14,#-1]
add r8,sp,#4*(4+8)
ldmia r8,{r8-r11} @ load key material
- add r4,r4,r8 @ accumulate key material
-#ifdef __thumb2__
+ add r4,r8,r4,ror#24 @ accumulate key material
+#ifdef __thumb2__
itt hi
#endif
addhi r8,r8,#1 @ next counter value
strhi r8,[sp,#4*(12)] @ save next counter value
- add r5,r5,r9
- add r6,r6,r10
-#ifdef __thumb2__
+ add r5,r9,r5,ror#24
+ add r6,r10,r6,ror#24
+#ifdef __thumb2__
itete lo
#endif
eorlo r8,r8,r8 @ zero or ...
@@ -1129,8 +650,8 @@ ENDPROC(hchacha20_arm)
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
- add r7,r7,r11
-#ifdef __thumb2__
+ add r7,r11,r7,ror#24
+#ifdef __thumb2__
itete lo
#endif
eorlo r10,r10,r10
@@ -1140,7 +661,7 @@ ENDPROC(hchacha20_arm)
eor r4,r8,r4 @ xor with input (or zero)
eor r5,r9,r5
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-15] @ load more input
@@ -1148,7 +669,7 @@ ENDPROC(hchacha20_arm)
eor r6,r10,r6
strb r4,[r14],#16 @ store output
eor r7,r11,r7
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-7]
@@ -1157,7 +678,7 @@ ENDPROC(hchacha20_arm)
eor r4,r8,r4,lsr#8
strb r6,[r14,#-8]
eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-14] @ load more input
@@ -1166,7 +687,7 @@ ENDPROC(hchacha20_arm)
eor r6,r10,r6,lsr#8
strb r4,[r14,#-15]
eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-6]
@@ -1175,7 +696,7 @@ ENDPROC(hchacha20_arm)
eor r4,r8,r4,lsr#8
strb r6,[r14,#-7]
eor r5,r9,r5,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r8,[r12,#-13] @ load more input
@@ -1184,7 +705,7 @@ ENDPROC(hchacha20_arm)
eor r6,r10,r6,lsr#8
strb r4,[r14,#-14]
eor r7,r11,r7,lsr#8
-#ifdef __thumb2__
+#ifdef __thumb2__
itt hs
#endif
ldrhsb r10,[r12,#-5]
@@ -1200,11 +721,11 @@ ENDPROC(hchacha20_arm)
eor r7,r11,r7,lsr#8
strb r6,[r14,#-5]
strb r7,[r14,#-1]
-#ifdef __thumb2__
+#ifdef __thumb2__
it ne
#endif
ldrne r8,[sp,#4*(32+2)] @ re-load len
-#ifdef __thumb2__
+#ifdef __thumb2__
it hs
#endif
subhs r11,r8,#64 @ len-=64
@@ -1228,14 +749,17 @@ ENDPROC(hchacha20_arm)
.Ldone:
add sp,sp,#4*(32+3)
+.Lno_data_arm:
ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)
+#ifdef CONFIG_KERNEL_MODE_NEON
.align 5
.Lsigma2:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
.Lone2:
.long 1,0,0,0
-.word -1
+.word -1
.arch armv7-a
.fpu neon
@@ -1265,7 +789,8 @@ ENTRY(chacha20_neon)
add r12,sp,#4*8
ldmia r14,{r0-r3} @ load sigma
vld1.32 {q0},[r14]! @ load sigma
- vld1.32 {q12},[r14] @ one
+ vld1.32 {q12},[r14]! @ one
+ @ vld1.32 {d30},[r14] @ rot8
vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
@@ -1278,6 +803,7 @@ ENTRY(chacha20_neon)
vmov q4,q0
vstr d28,[sp,#4*(16+4)]
vmov q8,q0
+ @ vstr d30,[sp,#4*(16+6)]
vmov q5,q1
vmov q9,q1
b .Loop_neon_enter
@@ -1287,6 +813,7 @@ ENTRY(chacha20_neon)
ldmia sp,{r0-r9} @ load key material
cmp r11,#64*2 @ if len<=64*2
bls .Lbreak_neon @ switch to integer-only
+ @ vldr d30,[sp,#4*(16+6)] @ rot8
vmov q4,q0
str r11,[sp,#4*(32+2)] @ save len
vmov q8,q0
@@ -1296,237 +823,213 @@ ENTRY(chacha20_neon)
vmov q9,q1
.Loop_neon_enter:
ldr r11, [sp,#4*(15)]
+ mov r4,r4,ror#19 @ twist b[0..3]
vadd.i32 q7,q3,q12 @ counter+1
ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ mov r5,r5,ror#19
vmov q6,q2
ldr r10, [sp,#4*(13)]
+ mov r6,r6,ror#19
vmov q10,q2
ldr r14,[sp,#4*(14)]
+ mov r7,r7,ror#19
vadd.i32 q11,q7,q12 @ counter+2
+ add r12,r12,#3 @ counter+3
+ mov r11,r11,ror#8 @ twist d[0..3]
+ mov r12,r12,ror#8
+ mov r10,r10,ror#8
+ mov r14,r14,ror#8
str r11, [sp,#4*(16+15)]
mov r11,#10
- add r12,r12,#3 @ counter+3
b .Loop_neon
.align 4
.Loop_neon:
subs r11,r11,#1
vadd.i32 q0,q0,q1
- add r0,r0,r4
+ add r0,r0,r4,ror#13
vadd.i32 q4,q4,q5
- mov r12,r12,ror#16
+ add r1,r1,r5,ror#13
vadd.i32 q8,q8,q9
- add r1,r1,r5
+ eor r12,r0,r12,ror#24
veor q3,q3,q0
- mov r10,r10,ror#16
+ eor r10,r1,r10,ror#24
veor q7,q7,q4
- eor r12,r12,r0,ror#16
+ add r8,r8,r12,ror#16
veor q11,q11,q8
- eor r10,r10,r1,ror#16
+ add r9,r9,r10,ror#16
vrev32.16 q3,q3
- add r8,r8,r12
+ eor r4,r8,r4,ror#13
vrev32.16 q7,q7
- mov r4,r4,ror#20
+ eor r5,r9,r5,ror#13
vrev32.16 q11,q11
- add r9,r9,r10
+ add r0,r0,r4,ror#20
vadd.i32 q2,q2,q3
- mov r5,r5,ror#20
+ add r1,r1,r5,ror#20
vadd.i32 q6,q6,q7
- eor r4,r4,r8,ror#20
+ eor r12,r0,r12,ror#16
vadd.i32 q10,q10,q11
- eor r5,r5,r9,ror#20
+ eor r10,r1,r10,ror#16
veor q12,q1,q2
- add r0,r0,r4
+ add r8,r8,r12,ror#24
veor q13,q5,q6
- mov r12,r12,ror#24
+ str r10,[sp,#4*(16+13)]
veor q14,q9,q10
- add r1,r1,r5
+ add r9,r9,r10,ror#24
vshr.u32 q1,q12,#20
- mov r10,r10,ror#24
+ ldr r10,[sp,#4*(16+15)]
vshr.u32 q5,q13,#20
- eor r12,r12,r0,ror#24
+ str r8,[sp,#4*(16+8)]
vshr.u32 q9,q14,#20
- eor r10,r10,r1,ror#24
+ eor r4,r4,r8,ror#12
vsli.32 q1,q12,#12
- add r8,r8,r12
+ str r9,[sp,#4*(16+9)]
vsli.32 q5,q13,#12
- mov r4,r4,ror#25
+ eor r5,r5,r9,ror#12
vsli.32 q9,q14,#12
- add r9,r9,r10
+ ldr r8,[sp,#4*(16+10)]
vadd.i32 q0,q0,q1
- mov r5,r5,ror#25
+ add r2,r2,r6,ror#13
vadd.i32 q4,q4,q5
- str r10,[sp,#4*(16+13)]
+ ldr r9,[sp,#4*(16+11)]
vadd.i32 q8,q8,q9
- ldr r10,[sp,#4*(16+15)]
+ add r3,r3,r7,ror#13
veor q12,q3,q0
- eor r4,r4,r8,ror#25
+ eor r14,r2,r14,ror#24
veor q13,q7,q4
- eor r5,r5,r9,ror#25
+ eor r10,r3,r10,ror#24
veor q14,q11,q8
- str r8,[sp,#4*(16+8)]
+ add r8,r8,r14,ror#16
vshr.u32 q3,q12,#24
- ldr r8,[sp,#4*(16+10)]
+ add r9,r9,r10,ror#16
vshr.u32 q7,q13,#24
- add r2,r2,r6
+ eor r6,r8,r6,ror#13
vshr.u32 q11,q14,#24
- mov r14,r14,ror#16
+ eor r7,r9,r7,ror#13
vsli.32 q3,q12,#8
- str r9,[sp,#4*(16+9)]
+ add r2,r2,r6,ror#20
vsli.32 q7,q13,#8
- ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7,ror#20
vsli.32 q11,q14,#8
- add r3,r3,r7
+ eor r14,r2,r14,ror#16
vadd.i32 q2,q2,q3
- mov r10,r10,ror#16
+ eor r10,r3,r10,ror#16
vadd.i32 q6,q6,q7
- eor r14,r14,r2,ror#16
+ add r8,r8,r14,ror#24
vadd.i32 q10,q10,q11
- eor r10,r10,r3,ror#16
+ add r9,r9,r10,ror#24
veor q12,q1,q2
- add r8,r8,r14
+ eor r6,r6,r8,ror#12
veor q13,q5,q6
- mov r6,r6,ror#20
+ eor r7,r7,r9,ror#12
veor q14,q9,q10
- add r9,r9,r10
vshr.u32 q1,q12,#25
- mov r7,r7,ror#20
vshr.u32 q5,q13,#25
- eor r6,r6,r8,ror#20
vshr.u32 q9,q14,#25
- eor r7,r7,r9,ror#20
vsli.32 q1,q12,#7
- add r2,r2,r6
vsli.32 q5,q13,#7
- mov r14,r14,ror#24
vsli.32 q9,q14,#7
- add r3,r3,r7
vext.8 q2,q2,q2,#8
- mov r10,r10,ror#24
vext.8 q6,q6,q6,#8
- eor r14,r14,r2,ror#24
vext.8 q10,q10,q10,#8
- eor r10,r10,r3,ror#24
vext.8 q1,q1,q1,#4
- add r8,r8,r14
vext.8 q5,q5,q5,#4
- mov r6,r6,ror#25
vext.8 q9,q9,q9,#4
- add r9,r9,r10
vext.8 q3,q3,q3,#12
- mov r7,r7,ror#25
vext.8 q7,q7,q7,#12
- eor r6,r6,r8,ror#25
vext.8 q11,q11,q11,#12
- eor r7,r7,r9,ror#25
vadd.i32 q0,q0,q1
- add r0,r0,r5
+ add r0,r0,r5,ror#13
vadd.i32 q4,q4,q5
- mov r10,r10,ror#16
+ add r1,r1,r6,ror#13
vadd.i32 q8,q8,q9
- add r1,r1,r6
+ eor r10,r0,r10,ror#24
veor q3,q3,q0
- mov r12,r12,ror#16
+ eor r12,r1,r12,ror#24
veor q7,q7,q4
- eor r10,r10,r0,ror#16
+ add r8,r8,r10,ror#16
veor q11,q11,q8
- eor r12,r12,r1,ror#16
+ add r9,r9,r12,ror#16
vrev32.16 q3,q3
- add r8,r8,r10
+ eor r5,r8,r5,ror#13
vrev32.16 q7,q7
- mov r5,r5,ror#20
+ eor r6,r9,r6,ror#13
vrev32.16 q11,q11
- add r9,r9,r12
+ add r0,r0,r5,ror#20
vadd.i32 q2,q2,q3
- mov r6,r6,ror#20
+ add r1,r1,r6,ror#20
vadd.i32 q6,q6,q7
- eor r5,r5,r8,ror#20
+ eor r10,r0,r10,ror#16
vadd.i32 q10,q10,q11
- eor r6,r6,r9,ror#20
+ eor r12,r1,r12,ror#16
veor q12,q1,q2
- add r0,r0,r5
+ str r10,[sp,#4*(16+15)]
veor q13,q5,q6
- mov r10,r10,ror#24
+ add r8,r8,r10,ror#24
veor q14,q9,q10
- add r1,r1,r6
+ ldr r10,[sp,#4*(16+13)]
vshr.u32 q1,q12,#20
- mov r12,r12,ror#24
+ add r9,r9,r12,ror#24
vshr.u32 q5,q13,#20
- eor r10,r10,r0,ror#24
+ str r8,[sp,#4*(16+10)]
vshr.u32 q9,q14,#20
- eor r12,r12,r1,ror#24
+ eor r5,r5,r8,ror#12
vsli.32 q1,q12,#12
- add r8,r8,r10
+ str r9,[sp,#4*(16+11)]
vsli.32 q5,q13,#12
- mov r5,r5,ror#25
+ eor r6,r6,r9,ror#12
vsli.32 q9,q14,#12
- str r10,[sp,#4*(16+15)]
+ ldr r8,[sp,#4*(16+8)]
vadd.i32 q0,q0,q1
- ldr r10,[sp,#4*(16+13)]
+ add r2,r2,r7,ror#13
vadd.i32 q4,q4,q5
- add r9,r9,r12
+ ldr r9,[sp,#4*(16+9)]
vadd.i32 q8,q8,q9
- mov r6,r6,ror#25
+ add r3,r3,r4,ror#13
veor q12,q3,q0
- eor r5,r5,r8,ror#25
+ eor r10,r2,r10,ror#24
veor q13,q7,q4
- eor r6,r6,r9,ror#25
+ eor r14,r3,r14,ror#24
veor q14,q11,q8
- str r8,[sp,#4*(16+10)]
+ add r8,r8,r10,ror#16
vshr.u32 q3,q12,#24
- ldr r8,[sp,#4*(16+8)]
+ add r9,r9,r14,ror#16
vshr.u32 q7,q13,#24
- add r2,r2,r7
+ eor r7,r8,r7,ror#13
vshr.u32 q11,q14,#24
- mov r10,r10,ror#16
+ eor r4,r9,r4,ror#13
vsli.32 q3,q12,#8
- str r9,[sp,#4*(16+11)]
+ add r2,r2,r7,ror#20
vsli.32 q7,q13,#8
- ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4,ror#20
vsli.32 q11,q14,#8
- add r3,r3,r4
+ eor r10,r2,r10,ror#16
vadd.i32 q2,q2,q3
- mov r14,r14,ror#16
+ eor r14,r3,r14,ror#16
vadd.i32 q6,q6,q7
- eor r10,r10,r2,ror#16
+ add r8,r8,r10,ror#24
vadd.i32 q10,q10,q11
- eor r14,r14,r3,ror#16
+ add r9,r9,r14,ror#24
veor q12,q1,q2
- add r8,r8,r10
+ eor r7,r7,r8,ror#12
veor q13,q5,q6
- mov r7,r7,ror#20
+ eor r4,r4,r9,ror#12
veor q14,q9,q10
- add r9,r9,r14
vshr.u32 q1,q12,#25
- mov r4,r4,ror#20
vshr.u32 q5,q13,#25
- eor r7,r7,r8,ror#20
vshr.u32 q9,q14,#25
- eor r4,r4,r9,ror#20
vsli.32 q1,q12,#7
- add r2,r2,r7
vsli.32 q5,q13,#7
- mov r10,r10,ror#24
vsli.32 q9,q14,#7
- add r3,r3,r4
vext.8 q2,q2,q2,#8
- mov r14,r14,ror#24
vext.8 q6,q6,q6,#8
- eor r10,r10,r2,ror#24
vext.8 q10,q10,q10,#8
- eor r14,r14,r3,ror#24
vext.8 q1,q1,q1,#12
- add r8,r8,r10
vext.8 q5,q5,q5,#12
- mov r7,r7,ror#25
vext.8 q9,q9,q9,#12
- add r9,r9,r14
vext.8 q3,q3,q3,#4
- mov r4,r4,ror#25
vext.8 q7,q7,q7,#4
- eor r7,r7,r8,ror#25
vext.8 q11,q11,q11,#4
- eor r4,r4,r9,ror#25
bne .Loop_neon
add r11,sp,#32
@@ -1631,13 +1134,13 @@ ENTRY(chacha20_neon)
str r2,[r14,#-8]
str r3,[r14,#-4]
- add r4,r4,r8 @ accumulate key material
+ add r4,r8,r4,ror#13 @ accumulate key material
ldr r8,[r12],#16 @ load input
- add r5,r5,r9
+ add r5,r9,r5,ror#13
ldr r9,[r12,#-12]
- add r6,r6,r10
+ add r6,r10,r6,ror#13
ldr r10,[r12,#-8]
- add r7,r7,r11
+ add r7,r11,r7,ror#13
ldr r11,[r12,#-4]
#ifdef __ARMEB__
rev r4,r4
@@ -1663,13 +1166,13 @@ ENTRY(chacha20_neon)
ldr r8,[r12],#16 @ load input
add r1,r1,r9
ldr r9,[r12,#-12]
-#ifdef __thumb2__
+#ifdef __thumb2__
it hi
#endif
strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
add r2,r2,r10
ldr r10,[r12,#-8]
-#ifdef __thumb2__
+#ifdef __thumb2__
it hi
#endif
strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
@@ -1692,15 +1195,15 @@ ENTRY(chacha20_neon)
str r2,[r14,#-8]
str r3,[r14,#-4]
- add r4,r4,r8 @ accumulate key material
+ add r4,r8,r4,ror#24 @ accumulate key material
add r8,r8,#4 @ next counter value
- add r5,r5,r9
+ add r5,r9,r5,ror#24
str r8,[sp,#4*(12)] @ save next counter value
ldr r8,[r12],#16 @ load input
- add r6,r6,r10
+ add r6,r10,r6,ror#24
add r4,r4,#3 @ counter+3
ldr r9,[r12,#-12]
- add r7,r7,r11
+ add r7,r11,r7,ror#24
ldr r10,[r12,#-8]
ldr r11,[r12,#-4]
#ifdef __ARMEB__
@@ -1710,7 +1213,7 @@ ENTRY(chacha20_neon)
rev r7,r7
#endif
eor r4,r4,r8
-#ifdef __thumb2__
+#ifdef __thumb2__
it hi
#endif
ldrhi r8,[sp,#4*(32+2)] @ re-load len
@@ -1744,9 +1247,17 @@ ENTRY(chacha20_neon)
str r14,[sp,#4*(20+16+11)] @ copy "rx"
ldr r11, [sp,#4*(15)]
+ mov r4,r4,ror#19 @ twist b[0..3]
ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ mov r5,r5,ror#19
ldr r10, [sp,#4*(13)]
+ mov r6,r6,ror#19
ldr r14,[sp,#4*(14)]
+ mov r7,r7,ror#19
+ mov r11,r11,ror#8 @ twist d[0..3]
+ mov r12,r12,ror#8
+ mov r10,r10,ror#8
+ mov r14,r14,ror#8
str r11, [sp,#4*(20+16+15)]
add r11,sp,#4*(20)
vst1.32 {q0-q1},[r11]! @ copy key
@@ -1858,11 +1369,11 @@ ENTRY(chacha20_neon)
add r3,r3,r11
ldmia r8,{r8-r11} @ load key material
- add r4,r4,r8 @ accumulate key material
+ add r4,r8,r4,ror#13 @ accumulate key material
add r8,sp,#4*(8)
- add r5,r5,r9
- add r6,r6,r10
- add r7,r7,r11
+ add r5,r9,r5,ror#13
+ add r6,r10,r6,ror#13
+ add r7,r11,r7,ror#13
ldmia r8,{r8-r11} @ load key material
#ifdef __ARMEB__
rev r0,r0
@@ -1886,12 +1397,12 @@ ENTRY(chacha20_neon)
add r3,r3,r11
ldmia r8,{r8-r11} @ load key material
- add r4,r4,r8 @ accumulate key material
+ add r4,r8,r4,ror#24 @ accumulate key material
add r8,sp,#4*(8)
- add r5,r5,r9
+ add r5,r9,r5,ror#24
add r4,r4,#3 @ counter+3
- add r6,r6,r10
- add r7,r7,r11
+ add r6,r10,r6,ror#24
+ add r7,r11,r7,ror#24
ldr r11,[sp,#4*(32+2)] @ re-load len
#ifdef __ARMEB__
rev r0,r0