From 08825b907d933e80bba6077a545831978fa950f4 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Thu, 8 Nov 2018 17:08:22 +0100
Subject: chacha20,poly1305: switch to perlasm originals on mips and arm

We also separate out Eric Biggers' Cortex A7 implementation into its
own file.
---
 src/Makefile                                     |    2 +-
 src/crypto/Kbuild.include                        |    8 +-
 src/crypto/zinc/chacha20/chacha20-arm.S          | 1860 ---------------------
 src/crypto/zinc/chacha20/chacha20-arm.pl         | 1227 ++++++++++++++
 src/crypto/zinc/chacha20/chacha20-arm64.S        | 1942 ----------------------
 src/crypto/zinc/chacha20/chacha20-arm64.pl       | 1164 +++++++++++++
 src/crypto/zinc/chacha20/chacha20-unrolled-arm.S |  461 +++++
 src/crypto/zinc/poly1305/poly1305-arm.S          | 1117 -------------
 src/crypto/zinc/poly1305/poly1305-arm.pl         | 1272 ++++++++++++++
 src/crypto/zinc/poly1305/poly1305-arm64.S        |  824 ---------
 src/crypto/zinc/poly1305/poly1305-arm64.pl       |  972 +++++++++++
 src/crypto/zinc/poly1305/poly1305-mips64.S       |  360 ----
 src/crypto/zinc/poly1305/poly1305-mips64.pl      |  467 ++++++
 src/tests/qemu/Makefile                          |    2 +-
 14 files changed, 5572 insertions(+), 6106 deletions(-)
 delete mode 100644 src/crypto/zinc/chacha20/chacha20-arm.S
 create mode 100644 src/crypto/zinc/chacha20/chacha20-arm.pl
 delete mode 100644 src/crypto/zinc/chacha20/chacha20-arm64.S
 create mode 100644 src/crypto/zinc/chacha20/chacha20-arm64.pl
 create mode 100644 src/crypto/zinc/chacha20/chacha20-unrolled-arm.S
 delete mode 100644 src/crypto/zinc/poly1305/poly1305-arm.S
 create mode 100644 src/crypto/zinc/poly1305/poly1305-arm.pl
 delete mode 100644 src/crypto/zinc/poly1305/poly1305-arm64.S
 create mode 100644 src/crypto/zinc/poly1305/poly1305-arm64.pl
 delete mode 100644 src/crypto/zinc/poly1305/poly1305-mips64.S
 create mode 100644 src/crypto/zinc/poly1305/poly1305-mips64.pl

diff --git a/src/Makefile b/src/Makefile
index fcfa069..d26c6d1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,7 +52,7 @@ install:
 	@$(MAKE) -C tools install
 
 rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-DKMS_SOURCES := version.h Makefile Kbuild Kconfig dkms.conf $(filter-out version.h wireguard.mod.c tools/% tests/%,$(call rwildcard,,*.c *.h *.S *.include))
+DKMS_SOURCES := version.h Makefile Kbuild Kconfig dkms.conf $(filter-out version.h wireguard.mod.c tools/% tests/%,$(call rwildcard,,*.c *.h *.S *.pl *.include))
 dkms-install: $(DKMS_SOURCES)
 	@$(foreach f,$(DKMS_SOURCES),install -v -m0644 -D $(f) $(DESTDIR)$(DKMSDIR)/$(f);)
 
diff --git a/src/crypto/Kbuild.include b/src/crypto/Kbuild.include
index ab5ecea..5fb9445 100644
--- a/src/crypto/Kbuild.include
+++ b/src/crypto/Kbuild.include
@@ -16,7 +16,7 @@ endif
 
 zinc-y += chacha20/chacha20.o
 zinc-$(CONFIG_ZINC_ARCH_X86_64) += chacha20/chacha20-x86_64.o
-zinc-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o
+zinc-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o chacha20/chacha20-unrolled-arm.o
 zinc-$(CONFIG_ZINC_ARCH_ARM64) += chacha20/chacha20-arm64.o
 zinc-$(CONFIG_ZINC_ARCH_MIPS) += chacha20/chacha20-mips.o
 AFLAGS_chacha20-mips.o += -O2 # This is required to fill the branch delay slots
@@ -37,6 +37,12 @@ zinc-$(CONFIG_ZINC_ARCH_X86_64) += blake2s/blake2s-x86_64.o
 zinc-y += curve25519/curve25519.o
 zinc-$(CONFIG_ZINC_ARCH_ARM) += curve25519/curve25519-arm.o
 
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $< > $@
+%.S: %.pl
+	$(call cmd,perlasm)
+targets += $(patsubst %.o,crypto/zinc/%.S,$(zinc-y))
+
 wireguard-y += $(addprefix crypto/zinc/,$(zinc-y))
 
 ccflags-y += -I$(src)/crypto/include
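[Editor's note, not part of the patch: the Kbuild hunk above is the heart of the build-system change. Instead of shipping pregenerated .S files, each checked-in .pl perlasm source is run with $(PERL) and its standard output becomes the .S file handed to the assembler; the targets line registers the generated files with kbuild, and adding *.pl to DKMS_SOURCES in src/Makefile lets DKMS builds regenerate the assembly themselves. A minimal standalone sketch of the same rule follows, assuming GNU make and a perl interpreter on PATH; only the $(PERL) $< > $@ command and the chacha20-arm file names come from the commit, the rest is illustrative.]

# Standalone illustration (not part of the patch): regenerate a .S file from
# its perlasm source the same way the new Kbuild rule does.
PERL ?= perl

# The perlasm script prints the assembly on stdout; the shell redirect
# captures it into the target, mirroring "cmd_perlasm = $(PERL) $< > $@".
%.S: %.pl
	$(PERL) $< > $@

# Example use: "make" (or "make chacha20-arm.S") in a directory containing
# the chacha20-arm.pl added by this commit.
all: chacha20-arm.S

[End of editor's note; the patch continues with the remaining Kbuild.include context and the deleted chacha20-arm.S below.]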
ccflags-$(CONFIG_ZINC_ARCH_X86_64) += -DCONFIG_ZINC_ARCH_X86_64 diff --git a/src/crypto/zinc/chacha20/chacha20-arm.S b/src/crypto/zinc/chacha20/chacha20-arm.S deleted file mode 100644 index 79ed18f..0000000 --- a/src/crypto/zinc/chacha20/chacha20-arm.S +++ /dev/null @@ -1,1860 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2018 Google, Inc. - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - */ - -#include -#include - -/* - * The following scalar routine was written by Eric Biggers. - * - * Design notes: - * - * 16 registers would be needed to hold the state matrix, but only 14 are - * available because 'sp' and 'pc' cannot be used. So we spill the elements - * (x8, x9) to the stack and swap them out with (x10, x11). This adds one - * 'ldrd' and one 'strd' instruction per round. - * - * All rotates are performed using the implicit rotate operand accepted by the - * 'add' and 'eor' instructions. This is faster than using explicit rotate - * instructions. To make this work, we allow the values in the second and last - * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the - * wrong rotation amount. The rotation amount is then fixed up just in time - * when the values are used. 'brot' is the number of bits the values in row 'b' - * need to be rotated right to arrive at the correct values, and 'drot' - * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such - * that they end up as (25, 24) after every round. - */ - - // ChaCha state registers - X0 .req r0 - X1 .req r1 - X2 .req r2 - X3 .req r3 - X4 .req r4 - X5 .req r5 - X6 .req r6 - X7 .req r7 - X8_X10 .req r8 // shared by x8 and x10 - X9_X11 .req r9 // shared by x9 and x11 - X12 .req r10 - X13 .req r11 - X14 .req r12 - X15 .req r14 - -.Lexpand_32byte_k: - // "expand 32-byte k" - .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 - -#ifdef __thumb2__ -# define adrl adr -#endif - -.macro __rev out, in, t0, t1, t2 -.if __LINUX_ARM_ARCH__ >= 6 - rev \out, \in -.else - lsl \t0, \in, #24 - and \t1, \in, #0xff00 - and \t2, \in, #0xff0000 - orr \out, \t0, \in, lsr #24 - orr \out, \out, \t1, lsl #8 - orr \out, \out, \t2, lsr #8 -.endif -.endm - -.macro _le32_bswap x, t0, t1, t2 -#ifdef __ARMEB__ - __rev \x, \x, \t0, \t1, \t2 -#endif -.endm - -.macro _le32_bswap_4x a, b, c, d, t0, t1, t2 - _le32_bswap \a, \t0, \t1, \t2 - _le32_bswap \b, \t0, \t1, \t2 - _le32_bswap \c, \t0, \t1, \t2 - _le32_bswap \d, \t0, \t1, \t2 -.endm - -.macro __ldrd a, b, src, offset -#if __LINUX_ARM_ARCH__ >= 6 - ldrd \a, \b, [\src, #\offset] -#else - ldr \a, [\src, #\offset] - ldr \b, [\src, #\offset + 4] -#endif -.endm - -.macro __strd a, b, dst, offset -#if __LINUX_ARM_ARCH__ >= 6 - strd \a, \b, [\dst, #\offset] -#else - str \a, [\dst, #\offset] - str \b, [\dst, #\offset + 4] -#endif -.endm - -.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 - - // a += b; d ^= a; d = rol(d, 16); - add \a1, \a1, \b1, ror #brot - add \a2, \a2, \b2, ror #brot - eor \d1, \a1, \d1, ror #drot - eor \d2, \a2, \d2, ror #drot - // drot == 32 - 16 == 16 - - // c += d; b ^= c; b = rol(b, 12); - add \c1, \c1, \d1, ror #16 - add \c2, \c2, \d2, ror #16 - eor \b1, \c1, \b1, ror #brot - eor \b2, \c2, \b2, ror #brot - // brot == 32 - 12 == 20 - - // a += b; d ^= a; d = rol(d, 8); - add \a1, \a1, \b1, ror #20 - add \a2, \a2, \b2, ror #20 - eor \d1, \a1, \d1, ror #16 - eor \d2, \a2, \d2, ror #16 - // drot == 32 - 8 == 24 - - // c += d; b ^= c; 
b = rol(b, 7); - add \c1, \c1, \d1, ror #24 - add \c2, \c2, \d2, ror #24 - eor \b1, \c1, \b1, ror #20 - eor \b2, \c2, \b2, ror #20 - // brot == 32 - 7 == 25 -.endm - -.macro _doubleround - - // column round - - // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) - _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 - - // save (x8, x9); restore (x10, x11) - __strd X8_X10, X9_X11, sp, 0 - __ldrd X8_X10, X9_X11, sp, 8 - - // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) - _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 - - .set brot, 25 - .set drot, 24 - - // diagonal round - - // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) - _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 - - // save (x10, x11); restore (x8, x9) - __strd X8_X10, X9_X11, sp, 8 - __ldrd X8_X10, X9_X11, sp, 0 - - // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) - _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 -.endm - -.macro _chacha_permute nrounds - .set brot, 0 - .set drot, 0 - .rept \nrounds / 2 - _doubleround - .endr -.endm - -.macro _chacha nrounds - -.Lnext_block\@: - // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN - // Registers contain x0-x9,x12-x15. - - // Do the core ChaCha permutation to update x0-x15. - _chacha_permute \nrounds - - add sp, #8 - // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers contain x0-x9,x12-x15. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. - - // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). - push {X8_X10, X9_X11, X12, X13, X14, X15} - - // Load (OUT, IN, LEN). - ldr r14, [sp, #96] - ldr r12, [sp, #100] - ldr r11, [sp, #104] - - orr r10, r14, r12 - - // Use slow path if fewer than 64 bytes remain. - cmp r11, #64 - blt .Lxor_slowpath\@ - - // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on - // ARMv6+, since ldmia and stmia (used below) still require alignment. - tst r10, #3 - bne .Lxor_slowpath\@ - - // Fast path: XOR 64 bytes of aligned data. - - // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. 
- - // x0-x3 - __ldrd r8, r9, sp, 32 - __ldrd r10, r11, sp, 40 - add X0, X0, r8 - add X1, X1, r9 - add X2, X2, r10 - add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 - ldmia r12!, {r8-r11} - eor X0, X0, r8 - eor X1, X1, r9 - eor X2, X2, r10 - eor X3, X3, r11 - stmia r14!, {X0-X3} - - // x4-x7 - __ldrd r8, r9, sp, 48 - __ldrd r10, r11, sp, 56 - add X4, r8, X4, ror #brot - add X5, r9, X5, ror #brot - ldmia r12!, {X0-X3} - add X6, r10, X6, ror #brot - add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 - eor X4, X4, X0 - eor X5, X5, X1 - eor X6, X6, X2 - eor X7, X7, X3 - stmia r14!, {X4-X7} - - // x8-x15 - pop {r0-r7} // (x8-x9,x12-x15,x10-x11) - __ldrd r8, r9, sp, 32 - __ldrd r10, r11, sp, 40 - add r0, r0, r8 // x8 - add r1, r1, r9 // x9 - add r6, r6, r10 // x10 - add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 - ldmia r12!, {r8-r11} - eor r0, r0, r8 // x8 - eor r1, r1, r9 // x9 - eor r6, r6, r10 // x10 - eor r7, r7, r11 // x11 - stmia r14!, {r0,r1,r6,r7} - ldmia r12!, {r0,r1,r6,r7} - __ldrd r8, r9, sp, 48 - __ldrd r10, r11, sp, 56 - add r2, r8, r2, ror #drot // x12 - add r3, r9, r3, ror #drot // x13 - add r4, r10, r4, ror #drot // x14 - add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 - ldr r9, [sp, #72] // load LEN - eor r2, r2, r0 // x12 - eor r3, r3, r1 // x13 - eor r4, r4, r6 // x14 - eor r5, r5, r7 // x15 - subs r9, #64 // decrement and check LEN - stmia r14!, {r2-r5} - - beq .Ldone\@ - -.Lprepare_for_next_block\@: - - // Stack: x0-x15 OUT IN LEN - - // Increment block counter (x12) - add r8, #1 - - // Store updated (OUT, IN, LEN) - str r14, [sp, #64] - str r12, [sp, #68] - str r9, [sp, #72] - - mov r14, sp - - // Store updated block counter (x12) - str r8, [sp, #48] - - sub sp, #16 - - // Reload state and do next block - ldmia r14!, {r0-r11} // load x0-x11 - __strd r10, r11, sp, 8 // store x10-x11 before state - ldmia r14, {r10-r12,r14} // load x12-x15 - b .Lnext_block\@ - -.Lxor_slowpath\@: - // Slow path: < 64 bytes remaining, or unaligned input or output buffer. - // We handle it by storing the 64 bytes of keystream to the stack, then - // XOR-ing the needed portion with the data. - - // Allocate keystream buffer - sub sp, #64 - mov r14, sp - - // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. 
- - // Save keystream for x0-x3 - __ldrd r8, r9, sp, 96 - __ldrd r10, r11, sp, 104 - add X0, X0, r8 - add X1, X1, r9 - add X2, X2, r10 - add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 - stmia r14!, {X0-X3} - - // Save keystream for x4-x7 - __ldrd r8, r9, sp, 112 - __ldrd r10, r11, sp, 120 - add X4, r8, X4, ror #brot - add X5, r9, X5, ror #brot - add X6, r10, X6, ror #brot - add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 - add r8, sp, #64 - stmia r14!, {X4-X7} - - // Save keystream for x8-x15 - ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) - __ldrd r8, r9, sp, 128 - __ldrd r10, r11, sp, 136 - add r0, r0, r8 // x8 - add r1, r1, r9 // x9 - add r6, r6, r10 // x10 - add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 - stmia r14!, {r0,r1,r6,r7} - __ldrd r8, r9, sp, 144 - __ldrd r10, r11, sp, 152 - add r2, r8, r2, ror #drot // x12 - add r3, r9, r3, ror #drot // x13 - add r4, r10, r4, ror #drot // x14 - add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 - stmia r14, {r2-r5} - - // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN - // Registers: r8 is block counter, r12 is IN. - - ldr r9, [sp, #168] // LEN - ldr r14, [sp, #160] // OUT - cmp r9, #64 - mov r0, sp - movle r1, r9 - movgt r1, #64 - // r1 is number of bytes to XOR, in range [1, 64] - -.if __LINUX_ARM_ARCH__ < 6 - orr r2, r12, r14 - tst r2, #3 // IN or OUT misaligned? - bne .Lxor_next_byte\@ -.endif - - // XOR a word at a time -.rept 16 - subs r1, #4 - blt .Lxor_words_done\@ - ldr r2, [r12], #4 - ldr r3, [r0], #4 - eor r2, r2, r3 - str r2, [r14], #4 -.endr - b .Lxor_slowpath_done\@ -.Lxor_words_done\@: - ands r1, r1, #3 - beq .Lxor_slowpath_done\@ - - // XOR a byte at a time -.Lxor_next_byte\@: - ldrb r2, [r12], #1 - ldrb r3, [r0], #1 - eor r2, r2, r3 - strb r2, [r14], #1 - subs r1, #1 - bne .Lxor_next_byte\@ - -.Lxor_slowpath_done\@: - subs r9, #64 - add sp, #96 - bgt .Lprepare_for_next_block\@ - -.Ldone\@: -.endm // _chacha - -/* - * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8], - * const u32 iv[4]); - */ -ENTRY(chacha20_arm) - cmp r2, #0 // len == 0? - reteq lr - - push {r0-r2,r4-r11,lr} - - // Push state x0-x15 onto stack. - // Also store an extra copy of x10-x11 just before the state. - - ldr r4, [sp, #48] // iv - mov r0, sp - sub sp, #80 - - // iv: x12-x15 - ldm r4, {X12,X13,X14,X15} - stmdb r0!, {X12,X13,X14,X15} - - // key: x4-x11 - __ldrd X8_X10, X9_X11, r3, 24 - __strd X8_X10, X9_X11, sp, 8 - stmdb r0!, {X8_X10, X9_X11} - ldm r3, {X4-X9_X11} - stmdb r0!, {X4-X9_X11} - - // constants: x0-x3 - adrl X3, .Lexpand_32byte_k - ldm X3, {X0-X3} - __strd X0, X1, sp, 16 - __strd X2, X3, sp, 24 - - _chacha 20 - - add sp, #76 - pop {r4-r11, pc} -ENDPROC(chacha20_arm) - -/* - * void hchacha20_arm(const u32 state[16], u32 out[8]); - */ -ENTRY(hchacha20_arm) - push {r1,r4-r11,lr} - - mov r14, r0 - ldmia r14!, {r0-r11} // load x0-x11 - push {r10-r11} // store x10-x11 to stack - ldm r14, {r10-r12,r14} // load x12-x15 - sub sp, #8 - - _chacha_permute 20 - - // Skip over (unused0-unused1, x10-x11) - add sp, #16 - - // Fix up rotations of x12-x15 - ror X12, X12, #drot - ror X13, X13, #drot - pop {r4} // load 'out' - ror X14, X14, #drot - ror X15, X15, #drot - - // Store (x0-x3,x12-x15) to 'out' - stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} - - pop {r4-r11,pc} -ENDPROC(hchacha20_arm) - -#ifdef CONFIG_KERNEL_MODE_NEON -/* - * This following NEON routine was ported from Andy Polyakov's implementation - * from CRYPTOGAMS. 
It begins with parts of the CRYPTOGAMS scalar routine, - * since certain NEON code paths actually branch to it. - */ - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -#if defined(__thumb2__) || defined(__clang__) -#define ldrhsb ldrbhs -#endif - -.align 4 -.Loop_outer: - ldmia sp,{r0-r9} @ load key material - str r11,[sp,#4*(32+2)] @ save len - str r12, [sp,#4*(32+1)] @ save inp - str r14, [sp,#4*(32+0)] @ save out -.Loop_outer_enter: - ldr r11, [sp,#4*(15)] - mov r4,r4,ror#19 @ twist b[0..3] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - mov r5,r5,ror#19 - ldr r10, [sp,#4*(13)] - mov r6,r6,ror#19 - ldr r14,[sp,#4*(14)] - mov r7,r7,ror#19 - mov r11,r11,ror#8 @ twist d[0..3] - mov r12,r12,ror#8 - mov r10,r10,ror#8 - mov r14,r14,ror#8 - str r11, [sp,#4*(16+15)] - mov r11,#10 - b .Loop - -.align 4 -.Loop: - subs r11,r11,#1 - add r0,r0,r4,ror#13 - add r1,r1,r5,ror#13 - eor r12,r0,r12,ror#24 - eor r10,r1,r10,ror#24 - add r8,r8,r12,ror#16 - add r9,r9,r10,ror#16 - eor r4,r8,r4,ror#13 - eor r5,r9,r5,ror#13 - add r0,r0,r4,ror#20 - add r1,r1,r5,ror#20 - eor r12,r0,r12,ror#16 - eor r10,r1,r10,ror#16 - add r8,r8,r12,ror#24 - str r10,[sp,#4*(16+13)] - add r9,r9,r10,ror#24 - ldr r10,[sp,#4*(16+15)] - str r8,[sp,#4*(16+8)] - eor r4,r4,r8,ror#12 - str r9,[sp,#4*(16+9)] - eor r5,r5,r9,ror#12 - ldr r8,[sp,#4*(16+10)] - add r2,r2,r6,ror#13 - ldr r9,[sp,#4*(16+11)] - add r3,r3,r7,ror#13 - eor r14,r2,r14,ror#24 - eor r10,r3,r10,ror#24 - add r8,r8,r14,ror#16 - add r9,r9,r10,ror#16 - eor r6,r8,r6,ror#13 - eor r7,r9,r7,ror#13 - add r2,r2,r6,ror#20 - add r3,r3,r7,ror#20 - eor r14,r2,r14,ror#16 - eor r10,r3,r10,ror#16 - add r8,r8,r14,ror#24 - add r9,r9,r10,ror#24 - eor r6,r6,r8,ror#12 - eor r7,r7,r9,ror#12 - add r0,r0,r5,ror#13 - add r1,r1,r6,ror#13 - eor r10,r0,r10,ror#24 - eor r12,r1,r12,ror#24 - add r8,r8,r10,ror#16 - add r9,r9,r12,ror#16 - eor r5,r8,r5,ror#13 - eor r6,r9,r6,ror#13 - add r0,r0,r5,ror#20 - add r1,r1,r6,ror#20 - eor r10,r0,r10,ror#16 - eor r12,r1,r12,ror#16 - str r10,[sp,#4*(16+15)] - add r8,r8,r10,ror#24 - ldr r10,[sp,#4*(16+13)] - add r9,r9,r12,ror#24 - str r8,[sp,#4*(16+10)] - eor r5,r5,r8,ror#12 - str r9,[sp,#4*(16+11)] - eor r6,r6,r9,ror#12 - ldr r8,[sp,#4*(16+8)] - add r2,r2,r7,ror#13 - ldr r9,[sp,#4*(16+9)] - add r3,r3,r4,ror#13 - eor r10,r2,r10,ror#24 - eor r14,r3,r14,ror#24 - add r8,r8,r10,ror#16 - add r9,r9,r14,ror#16 - eor r7,r8,r7,ror#13 - eor r4,r9,r4,ror#13 - add r2,r2,r7,ror#20 - add r3,r3,r4,ror#20 - eor r10,r2,r10,ror#16 - eor r14,r3,r14,ror#16 - add r8,r8,r10,ror#24 - add r9,r9,r14,ror#24 - eor r7,r7,r8,ror#12 - eor r4,r4,r9,ror#12 - bne .Loop - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - cmp r11,#64 @ done yet? -#ifdef __thumb2__ - itete lo -#endif - addlo r12,sp,#4*(0) @ shortcut or ... - ldrhs r12,[sp,#4*(32+1)] @ ... load inp - addlo r14,sp,#4*(0) @ shortcut or ... - ldrhs r14,[sp,#4*(32+0)] @ ... load out - - ldr r8,[sp,#4*(0)] @ load key material - ldr r9,[sp,#4*(1)] - -#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__) -#if __LINUX_ARM_ARCH__ < 7 - orr r10,r12,r14 - tst r10,#3 @ are input and output aligned? 
- ldr r10,[sp,#4*(2)] - bne .Lunaligned - cmp r11,#64 @ restore flags -#else - ldr r10,[sp,#4*(2)] -#endif - ldr r11,[sp,#4*(3)] - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - - add r2,r2,r10 - add r3,r3,r11 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif -#ifdef __thumb2__ - itt hs -#endif - eorhs r0,r0,r8 @ xor with input - eorhs r1,r1,r9 - add r8,sp,#4*(4) - str r0,[r14],#16 @ store output -#ifdef __thumb2__ - itt hs -#endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - ldmia r8,{r8-r11} @ load key material - str r1,[r14,#-12] - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r8,r4,ror#13 @ accumulate key material - add r5,r9,r5,ror#13 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r10,r6,ror#13 - add r7,r11,r7,ror#13 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -#endif -#ifdef __thumb2__ - itt hs -#endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 - add r8,sp,#4*(8) - str r4,[r14],#16 @ store output -#ifdef __thumb2__ - itt hs -#endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r5,[r14,#-12] - ldmia r8,{r8-r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0-r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] -#ifdef __thumb2__ - itt hi -#endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r2,r2,r10 - add r3,r3,r11 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif -#ifdef __thumb2__ - itt hs -#endif - eorhs r0,r0,r8 - eorhs r1,r1,r9 - add r8,sp,#4*(12) - str r0,[r14],#16 @ store output -#ifdef __thumb2__ - itt hs -#endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - str r1,[r14,#-12] - ldmia r8,{r8-r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r8,r4,ror#24 @ accumulate key material - add r5,r9,r5,ror#24 -#ifdef __thumb2__ - itt hi -#endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value -#ifdef __thumb2__ - itt hs -#endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r10,r6,ror#24 - add r7,r11,r7,ror#24 -#ifdef __thumb2__ - itt hs -#endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -#endif -#ifdef __thumb2__ - itt hs -#endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 -#ifdef __thumb2__ - it ne -#endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -#ifdef __thumb2__ - itt hs -#endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r4,[r14],#16 @ store output - str r5,[r14,#-12] -#ifdef __thumb2__ - it hs -#endif - subhs r11,r8,#64 @ len-=64 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi .Loop_outer - - beq .Ldone -#if __LINUX_ARM_ARCH__ < 7 - b .Ltail - -.align 4 -.Lunaligned: @ unaligned endian-neutral path - cmp r11,#64 @ restore flags -#endif -#endif -#if __LINUX_ARM_ARCH__ < 7 - ldr r11,[sp,#4*(3)] - add r0,r8,r0 @ accumulate key material - add r1,r9,r1 - add 
r2,r10,r2 -#ifdef __thumb2__ - itete lo -#endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r11,r3 -#ifdef __thumb2__ - itete lo -#endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+0) - ldmia r8,{r8-r11} @ load key material - add r0,sp,#4*(16+8) - add r4,r8,r4,ror#13 @ accumulate key material - add r5,r9,r5,ror#13 - add r6,r10,r6,ror#13 -#ifdef __thumb2__ - itete lo -#endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r11,r7,ror#13 -#ifdef __thumb2__ - itete lo -#endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] - add r8,sp,#4*(4+4) - ldmia r8,{r8-r11} @ load key material - ldmia r0,{r0-r7} @ load second half -#ifdef __thumb2__ - itt hi -#endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" - strhi r11,[sp,#4*(16+11)] @ copy "rx" - add r0,r8,r0 @ accumulate key material - add r1,r9,r1 - add r2,r10,r2 -#ifdef __thumb2__ - itete lo -#endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r11,r3 -#ifdef __thumb2__ - itete lo -#endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+8) - ldmia r8,{r8-r11} @ load key material - add r4,r8,r4,ror#24 @ accumulate key material -#ifdef __thumb2__ - itt hi -#endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value - add r5,r9,r5,ror#24 - add r6,r10,r6,ror#24 -#ifdef __thumb2__ - itete lo -#endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r11,r7,ror#24 -#ifdef __thumb2__ - itete lo -#endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -#ifdef __thumb2__ - itt hs -#endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] -#ifdef __thumb2__ - it ne -#endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -#ifdef __thumb2__ - it hs -#endif - subhs r11,r8,#64 @ len-=64 - bhi .Loop_outer - - beq .Ldone -#endif - -.Ltail: - ldr r12,[sp,#4*(32+1)] @ load inp - add r9,sp,#4*(0) - ldr r14,[sp,#4*(32+0)] @ load out - -.Loop_tail: - ldrb r10,[r9],#1 @ read buffer on stack - ldrb r11,[r12],#1 @ read input - subs r8,r8,#1 - eor r11,r11,r10 - strb r11,[r14],#1 @ store output - bne .Loop_tail - -.Ldone: - add sp,sp,#4*(32+3) - ldmia sp!,{r4-r11,pc} - -.align 5 -.Lsigma2: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral -.Lone2: -.long 1,0,0,0 -.word -1 - -.arch armv7-a -.fpu neon - -.align 5 -ENTRY(chacha20_neon) - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0-r2,r4-r11,lr} - cmp r2,#0 @ len==0? -#ifdef __thumb2__ - itt eq -#endif - addeq sp,sp,#4*3 - beq .Lno_data_neon -.Lchacha20_neon_begin: - adr r14,.Lsigma2 - vstmdb sp!,{d8-d15} @ ABI spec says so - stmdb sp!,{r0-r3} - - vld1.32 {q1-q2},[r3] @ load key - ldmia r3,{r4-r11} @ load key - - sub sp,sp,#4*(16+16) - vld1.32 {q3},[r12] @ load counter and nonce - add r12,sp,#4*8 - ldmia r14,{r0-r3} @ load sigma - vld1.32 {q0},[r14]! @ load sigma - vld1.32 {q12},[r14]! 
@ one - @ vld1.32 {d30},[r14] @ rot8 - vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce - vst1.32 {q0-q1},[sp] @ copy sigma|1/2key - - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - vshl.i32 d26,d24,#1 @ two - vstr d24,[sp,#4*(16+0)] - vshl.i32 d28,d24,#2 @ four - vstr d26,[sp,#4*(16+2)] - vmov q4,q0 - vstr d28,[sp,#4*(16+4)] - vmov q8,q0 - @ vstr d30,[sp,#4*(16+6)] - vmov q5,q1 - vmov q9,q1 - b .Loop_neon_enter - -.align 4 -.Loop_neon_outer: - ldmia sp,{r0-r9} @ load key material - cmp r11,#64*2 @ if len<=64*2 - bls .Lbreak_neon @ switch to integer-only - @ vldr d30,[sp,#4*(16+6)] @ rot8 - vmov q4,q0 - str r11,[sp,#4*(32+2)] @ save len - vmov q8,q0 - str r12, [sp,#4*(32+1)] @ save inp - vmov q5,q1 - str r14, [sp,#4*(32+0)] @ save out - vmov q9,q1 -.Loop_neon_enter: - ldr r11, [sp,#4*(15)] - mov r4,r4,ror#19 @ twist b[0..3] - vadd.i32 q7,q3,q12 @ counter+1 - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - mov r5,r5,ror#19 - vmov q6,q2 - ldr r10, [sp,#4*(13)] - mov r6,r6,ror#19 - vmov q10,q2 - ldr r14,[sp,#4*(14)] - mov r7,r7,ror#19 - vadd.i32 q11,q7,q12 @ counter+2 - add r12,r12,#3 @ counter+3 - mov r11,r11,ror#8 @ twist d[0..3] - mov r12,r12,ror#8 - mov r10,r10,ror#8 - mov r14,r14,ror#8 - str r11, [sp,#4*(16+15)] - mov r11,#10 - b .Loop_neon - -.align 4 -.Loop_neon: - subs r11,r11,#1 - vadd.i32 q0,q0,q1 - add r0,r0,r4,ror#13 - vadd.i32 q4,q4,q5 - add r1,r1,r5,ror#13 - vadd.i32 q8,q8,q9 - eor r12,r0,r12,ror#24 - veor q3,q3,q0 - eor r10,r1,r10,ror#24 - veor q7,q7,q4 - add r8,r8,r12,ror#16 - veor q11,q11,q8 - add r9,r9,r10,ror#16 - vrev32.16 q3,q3 - eor r4,r8,r4,ror#13 - vrev32.16 q7,q7 - eor r5,r9,r5,ror#13 - vrev32.16 q11,q11 - add r0,r0,r4,ror#20 - vadd.i32 q2,q2,q3 - add r1,r1,r5,ror#20 - vadd.i32 q6,q6,q7 - eor r12,r0,r12,ror#16 - vadd.i32 q10,q10,q11 - eor r10,r1,r10,ror#16 - veor q12,q1,q2 - add r8,r8,r12,ror#24 - veor q13,q5,q6 - str r10,[sp,#4*(16+13)] - veor q14,q9,q10 - add r9,r9,r10,ror#24 - vshr.u32 q1,q12,#20 - ldr r10,[sp,#4*(16+15)] - vshr.u32 q5,q13,#20 - str r8,[sp,#4*(16+8)] - vshr.u32 q9,q14,#20 - eor r4,r4,r8,ror#12 - vsli.32 q1,q12,#12 - str r9,[sp,#4*(16+9)] - vsli.32 q5,q13,#12 - eor r5,r5,r9,ror#12 - vsli.32 q9,q14,#12 - ldr r8,[sp,#4*(16+10)] - vadd.i32 q0,q0,q1 - add r2,r2,r6,ror#13 - vadd.i32 q4,q4,q5 - ldr r9,[sp,#4*(16+11)] - vadd.i32 q8,q8,q9 - add r3,r3,r7,ror#13 - veor q12,q3,q0 - eor r14,r2,r14,ror#24 - veor q13,q7,q4 - eor r10,r3,r10,ror#24 - veor q14,q11,q8 - add r8,r8,r14,ror#16 - vshr.u32 q3,q12,#24 - add r9,r9,r10,ror#16 - vshr.u32 q7,q13,#24 - eor r6,r8,r6,ror#13 - vshr.u32 q11,q14,#24 - eor r7,r9,r7,ror#13 - vsli.32 q3,q12,#8 - add r2,r2,r6,ror#20 - vsli.32 q7,q13,#8 - add r3,r3,r7,ror#20 - vsli.32 q11,q14,#8 - eor r14,r2,r14,ror#16 - vadd.i32 q2,q2,q3 - eor r10,r3,r10,ror#16 - vadd.i32 q6,q6,q7 - add r8,r8,r14,ror#24 - vadd.i32 q10,q10,q11 - add r9,r9,r10,ror#24 - veor q12,q1,q2 - eor r6,r6,r8,ror#12 - veor q13,q5,q6 - eor r7,r7,r9,ror#12 - veor q14,q9,q10 - vshr.u32 q1,q12,#25 - vshr.u32 q5,q13,#25 - vshr.u32 q9,q14,#25 - vsli.32 q1,q12,#7 - vsli.32 q5,q13,#7 - vsli.32 q9,q14,#7 - vext.8 q2,q2,q2,#8 - vext.8 q6,q6,q6,#8 - vext.8 q10,q10,q10,#8 - vext.8 q1,q1,q1,#4 - vext.8 q5,q5,q5,#4 - vext.8 q9,q9,q9,#4 - vext.8 q3,q3,q3,#12 - vext.8 q7,q7,q7,#12 - vext.8 q11,q11,q11,#12 - vadd.i32 q0,q0,q1 - add r0,r0,r5,ror#13 - vadd.i32 q4,q4,q5 - add r1,r1,r6,ror#13 - vadd.i32 q8,q8,q9 - eor r10,r0,r10,ror#24 - veor q3,q3,q0 - eor r12,r1,r12,ror#24 - veor q7,q7,q4 - add r8,r8,r10,ror#16 - veor q11,q11,q8 - add 
r9,r9,r12,ror#16 - vrev32.16 q3,q3 - eor r5,r8,r5,ror#13 - vrev32.16 q7,q7 - eor r6,r9,r6,ror#13 - vrev32.16 q11,q11 - add r0,r0,r5,ror#20 - vadd.i32 q2,q2,q3 - add r1,r1,r6,ror#20 - vadd.i32 q6,q6,q7 - eor r10,r0,r10,ror#16 - vadd.i32 q10,q10,q11 - eor r12,r1,r12,ror#16 - veor q12,q1,q2 - str r10,[sp,#4*(16+15)] - veor q13,q5,q6 - add r8,r8,r10,ror#24 - veor q14,q9,q10 - ldr r10,[sp,#4*(16+13)] - vshr.u32 q1,q12,#20 - add r9,r9,r12,ror#24 - vshr.u32 q5,q13,#20 - str r8,[sp,#4*(16+10)] - vshr.u32 q9,q14,#20 - eor r5,r5,r8,ror#12 - vsli.32 q1,q12,#12 - str r9,[sp,#4*(16+11)] - vsli.32 q5,q13,#12 - eor r6,r6,r9,ror#12 - vsli.32 q9,q14,#12 - ldr r8,[sp,#4*(16+8)] - vadd.i32 q0,q0,q1 - add r2,r2,r7,ror#13 - vadd.i32 q4,q4,q5 - ldr r9,[sp,#4*(16+9)] - vadd.i32 q8,q8,q9 - add r3,r3,r4,ror#13 - veor q12,q3,q0 - eor r10,r2,r10,ror#24 - veor q13,q7,q4 - eor r14,r3,r14,ror#24 - veor q14,q11,q8 - add r8,r8,r10,ror#16 - vshr.u32 q3,q12,#24 - add r9,r9,r14,ror#16 - vshr.u32 q7,q13,#24 - eor r7,r8,r7,ror#13 - vshr.u32 q11,q14,#24 - eor r4,r9,r4,ror#13 - vsli.32 q3,q12,#8 - add r2,r2,r7,ror#20 - vsli.32 q7,q13,#8 - add r3,r3,r4,ror#20 - vsli.32 q11,q14,#8 - eor r10,r2,r10,ror#16 - vadd.i32 q2,q2,q3 - eor r14,r3,r14,ror#16 - vadd.i32 q6,q6,q7 - add r8,r8,r10,ror#24 - vadd.i32 q10,q10,q11 - add r9,r9,r14,ror#24 - veor q12,q1,q2 - eor r7,r7,r8,ror#12 - veor q13,q5,q6 - eor r4,r4,r9,ror#12 - veor q14,q9,q10 - vshr.u32 q1,q12,#25 - vshr.u32 q5,q13,#25 - vshr.u32 q9,q14,#25 - vsli.32 q1,q12,#7 - vsli.32 q5,q13,#7 - vsli.32 q9,q14,#7 - vext.8 q2,q2,q2,#8 - vext.8 q6,q6,q6,#8 - vext.8 q10,q10,q10,#8 - vext.8 q1,q1,q1,#12 - vext.8 q5,q5,q5,#12 - vext.8 q9,q9,q9,#12 - vext.8 q3,q3,q3,#4 - vext.8 q7,q7,q7,#4 - vext.8 q11,q11,q11,#4 - bne .Loop_neon - - add r11,sp,#32 - vld1.32 {q12-q13},[sp] @ load key material - vld1.32 {q14-q15},[r11] - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - ldr r12,[sp,#4*(32+1)] @ load inp - ldr r14,[sp,#4*(32+0)] @ load out - - vadd.i32 q0,q0,q12 @ accumulate key material - vadd.i32 q4,q4,q12 - vadd.i32 q8,q8,q12 - vldr d24,[sp,#4*(16+0)] @ one - - vadd.i32 q1,q1,q13 - vadd.i32 q5,q5,q13 - vadd.i32 q9,q9,q13 - vldr d26,[sp,#4*(16+2)] @ two - - vadd.i32 q2,q2,q14 - vadd.i32 q6,q6,q14 - vadd.i32 q10,q10,q14 - vadd.i32 d14,d14,d24 @ counter+1 - vadd.i32 d22,d22,d26 @ counter+2 - - vadd.i32 q3,q3,q15 - vadd.i32 q7,q7,q15 - vadd.i32 q11,q11,q15 - - cmp r11,#64*4 - blo .Ltail_neon - - vld1.8 {q12-q13},[r12]! @ load input - mov r11,sp - vld1.8 {q14-q15},[r12]! - veor q0,q0,q12 @ xor with input - veor q1,q1,q13 - vld1.8 {q12-q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14-q15},[r12]! - - veor q4,q4,q12 - vst1.8 {q0-q1},[r14]! @ store output - veor q5,q5,q13 - vld1.8 {q12-q13},[r12]! - veor q6,q6,q14 - vst1.8 {q2-q3},[r14]! - veor q7,q7,q15 - vld1.8 {q14-q15},[r12]! - - veor q8,q8,q12 - vld1.32 {q0-q1},[r11]! @ load for next iteration - veor d25,d25,d25 - vldr d24,[sp,#4*(16+4)] @ four - veor q9,q9,q13 - vld1.32 {q2-q3},[r11] - veor q10,q10,q14 - vst1.8 {q4-q5},[r14]! - veor q11,q11,q15 - vst1.8 {q6-q7},[r14]! - - vadd.i32 d6,d6,d24 @ next counter value - vldr d24,[sp,#4*(16+0)] @ one - - ldmia sp,{r8-r11} @ load key material - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - vst1.8 {q8-q9},[r14]! 
- add r1,r1,r9 - ldr r9,[r12,#-12] - vst1.8 {q10-q11},[r14]! - add r2,r2,r10 - ldr r10,[r12,#-8] - add r3,r3,r11 - ldr r11,[r12,#-4] -#ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - eor r0,r0,r8 @ xor with input - add r8,sp,#4*(4) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8-r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r8,r4,ror#13 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r5,r9,r5,ror#13 - ldr r9,[r12,#-12] - add r6,r10,r6,ror#13 - ldr r10,[r12,#-8] - add r7,r11,r7,ror#13 - ldr r11,[r12,#-4] -#ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -#endif - eor r4,r4,r8 - add r8,sp,#4*(8) - eor r5,r5,r9 - str r4,[r14],#16 @ store output - eor r6,r6,r10 - str r5,[r14,#-12] - eor r7,r7,r11 - ldmia r8,{r8-r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0-r7} @ load second half - - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r1,r1,r9 - ldr r9,[r12,#-12] -#ifdef __thumb2__ - it hi -#endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - add r2,r2,r10 - ldr r10,[r12,#-8] -#ifdef __thumb2__ - it hi -#endif - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r3,r3,r11 - ldr r11,[r12,#-4] -#ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - eor r0,r0,r8 - add r8,sp,#4*(12) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8-r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r8,r4,ror#24 @ accumulate key material - add r8,r8,#4 @ next counter value - add r5,r9,r5,ror#24 - str r8,[sp,#4*(12)] @ save next counter value - ldr r8,[r12],#16 @ load input - add r6,r10,r6,ror#24 - add r4,r4,#3 @ counter+3 - ldr r9,[r12,#-12] - add r7,r11,r7,ror#24 - ldr r10,[r12,#-8] - ldr r11,[r12,#-4] -#ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -#endif - eor r4,r4,r8 -#ifdef __thumb2__ - it hi -#endif - ldrhi r8,[sp,#4*(32+2)] @ re-load len - eor r5,r5,r9 - eor r6,r6,r10 - str r4,[r14],#16 @ store output - eor r7,r7,r11 - str r5,[r14,#-12] - sub r11,r8,#64*4 @ len-=64*4 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi .Loop_neon_outer - - b .Ldone_neon - -.align 4 -.Lbreak_neon: - @ harmonize NEON and integer-only stack frames: load data - @ from NEON frame, but save to integer-only one; distance - @ between the two is 4*(32+4+16-32)=4*(20). - - str r11, [sp,#4*(20+32+2)] @ save len - add r11,sp,#4*(32+4) - str r12, [sp,#4*(20+32+1)] @ save inp - str r14, [sp,#4*(20+32+0)] @ save out - - ldr r12,[sp,#4*(16+10)] - ldr r14,[sp,#4*(16+11)] - vldmia r11,{d8-d15} @ fulfill ABI requirement - str r12,[sp,#4*(20+16+10)] @ copy "rx" - str r14,[sp,#4*(20+16+11)] @ copy "rx" - - ldr r11, [sp,#4*(15)] - mov r4,r4,ror#19 @ twist b[0..3] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - mov r5,r5,ror#19 - ldr r10, [sp,#4*(13)] - mov r6,r6,ror#19 - ldr r14,[sp,#4*(14)] - mov r7,r7,ror#19 - mov r11,r11,ror#8 @ twist d[0..3] - mov r12,r12,ror#8 - mov r10,r10,ror#8 - mov r14,r14,ror#8 - str r11, [sp,#4*(20+16+15)] - add r11,sp,#4*(20) - vst1.32 {q0-q1},[r11]! 
@ copy key - add sp,sp,#4*(20) @ switch frame - vst1.32 {q2-q3},[r11] - mov r11,#10 - b .Loop @ go integer-only - -.align 4 -.Ltail_neon: - cmp r11,#64*3 - bhs .L192_or_more_neon - cmp r11,#64*2 - bhs .L128_or_more_neon - cmp r11,#64*1 - bhs .L64_or_more_neon - - add r8,sp,#4*(8) - vst1.8 {q0-q1},[sp] - add r10,sp,#4*(0) - vst1.8 {q2-q3},[r8] - b .Loop_tail_neon - -.align 4 -.L64_or_more_neon: - vld1.8 {q12-q13},[r12]! - vld1.8 {q14-q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - veor q2,q2,q14 - veor q3,q3,q15 - vst1.8 {q0-q1},[r14]! - vst1.8 {q2-q3},[r14]! - - beq .Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q4-q5},[sp] - add r10,sp,#4*(0) - vst1.8 {q6-q7},[r8] - sub r11,r11,#64*1 @ len-=64*1 - b .Loop_tail_neon - -.align 4 -.L128_or_more_neon: - vld1.8 {q12-q13},[r12]! - vld1.8 {q14-q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12-q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14-q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vst1.8 {q0-q1},[r14]! - veor q6,q6,q14 - vst1.8 {q2-q3},[r14]! - veor q7,q7,q15 - vst1.8 {q4-q5},[r14]! - vst1.8 {q6-q7},[r14]! - - beq .Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q8-q9},[sp] - add r10,sp,#4*(0) - vst1.8 {q10-q11},[r8] - sub r11,r11,#64*2 @ len-=64*2 - b .Loop_tail_neon - -.align 4 -.L192_or_more_neon: - vld1.8 {q12-q13},[r12]! - vld1.8 {q14-q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12-q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14-q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vld1.8 {q12-q13},[r12]! - veor q6,q6,q14 - vst1.8 {q0-q1},[r14]! - veor q7,q7,q15 - vld1.8 {q14-q15},[r12]! - - veor q8,q8,q12 - vst1.8 {q2-q3},[r14]! - veor q9,q9,q13 - vst1.8 {q4-q5},[r14]! - veor q10,q10,q14 - vst1.8 {q6-q7},[r14]! - veor q11,q11,q15 - vst1.8 {q8-q9},[r14]! - vst1.8 {q10-q11},[r14]! 
- - beq .Ldone_neon - - ldmia sp,{r8-r11} @ load key material - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(4) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8-r11} @ load key material - - add r4,r8,r4,ror#13 @ accumulate key material - add r8,sp,#4*(8) - add r5,r9,r5,ror#13 - add r6,r10,r6,ror#13 - add r7,r11,r7,ror#13 - ldmia r8,{r8-r11} @ load key material -#ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -#endif - stmia sp,{r0-r7} - add r0,sp,#4*(16+8) - - ldmia r0,{r0-r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(12) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8-r11} @ load key material - - add r4,r8,r4,ror#24 @ accumulate key material - add r8,sp,#4*(8) - add r5,r9,r5,ror#24 - add r4,r4,#3 @ counter+3 - add r6,r10,r6,ror#24 - add r7,r11,r7,ror#24 - ldr r11,[sp,#4*(32+2)] @ re-load len -#ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -#endif - stmia r8,{r0-r7} - add r10,sp,#4*(0) - sub r11,r11,#64*3 @ len-=64*3 - -.Loop_tail_neon: - ldrb r8,[r10],#1 @ read buffer on stack - ldrb r9,[r12],#1 @ read input - subs r11,r11,#1 - eor r8,r8,r9 - strb r8,[r14],#1 @ store output - bne .Loop_tail_neon - -.Ldone_neon: - add sp,sp,#4*(32+4) - vldmia sp,{d8-d15} - add sp,sp,#4*(16+3) -.Lno_data_neon: - ldmia sp!,{r4-r11,pc} -ENDPROC(chacha20_neon) -#endif diff --git a/src/crypto/zinc/chacha20/chacha20-arm.pl b/src/crypto/zinc/chacha20/chacha20-arm.pl new file mode 100644 index 0000000..3621957 --- /dev/null +++ b/src/crypto/zinc/chacha20/chacha20-arm.pl @@ -0,0 +1,1227 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# December 2014 +# +# ChaCha20 for ARMv4. +# +# September 2018 +# +# Improve scalar performance per Eric Biggers' suggestion to eliminate +# separate rotates. This requires b[0..3] and d[0..3] to be maintained +# pre-rotated, hence odd twists prior inner loop and when accumulating +# key material. Since amount of instructions is reduced as result, even +# NEON performance is improved somewhat, most notably by ~9% on low-end +# Cortex-A5/A7. Full unroll was shown to provide even better scalar +# performance on Cortex-A5/A7, naturally at the cost of manyfold size +# increase. We let it be. Oversized code works in benchmarks, but is not +# necessarily optimal in real life, when it's likely to be out-of-cache +# upon entry and evict significant part of cache upon completion. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU +# +# Cortex-A5 14.2(*)/+160% 21.8 12.9(**) +# Cortex-A8 10.2(*)/+190% 13.9 6.10 +# Cortex-A9 10.8(*)/+150% 14.3 6.50 +# Cortex-A15 11.0/+40% 16.0 4.90 +# Snapdragon S4 13.9(***)/+90% 13.6 4.90 +# +# (*) most "favourable" result for aligned data on little-endian +# processor, result for misaligned data is 10-15% lower; +# (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8% +# better performance on Cortex-A5/A7, but not on others; +# (***) it's 17% slower than original, trade-off is considered +# acceptable, because of improvement on others, specifically +# +36% on Cortex-A5/A7 and +20% on Cortex-A9; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); +my @t=map("r$_",(8..11)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my $odd = $d0&1; +my ($xc,$xc_) = (@t[0..1]); +my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); +my @ret; + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' are permanently allocated in registers, @x[0..7], + # while 'c's and pair of 'd's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. If you observe 'd' column, you'll + # notice that 15 and 13 are reused in next pair of rounds. + # This is why these two are chosen for offloading to memory, + # to make loads count more. 
+ push @ret,( + "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')", + "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')", + "&eor ($xd,@x[$a0],$xd,'ror#24')", + "&eor ($xd_,@x[$a1],$xd_,'ror#24')", + + "&add ($xc,$xc,$xd,'ror#16')", + "&add ($xc_,$xc_,$xd_,'ror#16')", + "&eor (@x[$b0],$xc, @x[$b0],'ror#13')", + "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')", + + "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')", + "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')", + "&eor ($xd,@x[$a0],$xd,'ror#16')", + "&eor ($xd_,@x[$a1],$xd_,'ror#16')" ); + push @ret,( + "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd); + push @ret,( + "&add ($xc,$xc,$xd,'ror#24')" ); + push @ret,( + "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); + push @ret,( + "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd); + push @ret,( + "&add ($xc_,$xc_,$xd_,'ror#24')" ); + push @ret,( + "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); + push @ret,( + "&str ($xc,'[sp,#4*(16+$c0)]')", + "&eor (@x[$b0],@x[$b0],$xc,'ror#12')", + "&str ($xc_,'[sp,#4*(16+$c1)]')", + "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" ); + + $xd=@x[$d2] if (!$odd); + $xd_=@x[$d3] if ($odd); + push @ret,( + "&ldr ($xc,'[sp,#4*(16+$c2)]')", + "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')", + "&ldr ($xc_,'[sp,#4*(16+$c3)]')", + "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')", + "&eor ($xd,@x[$a2],$xd,'ror#24')", + "&eor ($xd_,@x[$a3],$xd_,'ror#24')", + + "&add ($xc,$xc,$xd,'ror#16')", + "&add ($xc_,$xc_,$xd_,'ror#16')", + "&eor (@x[$b2],$xc, @x[$b2],'ror#13')", + "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')", + + "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')", + "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')", + "&eor ($xd,@x[$a2],$xd,'ror#16')", + "&eor ($xd_,@x[$a3],$xd_,'ror#16')", + + "&add ($xc,$xc,$xd,'ror#24')", + "&add ($xc_,$xc_,$xd_,'ror#24')", + "&eor (@x[$b2],@x[$b2],$xc,'ror#12')", + "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" ); + + @ret; +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define ChaCha20_ctr32 chacha20_arm_cryptogams +# define ChaCha20_neon chacha20_neon +#endif + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +# define ldrhsb ldrbhs +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 +.Lrot8: +.long 0x02010003,0x06050407 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.LChaCha20_ctr32 +#else +.word -1 +#endif + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: +.LChaCha20_ctr32: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r14,pc,#16 @ ChaCha20_ctr32 +#else + adr r14,.LChaCha20_ctr32 +#endif + cmp r2,#0 @ len==0? 
+#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq .Lno_data +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + cmp r2,#192 @ test len + bls .Lshort + ldr r4,[r14,#-24] + ldr r4,[r14,r4] +# ifdef __APPLE__ + ldr r4,[r4] +# endif + tst r4,#ARMV7_NEON + bne .LChaCha20_neon +.Lshort: +#endif + ldmia r12,{r4-r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + sub r14,r14,#64 @ .Lsigma + stmdb sp!,{r4-r7} @ copy counter and nonce + ldmia r3,{r4-r11} @ load key + ldmia r14,{r0-r3} @ load sigma + stmdb sp!,{r4-r11} @ copy key + stmdb sp!,{r0-r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0-r9} @ load key material + str @t[3],[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr @t[3], [sp,#4*(15)] + mov @x[4],@x[4],ror#19 @ twist b[0..3] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + mov @x[5],@x[5],ror#19 + ldr @t[2], [sp,#4*(13)] + mov @x[6],@x[6],ror#19 + ldr @x[14],[sp,#4*(14)] + mov @x[7],@x[7],ror#19 + mov @t[3],@t[3],ror#8 @ twist d[0..3] + mov @x[12],@x[12],ror#8 + mov @t[2],@t[2],ror#8 + mov @x[14],@x[14],ror#8 + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + b .Loop + +.align 4 +.Loop: + subs @t[3],@t[3],#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + bne .Loop + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + cmp @t[3],#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr @t[0],[sp,#4*(0)] @ load key material + ldr @t[1],[sp,#4*(1)] + +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) +# if __ARM_ARCH__<7 + orr @t[2],r12,r14 + tst @t[2],#3 @ are input and output aligned? 
+ ldr @t[2],[sp,#4*(2)] + bne .Lunaligned + cmp @t[3],#64 @ restore flags +# else + ldr @t[2],[sp,#4*(2)] +# endif + ldr @t[3],[sp,#4*(3)] + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] @ xor with input + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(4) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[1],[r14,#-12] + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@t[0],@x[4],ror#13 @ accumulate key material + add @x[5],@t[1],@x[5],ror#13 +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@t[2],@x[6],ror#13 + add @x[7],@t[3],@x[7],ror#13 +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] + add @t[0],sp,#4*(8) + str @x[4],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @x[1],@x[1],@t[1] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[0],@x[0],@t[0] + eorhs @x[1],@x[1],@t[1] + add @t[0],sp,#4*(12) + str @x[0],[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[2],@x[2],@t[2] + eorhs @x[3],@x[3],@t[3] + str @x[1],[r14,#-12] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@t[0],@x[4],ror#24 @ accumulate key material + add @x[5],@t[1],@x[5],ror#24 +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[0],[r12],#16 @ load input + ldrhs @t[1],[r12,#-12] + add @x[6],@t[2],@x[6],ror#24 + add @x[7],@t[3],@x[7],ror#24 +# ifdef __thumb2__ + itt hs +# endif + ldrhs @t[2],[r12,#-8] + ldrhs @t[3],[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs @x[4],@x[4],@t[0] + eorhs @x[5],@x[5],@t[1] +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# 
ifdef __thumb2__ + itt hs +# endif + eorhs @x[6],@x[6],@t[2] + eorhs @x[7],@x[7],@t[3] + str @x[4],[r14],#16 @ store output + str @x[5],[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH__<7 + b .Ltail + +.align 4 +.Lunaligned: @ unaligned endian-neutral path + cmp @t[3],#64 @ restore flags +# endif +#endif +#if __ARM_ARCH__<7 + ldr @t[3],[sp,#4*(3)] +___ +for ($i=0;$i<16;$i+=4) { +my $j=$i&0x7; +my $twist=""; +if ($i==4) { $twist = ",ror#13"; } +elsif ($i==12) { $twist = ",ror#24"; } + +$code.=<<___ if ($i==4); + add @x[0],sp,#4*(16+8) +___ +$code.=<<___ if ($i==8); + ldmia @x[0],{@x[0]-@x[7]} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" +___ +$code.=<<___; + add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material +___ +$code.=<<___ if ($i==12); +# ifdef __thumb2__ + itt hi +# endif + addhi @t[0],@t[0],#1 @ next counter value + strhi @t[0],[sp,#4*(12)] @ save next counter value +___ +$code.=<<___; + add @x[$j+1],@t[1],@x[$j+1]$twist + add @x[$j+2],@t[2],@x[$j+2]$twist +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[0],@t[0],@t[0] @ zero or ... + ldrhsb @t[0],[r12],#16 @ ... load input + eorlo @t[1],@t[1],@t[1] + ldrhsb @t[1],[r12,#-12] + + add @x[$j+3],@t[3],@x[$j+3]$twist +# ifdef __thumb2__ + itete lo +# endif + eorlo @t[2],@t[2],@t[2] + ldrhsb @t[2],[r12,#-8] + eorlo @t[3],@t[3],@t[3] + ldrhsb @t[3],[r12,#-4] + + eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) + eor @x[$j+1],@t[1],@x[$j+1] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-15] @ load more input + ldrhsb @t[1],[r12,#-11] + eor @x[$j+2],@t[2],@x[$j+2] + strb @x[$j+0],[r14],#16 @ store output + eor @x[$j+3],@t[3],@x[$j+3] +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-7] + ldrhsb @t[3],[r12,#-3] + strb @x[$j+1],[r14,#-12] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-8] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-14] @ load more input + ldrhsb @t[1],[r12,#-10] + strb @x[$j+3],[r14,#-4] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-15] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-6] + ldrhsb @t[3],[r12,#-2] + strb @x[$j+1],[r14,#-11] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+2],[r14,#-7] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[0],[r12,#-13] @ load more input + ldrhsb @t[1],[r12,#-9] + strb @x[$j+3],[r14,#-3] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+0],[r14,#-14] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb @t[2],[r12,#-5] + ldrhsb @t[3],[r12,#-1] + strb @x[$j+1],[r14,#-10] + strb @x[$j+2],[r14,#-6] + eor @x[$j+0],@t[0],@x[$j+0],lsr#8 + strb @x[$j+3],[r14,#-2] + eor @x[$j+1],@t[1],@x[$j+1],lsr#8 + strb @x[$j+0],[r14,#-13] + eor @x[$j+2],@t[2],@x[$j+2],lsr#8 + strb @x[$j+1],[r14,#-9] + eor @x[$j+3],@t[3],@x[$j+3],lsr#8 + strb @x[$j+2],[r14,#-5] + strb @x[$j+3],[r14,#-1] +___ +$code.=<<___ if ($i<12); + add @t[0],sp,#4*(4+$i) + ldmia @t[0],{@t[0]-@t[3]} @ load key material +___ +} +$code.=<<___; +# ifdef __thumb2__ + it ne +# endif + ldrne @t[0],[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs @t[3],@t[0],#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add 
@t[1],sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb @t[2],[@t[1]],#1 @ read buffer on stack + ldrb @t[3],[r12],#1 @ read input + subs @t[0],@t[0],#1 + eor @t[3],@t[3],@t[2] + strb @t[3],[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .long 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +{{{ +my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = + map("q$_",(0..15)); + +# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on +# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%! +sub vperm() +{ my ($dst,$src,$tbl) = @_; + $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n"; + $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n"; +} + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&vadd_i32 ($a,$a,$b)", + "&veor ($d,$d,$a)", + "&vrev32_16 ($d,$d)", # vrot ($d,16) + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,20)", + "&vsli_32 ($b,$t,12)", + + "&vadd_i32 ($a,$a,$b)", + "&veor ($t,$d,$a)", + "&vshr_u32 ($d,$t,24)", + "&vsli_32 ($d,$t,8)", + #"&vperm ($d,$t,$t3)", + + "&vadd_i32 ($c,$c,$d)", + "&veor ($t,$b,$c)", + "&vshr_u32 ($b,$t,25)", + "&vsli_32 ($b,$t,7)", + + "&vext_8 ($c,$c,$c,8)", + "&vext_8 ($b,$b,$b,$odd?12:4)", + "&vext_8 ($d,$d,$d,$odd?4:12)" + ); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +# ifdef __KERNEL__ +.globl ChaCha20_neon +@ For optimal performance it's appropriate for caller to enforce +@ minimum input length, 193 bytes is suggested. +# endif +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0-r2,r4-r11,lr} +.LChaCha20_neon: + adr r14,.Lsigma + vstmdb sp!,{d8-d15} @ ABI spec says so + stmdb sp!,{r0-r3} + + vld1.32 {$b0-$c0},[r3] @ load key + ldmia r3,{r4-r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {$d0},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0-r3} @ load sigma + vld1.32 {$a0},[r14]! @ load sigma + vld1.32 {$t0},[r14]! 
@ one + @ vld1.32 {$t3#lo},[r14] @ rot8 + vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce + vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "@x[10]" + str r11,[sp,#4*(16+11)] @ off-load "@x[11]" + vshl.i32 $t1#lo,$t0#lo,#1 @ two + vstr $t0#lo,[sp,#4*(16+0)] + vshl.i32 $t2#lo,$t0#lo,#2 @ four + vstr $t1#lo,[sp,#4*(16+2)] + vmov $a1,$a0 + vstr $t2#lo,[sp,#4*(16+4)] + vmov $a2,$a0 + @ vstr $t3#lo,[sp,#4*(16+6)] + vmov $b1,$b0 + vmov $b2,$b0 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0-r9} @ load key material + cmp @t[3],#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8 + vmov $a1,$a0 + str @t[3],[sp,#4*(32+2)] @ save len + vmov $a2,$a0 + str r12, [sp,#4*(32+1)] @ save inp + vmov $b1,$b0 + str r14, [sp,#4*(32+0)] @ save out + vmov $b2,$b0 +.Loop_neon_enter: + ldr @t[3], [sp,#4*(15)] + mov @x[4],@x[4],ror#19 @ twist b[0..3] + vadd.i32 $d1,$d0,$t0 @ counter+1 + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + mov @x[5],@x[5],ror#19 + vmov $c1,$c0 + ldr @t[2], [sp,#4*(13)] + mov @x[6],@x[6],ror#19 + vmov $c2,$c0 + ldr @x[14],[sp,#4*(14)] + mov @x[7],@x[7],ror#19 + vadd.i32 $d2,$d1,$t0 @ counter+2 + add @x[12],@x[12],#3 @ counter+3 + mov @t[3],@t[3],ror#8 @ twist d[0..3] + mov @x[12],@x[12],ror#8 + mov @t[2],@t[2],ror#8 + mov @x[14],@x[14],ror#8 + str @t[3], [sp,#4*(16+15)] + mov @t[3],#10 + b .Loop_neon + +.align 4 +.Loop_neon: + subs @t[3],@t[3],#1 +___ + my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); + my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); + my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); + @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); + @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + bne .Loop_neon + + add @t[3],sp,#32 + vld1.32 {$t0-$t1},[sp] @ load key material + vld1.32 {$t2-$t3},[@t[3]] + + ldr @t[3],[sp,#4*(32+2)] @ load len + + str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store + str @t[1], [sp,#4*(16+9)] + str @x[12],[sp,#4*(16+12)] + str @t[2], [sp,#4*(16+13)] + str @x[14],[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ @x[0-7] and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 $a0,$a0,$t0 @ accumulate key material + vadd.i32 $a1,$a1,$t0 + vadd.i32 $a2,$a2,$t0 + vldr $t0#lo,[sp,#4*(16+0)] @ one + + vadd.i32 $b0,$b0,$t1 + vadd.i32 $b1,$b1,$t1 + vadd.i32 $b2,$b2,$t1 + vldr $t1#lo,[sp,#4*(16+2)] @ two + + vadd.i32 $c0,$c0,$t2 + vadd.i32 $c1,$c1,$t2 + vadd.i32 $c2,$c2,$t2 + vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 + vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 + + vadd.i32 $d0,$d0,$t3 + vadd.i32 $d1,$d1,$t3 + vadd.i32 $d2,$d2,$t3 + + cmp @t[3],#64*4 + blo .Ltail_neon + + vld1.8 {$t0-$t1},[r12]! @ load input + mov @t[3],sp + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 @ xor with input + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + vst1.8 {$a0-$b0},[r14]! @ store output + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! 
+ veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration + veor $t0#hi,$t0#hi,$t0#hi + vldr $t0#lo,[sp,#4*(16+4)] @ four + veor $b2,$b2,$t1 + vld1.32 {$c0-$d0},[@t[3]] + veor $c2,$c2,$t2 + vst1.8 {$a1-$b1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$c1-$d1},[r14]! + + vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value + vldr $t0#lo,[sp,#4*(16+0)] @ one + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + vst1.8 {$a2-$b2},[r14]! + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] + vst1.8 {$c2-$d2},[r14]! + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] @ xor with input + add @t[0],sp,#4*(4) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@t[0],@x[4],ror#13 @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[5],@t[1],@x[5],ror#13 + ldr @t[1],[r12,#-12] + add @x[6],@t[2],@x[6],ror#13 + ldr @t[2],[r12,#-8] + add @x[7],@t[3],@x[7],ror#13 + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] + add @t[0],sp,#4*(8) + eor @x[5],@x[5],@t[1] + str @x[4],[r14],#16 @ store output + eor @x[6],@x[6],@t[2] + str @x[5],[r14,#-12] + eor @x[7],@x[7],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[6],[r14,#-8] + add @x[0],sp,#4*(16+8) + str @x[7],[r14,#-4] + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + ldr @t[0],[r12],#16 @ load input + add @x[1],@x[1],@t[1] + ldr @t[1],[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it + add @x[2],@x[2],@t[2] + ldr @t[2],[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it + add @x[3],@x[3],@t[3] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] +# endif + eor @x[0],@x[0],@t[0] + add @t[0],sp,#4*(12) + eor @x[1],@x[1],@t[1] + str @x[0],[r14],#16 @ store output + eor @x[2],@x[2],@t[2] + str @x[1],[r14,#-12] + eor @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + str @x[2],[r14,#-8] + str @x[3],[r14,#-4] + + add @x[4],@t[0],@x[4],ror#24 @ accumulate key material + add @t[0],@t[0],#4 @ next counter value + add @x[5],@t[1],@x[5],ror#24 + str @t[0],[sp,#4*(12)] @ save next counter value + ldr @t[0],[r12],#16 @ load input + add @x[6],@t[2],@x[6],ror#24 + add @x[4],@x[4],#3 @ counter+3 + ldr @t[1],[r12,#-12] + add @x[7],@t[3],@x[7],ror#24 + ldr @t[2],[r12,#-8] + ldr @t[3],[r12,#-4] +# ifdef __ARMEB__ + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + eor @x[4],@x[4],@t[0] +# ifdef __thumb2__ + it hi +# endif + ldrhi @t[0],[sp,#4*(32+2)] @ re-load len + eor @x[5],@x[5],@t[1] + eor @x[6],@x[6],@t[2] + str @x[4],[r14],#16 @ store output + eor @x[7],@x[7],@t[3] + str @x[5],[r14,#-12] + sub @t[3],@t[0],#64*4 @ len-=64*4 + str @x[6],[r14,#-8] + str @x[7],[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, 
but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str @t[3], [sp,#4*(20+32+2)] @ save len + add @t[3],sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr @x[12],[sp,#4*(16+10)] + ldr @x[14],[sp,#4*(16+11)] + vldmia @t[3],{d8-d15} @ fulfill ABI requirement + str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" + str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" + + ldr @t[3], [sp,#4*(15)] + mov @x[4],@x[4],ror#19 @ twist b[0..3] + ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load + mov @x[5],@x[5],ror#19 + ldr @t[2], [sp,#4*(13)] + mov @x[6],@x[6],ror#19 + ldr @x[14],[sp,#4*(14)] + mov @x[7],@x[7],ror#19 + mov @t[3],@t[3],ror#8 @ twist d[0..3] + mov @x[12],@x[12],ror#8 + mov @t[2],@t[2],ror#8 + mov @x[14],@x[14],ror#8 + str @t[3], [sp,#4*(20+16+15)] + add @t[3],sp,#4*(20) + vst1.32 {$a0-$b0},[@t[3]]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {$c0-$d0},[@t[3]] + mov @t[3],#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp @t[3],#64*3 + bhs .L192_or_more_neon + cmp @t[3],#64*2 + bhs .L128_or_more_neon + cmp @t[3],#64*1 + bhs .L64_or_more_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a0-$b0},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c0-$d0},[@t[0]] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vst1.8 {$a0-$b0},[r14]! + vst1.8 {$c0-$d0},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a1-$b1},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c1-$d1},[@t[0]] + sub @t[3],@t[3],#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vst1.8 {$a0-$b0},[r14]! + veor $c1,$c1,$t2 + vst1.8 {$c0-$d0},[r14]! + veor $d1,$d1,$t3 + vst1.8 {$a1-$b1},[r14]! + vst1.8 {$c1-$d1},[r14]! + + beq .Ldone_neon + + add @t[0],sp,#4*(8) + vst1.8 {$a2-$b2},[sp] + add @t[2],sp,#4*(0) + vst1.8 {$c2-$d2},[@t[0]] + sub @t[3],@t[3],#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {$t0-$t1},[r12]! + vld1.8 {$t2-$t3},[r12]! + veor $a0,$a0,$t0 + veor $b0,$b0,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c0,$c0,$t2 + veor $d0,$d0,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a1,$a1,$t0 + veor $b1,$b1,$t1 + vld1.8 {$t0-$t1},[r12]! + veor $c1,$c1,$t2 + vst1.8 {$a0-$b0},[r14]! + veor $d1,$d1,$t3 + vld1.8 {$t2-$t3},[r12]! + + veor $a2,$a2,$t0 + vst1.8 {$c0-$d0},[r14]! + veor $b2,$b2,$t1 + vst1.8 {$a1-$b1},[r14]! + veor $c2,$c2,$t2 + vst1.8 {$c1-$d1},[r14]! + veor $d2,$d2,$t3 + vst1.8 {$a2-$b2},[r14]! + vst1.8 {$c2-$d2},[r14]! 
+ + beq .Ldone_neon + + ldmia sp,{@t[0]-@t[3]} @ load key material + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(4) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@t[0],@x[4],ror#13 @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@t[1],@x[5],ror#13 + add @x[6],@t[2],@x[6],ror#13 + add @x[7],@t[3],@x[7],ror#13 + ldmia @t[0],{@t[0]-@t[3]} @ load key material +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia sp,{@x[0]-@x[7]} + add @x[0],sp,#4*(16+8) + + ldmia @x[0],{@x[0]-@x[7]} @ load second half + + add @x[0],@x[0],@t[0] @ accumulate key material + add @t[0],sp,#4*(12) + add @x[1],@x[1],@t[1] + add @x[2],@x[2],@t[2] + add @x[3],@x[3],@t[3] + ldmia @t[0],{@t[0]-@t[3]} @ load key material + + add @x[4],@t[0],@x[4],ror#24 @ accumulate key material + add @t[0],sp,#4*(8) + add @x[5],@t[1],@x[5],ror#24 + add @x[4],@x[4],#3 @ counter+3 + add @x[6],@t[2],@x[6],ror#24 + add @x[7],@t[3],@x[7],ror#24 + ldr @t[3],[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev @x[0],@x[0] + rev @x[1],@x[1] + rev @x[2],@x[2] + rev @x[3],@x[3] + rev @x[4],@x[4] + rev @x[5],@x[5] + rev @x[6],@x[6] + rev @x[7],@x[7] +# endif + stmia @t[0],{@x[0]-@x[7]} + add @t[2],sp,#4*(0) + sub @t[3],@t[3],#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb @t[0],[@t[2]],#1 @ read buffer on stack + ldrb @t[1],[r12],#1 @ read input + subs @t[3],@t[3],#1 + eor @t[0],@t[0],@t[1] + strb @t[0],[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8-d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4-r11,pc} +.size ChaCha20_neon,.-ChaCha20_neon +# ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +# endif +#endif +___ +}}} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.S b/src/crypto/zinc/chacha20/chacha20-arm64.S deleted file mode 100644 index 1ae11a5..0000000 --- a/src/crypto/zinc/chacha20/chacha20-arm64.S +++ /dev/null @@ -1,1942 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#include - -.text -.align 5 -.Lsigma: -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -.Lone: -.long 1,0,0,0 - -.align 5 -ENTRY(chacha20_arm) - cbz x2,.Labort - - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ldp x28,x30,[x4] // load counter -#ifdef __AARCH64EB__ - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - -.Loop_outer: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov w7,w23 - lsr x8,x23,#32 - mov w9,w24 - lsr x10,x24,#32 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#64 -.Loop: - sub x4,x4,#1 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - ror w21,w21,#16 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#20 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - ror w21,w21,#24 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#25 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#16 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - ror w9,w9,#20 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#24 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - ror w9,w9,#25 - cbnz x4,.Loop - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - b.lo .Ltail - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp 
x17,x20,[x0,#48] - add x0,x0,#64 - - b.hi .Loop_outer - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.Labort: - ret - -.align 4 -.Ltail: - add x2,x2,#64 -.Less_than_64: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - stp x5,x7,[sp,#0] - stp x9,x11,[sp,#16] - stp x13,x15,[sp,#32] - stp x17,x20,[sp,#48] - -.Loop_tail: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret -ENDPROC(chacha20_arm) - -#ifdef CONFIG_KERNEL_MODE_NEON -.align 5 -ENTRY(chacha20_neon) - cbz x2,.Labort_neon - - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - cmp x2,#512 - b.hs .L512_or_more_neon - - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - -.Loop_outer_neon: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov v0.16b,v24.16b - mov w7,w23 - lsr x8,x23,#32 - mov v4.16b,v24.16b - mov w9,w24 - lsr x10,x24,#32 - mov v16.16b,v24.16b - mov w11,w25 - mov v1.16b,v25.16b - lsr x12,x25,#32 - mov v5.16b,v25.16b - mov w13,w26 - mov v17.16b,v25.16b - lsr x14,x26,#32 - mov v3.16b,v27.16b - mov w15,w27 - mov v7.16b,v28.16b - lsr x16,x27,#32 - mov v19.16b,v29.16b - mov w17,w28 - mov v2.16b,v26.16b - lsr x19,x28,#32 - mov v6.16b,v26.16b - mov w20,w30 - mov v18.16b,v26.16b - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#256 -.Loop_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v16.4s,v16.4s,v17.4s - add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - eor w17,w17,w5 - eor v19.16b,v19.16b,v16.16b - eor w19,w19,w6 - rev32 v3.8h,v3.8h - eor w20,w20,w7 - rev32 v7.8h,v7.8h - eor w21,w21,w8 - rev32 v19.8h,v19.8h - ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#20 - add w16,w16,w21 - ushr v5.4s,v21.4s,#20 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 - eor w10,w10,w14 - sli v1.4s,v20.4s,#12 - eor w11,w11,w15 - sli v5.4s,v21.4s,#12 - eor w12,w12,w16 - sli v17.4s,v22.4s,#12 - ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s - ror w10,w10,#20 - add 
v4.4s,v4.4s,v5.4s - ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s - ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w11 - ushr v3.4s,v20.4s,#24 - add w8,w8,w12 - ushr v7.4s,v21.4s,#24 - eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 - eor w19,w19,w6 - sli v3.4s,v20.4s,#8 - eor w20,w20,w7 - sli v7.4s,v21.4s,#8 - eor w21,w21,w8 - sli v19.4s,v22.4s,#8 - ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#25 - add w16,w16,w21 - ushr v5.4s,v21.4s,#25 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#25 - eor w10,w10,w14 - sli v1.4s,v20.4s,#7 - eor w11,w11,w15 - sli v5.4s,v21.4s,#7 - eor w12,w12,w16 - sli v17.4s,v22.4s,#7 - ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w10 - add v4.4s,v4.4s,v5.4s - add w6,w6,w11 - add v16.4s,v16.4s,v17.4s - add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w9 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b - eor w17,w17,w6 - rev32 v3.8h,v3.8h - eor w19,w19,w7 - rev32 v7.8h,v7.8h - eor w20,w20,w8 - rev32 v19.8h,v19.8h - ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#20 - add w14,w14,w20 - ushr v5.4s,v21.4s,#20 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 - eor w11,w11,w16 - sli v1.4s,v20.4s,#12 - eor w12,w12,w13 - sli v5.4s,v21.4s,#12 - eor w9,w9,w14 - sli v17.4s,v22.4s,#12 - ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s - ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s - ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s - ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w12 - ushr v3.4s,v20.4s,#24 - add w8,w8,w9 - ushr v7.4s,v21.4s,#24 - eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 - eor w17,w17,w6 - sli v3.4s,v20.4s,#8 - eor w19,w19,w7 - sli v7.4s,v21.4s,#8 - eor w20,w20,w8 - sli v19.4s,v22.4s,#8 - ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#24 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#24 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#25 - add w14,w14,w20 - ushr v5.4s,v21.4s,#25 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 - eor w11,w11,w16 - sli v1.4s,v20.4s,#7 - eor w12,w12,w13 - sli v5.4s,v21.4s,#7 - eor w9,w9,w14 - sli v17.4s,v22.4s,#7 - ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - cbnz 
x4,.Loop_neon - - add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s - add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s - add w7,w7,w23 - add v16.4s,v16.4s,v24.4s - add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s - add w9,w9,w24 - add v6.4s,v6.4s,v26.4s - add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s - add w11,w11,w25 - add v3.4s,v3.4s,v27.4s - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add v7.4s,v7.4s,v28.4s - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add v19.4s,v19.4s,v29.4s - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add v1.4s,v1.4s,v25.4s - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add v5.4s,v5.4s,v25.4s - add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s - - b.lo .Ltail_neon - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 - stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s - stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s - add x0,x0,#64 - - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - b.hi .Loop_outer_neon - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret - -.Ltail_neon: - add x2,x2,#256 - cmp x2,#64 - b.lo .Less_than_64 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_128 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_192 - - ld1 
{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon - -.Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon - -.align 4 -.Last_neon: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - -.Loop_tail_neon: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail_neon - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - -.Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - ret - -.L512_or_more_neon: - sub sp,sp,#128+64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] - - sub x2,x2,#512 // not typo - -.Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b - mov w5,w22 // unpack key block - mov v5.16b,v25.16b - lsr x6,x22,#32 - mov v9.16b,v25.16b - mov w7,w23 - mov v13.16b,v25.16b - lsr x8,x23,#32 - mov v17.16b,v25.16b - mov w9,w24 - mov v21.16b,v25.16b - lsr x10,x24,#32 - mov v3.16b,v27.16b - mov w11,w25 - mov v7.16b,v28.16b - lsr x12,x25,#32 - mov v11.16b,v29.16b - mov w13,w26 - mov v15.16b,v30.16b - lsr x14,x26,#32 - mov v2.16b,v26.16b - mov w15,w27 - mov v6.16b,v26.16b - lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 - mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 - lsr x19,x28,#32 - mov v10.16b,v26.16b - mov w20,w30 - mov v14.16b,v26.16b - lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] - - mov x4,#5 - subs x2,x2,#512 -.Loop_upper_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add 
v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext 
v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor 
v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_upper_neon - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - mov w5,w22 // unpack key block - lsr x6,x22,#32 - stp x9,x11,[x0,#16] - mov w7,w23 - lsr x8,x23,#32 - stp x13,x15,[x0,#32] - mov w9,w24 - lsr x10,x24,#32 - stp x17,x20,[x0,#48] - add x0,x0,#64 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#5 -.Loop_lower_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - 
ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext 
v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor 
w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_lower_neon - - add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] - add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] - add w7,w7,w23 - ldp q28,q29,[sp,#64] - add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s - add w9,w9,w24 - add v4.4s,v4.4s,v24.4s - add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s - add w11,w11,w25 - add v12.4s,v12.4s,v24.4s - add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s - add w13,w13,w26 - add v20.4s,v20.4s,v24.4s - add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s - add w15,w15,w27 - add v6.4s,v6.4s,v26.4s - add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s - add w17,w17,w28 - add v14.4s,v14.4s,v26.4s - add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s - add w20,w20,w30 - add v22.4s,v22.4s,v26.4s - add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 - add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 - add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s - ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s - add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s - add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s - ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s - add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s - add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s - ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s - add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s - add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s - ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s - add x1,x1,#64 - add v21.4s,v21.4s,v25.4s - -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - eor v8.16b,v8.16b,v0.16b - ldp 
q24,q25,[sp,#0] - eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] - eor v10.16b,v10.16b,v2.16b - eor v11.16b,v11.16b,v3.16b - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 - - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 - - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 - eor v16.16b,v16.16b,v8.16b - eor v17.16b,v17.16b,v9.16b - eor v18.16b,v18.16b,v10.16b - eor v19.16b,v19.16b,v11.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - shl v0.4s,v31.4s,#1 // 4 -> 8 - eor v20.16b,v20.16b,v12.16b - eor v21.16b,v21.16b,v13.16b - eor v22.16b,v22.16b,v14.16b - eor v23.16b,v23.16b,v15.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - - add v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s - - b.hs .Loop_outer_512_neon - - adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 - - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] - - b.eq .Ldone_512_neon - - cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s - add sp,sp,#128 - b.hs .Loop_outer_neon - - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b - b .Loop_outer - -.Ldone_512_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 -.Labort_neon: - ret -ENDPROC(chacha20_neon) -#endif diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.pl b/src/crypto/zinc/chacha20/chacha20-arm64.pl new file mode 100644 index 0000000..7926c8d --- /dev/null +++ b/src/crypto/zinc/chacha20/chacha20-arm64.pl @@ -0,0 +1,1164 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# June 2015 +# +# ChaCha20 for ARMv8. +# +# Performance in cycles per byte out of large buffer. 
+# +# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*) +# +# Apple A7 5.50/+49% 3.33 1.70 +# Cortex-A53 8.40/+80% 4.72 4.72(**) +# Cortex-A57 8.06/+43% 4.90 4.43(***) +# Denver 4.50/+82% 2.63 2.67(**) +# X-Gene 9.50/+46% 8.82 8.89(**) +# Mongoose 8.00/+44% 3.64 3.25(***) +# Kryo 8.17/+50% 4.83 4.65(***) +# +# (*) since no non-Apple processor exhibits significantly better +# performance, the code path is #ifdef __APPLE__-ed; +# (**) it's expected that doubling interleave factor doesn't help +# all processors, only those with higher NEON latency and +# higher instruction issue rate; +# (***) expected improvement was actually higher; + +$flavour=shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); + +my @x=map("x$_",(5..17,19..21)); +my @d=map("x$_",(22..28,30)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); + + ( + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],16)", + "&ror_32 (@x[$d1],@x[$d1],16)", + "&ror_32 (@x[$d2],@x[$d2],16)", + "&ror_32 (@x[$d3],@x[$d3],16)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],20)", + "&ror_32 (@x[$b1],@x[$b1],20)", + "&ror_32 (@x[$b2],@x[$b2],20)", + "&ror_32 (@x[$b3],@x[$b3],20)", + + "&add_32 (@x[$a0],@x[$a0],@x[$b0])", + "&add_32 (@x[$a1],@x[$a1],@x[$b1])", + "&add_32 (@x[$a2],@x[$a2],@x[$b2])", + "&add_32 (@x[$a3],@x[$a3],@x[$b3])", + "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", + "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", + "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", + "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", + "&ror_32 (@x[$d0],@x[$d0],24)", + "&ror_32 (@x[$d1],@x[$d1],24)", + "&ror_32 (@x[$d2],@x[$d2],24)", + "&ror_32 (@x[$d3],@x[$d3],24)", + + "&add_32 (@x[$c0],@x[$c0],@x[$d0])", + "&add_32 (@x[$c1],@x[$c1],@x[$d1])", + "&add_32 (@x[$c2],@x[$c2],@x[$d2])", + "&add_32 (@x[$c3],@x[$c3],@x[$d3])", + "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", + "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", + "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", + "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", + "&ror_32 (@x[$b0],@x[$b0],25)", + "&ror_32 (@x[$b1],@x[$b1],25)", + "&ror_32 (@x[$b2],@x[$b2],25)", + "&ror_32 (@x[$b3],@x[$b3],25)" + ); +} + +$code.=<<___; +#ifndef __KERNEL__ 
+# include "arm_arch.h" +.extern OPENSSL_armcap_P +#else +# define ChaCha20_ctr32 chacha20_arm +# define ChaCha20_neon chacha20_neon +#endif + +.text + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap_P: +# ifdef __ILP32__ +.long OPENSSL_armcap_P-. +# else +.quad OPENSSL_armcap_P-. +# endif +#endif + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: + cbz $len,.Labort +#ifndef __KERNEL__ + adr @x[0],.LOPENSSL_armcap_P + cmp $len,#192 + b.lo .Lshort +# ifdef __ILP32__ + ldrsw @x[1],[@x[0]] +# else + ldr @x[1],[@x[0]] +# endif + ldr w17,[@x[1],@x[0]] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +.Lshort: +#endif + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ldp @d[6],@d[7],[$ctr] // load counter +#ifdef __AARCH64EB__ + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + +.Loop_outer: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#64 +.Loop: + sub $ctr,$ctr,#1 +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + cbnz $ctr,.Loop + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 + add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + b.lo .Ltail + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.Labort: + ret + +.align 4 
+.Ltail: + add $len,$len,#64 +.Less_than_64: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + stp @x[0],@x[2],[sp,#0] + stp @x[4],@x[6],[sp,#16] + stp @x[8],@x[10],[sp,#32] + stp @x[12],@x[14],[sp,#48] + +.Loop_tail: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +{{{ +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = + map("v$_.4s",(0..7,16..23)); +my (@K)=map("v$_.4s",(24..30)); +my $ONE="v31.4s"; + +sub NEONROUND { +my $odd = pop; +my ($a,$b,$c,$d,$t)=@_; + + ( + "&add ('$a','$a','$b')", + "&eor ('$d','$d','$a')", + "&rev32_16 ('$d','$d')", # vrot ($d,16) + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',20)", + "&sli ('$b','$t',12)", + + "&add ('$a','$a','$b')", + "&eor ('$t','$d','$a')", + "&ushr ('$d','$t',24)", + "&sli ('$d','$t',8)", + + "&add ('$c','$c','$d')", + "&eor ('$t','$b','$c')", + "&ushr ('$b','$t',25)", + "&sli ('$b','$t',7)", + + "&ext ('$c','$c','$c',8)", + "&ext ('$d','$d','$d',$odd?4:12)", + "&ext ('$b','$b','$b',$odd?12:4)" + ); +} + +$code.=<<___; + +#ifdef __KERNEL__ +.globl ChaCha20_neon +.type ChaCha20_neon,%function +#endif +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] +#ifdef __APPLE__ + cmp $len,#512 + b.hs .L512_or_more_neon +#endif + + sub sp,sp,#64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +#endif + add @K[3],@K[3],$ONE // += 1 + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + +.Loop_outer_neon: + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + mov $A0,@K[0] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + mov $A1,@K[0] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + mov $A2,@K[0] + mov.32 @x[6],@d[3] + mov $B0,@K[1] + lsr @x[7],@d[3],#32 + mov $B1,@K[1] + mov.32 @x[8],@d[4] + mov $B2,@K[1] + lsr @x[9],@d[4],#32 + mov $D0,@K[3] + mov.32 @x[10],@d[5] + mov $D1,@K[4] + lsr @x[11],@d[5],#32 + mov $D2,@K[5] + mov.32 @x[12],@d[6] + mov $C0,@K[2] + lsr @x[13],@d[6],#32 + mov $C1,@K[2] + mov.32 @x[14],@d[7] + mov $C2,@K[2] + lsr @x[15],@d[7],#32 + + mov $ctr,#10 + subs $len,$len,#256 +.Loop_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&ROUND(0,4,8,12); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&ROUND(0,5,10,15); + + foreach (@thread0) { + eval; eval(shift(@thread3)); + eval(shift(@thread1)); eval(shift(@thread3)); + eval(shift(@thread2)); eval(shift(@thread3)); + } +$code.=<<___; + cbnz $ctr,.Loop_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add $A0,$A0,@K[0] + add @x[1],@x[1],@d[0],lsr#32 + add $A1,$A1,@K[0] + add.32 @x[2],@x[2],@d[1] + add $A2,$A2,@K[0] + add @x[3],@x[3],@d[1],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[4],@x[4],@d[2] + add $C1,$C1,@K[2] + add @x[5],@x[5],@d[2],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[6],@x[6],@d[3] + add $D0,$D0,@K[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add $D1,$D1,@K[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add $D2,$D2,@K[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add $B0,$B0,@K[1] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add $B1,$B1,@K[1] + add @x[15],@x[15],@d[7],lsr#32 + add $B2,$B2,@K[1] + + b.lo .Ltail_neon + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor 
@x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + add @K[3],@K[3],$ONE // += 4 + stp @x[8],@x[10],[$out,#32] + add @K[4],@K[4],$ONE + stp @x[12],@x[14],[$out,#48] + add @K[5],@K[5],$ONE + add $out,$out,#64 + + st1.8 {$A0-$D0},[$out],#64 + ld1.8 {$A0-$D0},[$inp],#64 + + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + eor $A2,$A2,$A0 + eor $B2,$B2,$B0 + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +.Ltail_neon: + add $len,$len,#256 + cmp $len,#64 + b.lo .Less_than_64 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +#endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#4 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_128 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A0,$A0,$T0 + eor $B0,$B0,$T1 + eor $C0,$C0,$T2 + eor $D0,$D0,$T3 + st1.8 {$A0-$D0},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 + b.lo .Less_than_192 + + ld1.8 {$T0-$T3},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + b.eq .Ldone_neon + sub $len,$len,#64 + + st1.8 {$A2-$D2},[sp] + b .Last_neon + +.Less_than_128: + st1.8 {$A0-$D0},[sp] + b .Last_neon +.Less_than_192: + st1.8 {$A1-$D1},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len + add $ctr,sp,$len + neg $len,$len + +.Loop_tail_neon: + ldrb w10,[$inp,$len] + ldrb w11,[$ctr,$len] + add $len,$len,#1 + eor w10,w10,w11 + strb w10,[$out,$len] + cbnz $len,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_neon,.-ChaCha20_neon +___ +{ +my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; +my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, + $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); + +$code.=<<___; +#ifdef __APPLE__ +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + stp 
x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr @x[0],.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] + ld1 {$ONE},[@x[0]] +# ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 + ror @d[5],@d[5],#32 + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 +# endif + add @K[3],@K[3],$ONE // += 1 + stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part + add @K[3],@K[3],$ONE // not typo + str @K[2],[sp,#32] + add @K[4],@K[3],$ONE + add @K[5],@K[4],$ONE + add @K[6],@K[5],$ONE + shl $ONE,$ONE,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub $len,$len,#512 // not typo + +.Loop_outer_512_neon: + mov $A0,@K[0] + mov $A1,@K[0] + mov $A2,@K[0] + mov $A3,@K[0] + mov $A4,@K[0] + mov $A5,@K[0] + mov $B0,@K[1] + mov.32 @x[0],@d[0] // unpack key block + mov $B1,@K[1] + lsr @x[1],@d[0],#32 + mov $B2,@K[1] + mov.32 @x[2],@d[1] + mov $B3,@K[1] + lsr @x[3],@d[1],#32 + mov $B4,@K[1] + mov.32 @x[4],@d[2] + mov $B5,@K[1] + lsr @x[5],@d[2],#32 + mov $D0,@K[3] + mov.32 @x[6],@d[3] + mov $D1,@K[4] + lsr @x[7],@d[3],#32 + mov $D2,@K[5] + mov.32 @x[8],@d[4] + mov $D3,@K[6] + lsr @x[9],@d[4],#32 + mov $C0,@K[2] + mov.32 @x[10],@d[5] + mov $C1,@K[2] + lsr @x[11],@d[5],#32 + add $D4,$D0,$ONE // +4 + mov.32 @x[12],@d[6] + add $D5,$D1,$ONE // +4 + lsr @x[13],@d[6],#32 + mov $C2,@K[2] + mov.32 @x[14],@d[7] + mov $C3,@K[2] + lsr @x[15],@d[7],#32 + mov $C4,@K[2] + stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + mov $C5,@K[2] + str @K[5],[sp,#80] + + mov $ctr,#5 + subs $len,$len,#512 +.Loop_upper_neon: + sub $ctr,$ctr,#1 +___ + my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + my $diff = ($#thread0+1)*6 - $#thread67 - 1; + my $i = 0; + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_upper_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + add @x[1],@x[1],@d[0],lsr#32 + add.32 @x[2],@x[2],@d[1] + add @x[3],@x[3],@d[1],lsr#32 
+ add.32 @x[4],@x[4],@d[2] + add @x[5],@x[5],@d[2],lsr#32 + add.32 @x[6],@x[6],@d[3] + add @x[7],@x[7],@d[3],lsr#32 + add.32 @x[8],@x[8],@d[4] + add @x[9],@x[9],@d[4],lsr#32 + add.32 @x[10],@x[10],@d[5] + add @x[11],@x[11],@d[5],lsr#32 + add.32 @x[12],@x[12],@d[6] + add @x[13],@x[13],@d[6],lsr#32 + add.32 @x[14],@x[14],@d[7] + add @x[15],@x[15],@d[7],lsr#32 + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +# ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +# endif + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor @x[10],@x[10],@x[11] + eor @x[12],@x[12],@x[13] + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#1 // increment counter + mov.32 @x[0],@d[0] // unpack key block + lsr @x[1],@d[0],#32 + stp @x[4],@x[6],[$out,#16] + mov.32 @x[2],@d[1] + lsr @x[3],@d[1],#32 + stp @x[8],@x[10],[$out,#32] + mov.32 @x[4],@d[2] + lsr @x[5],@d[2],#32 + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + mov.32 @x[6],@d[3] + lsr @x[7],@d[3],#32 + mov.32 @x[8],@d[4] + lsr @x[9],@d[4],#32 + mov.32 @x[10],@d[5] + lsr @x[11],@d[5],#32 + mov.32 @x[12],@d[6] + lsr @x[13],@d[6],#32 + mov.32 @x[14],@d[7] + lsr @x[15],@d[7],#32 + + mov $ctr,#5 +.Loop_lower_neon: + sub $ctr,$ctr,#1 +___ + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } + + @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); + @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); + @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); + @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); + @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); + @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); + @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); + + foreach (@thread0) { + eval; eval(shift(@thread67)); + eval(shift(@thread1)); eval(shift(@thread67)); + eval(shift(@thread2)); eval(shift(@thread67)); + eval(shift(@thread3)); eval(shift(@thread67)); + eval(shift(@thread4)); eval(shift(@thread67)); + eval(shift(@thread5)); eval(shift(@thread67)); + } +$code.=<<___; + cbnz $ctr,.Loop_lower_neon + + add.32 @x[0],@x[0],@d[0] // accumulate key block + ldp @K[0],@K[1],[sp,#0] + add @x[1],@x[1],@d[0],lsr#32 + ldp @K[2],@K[3],[sp,#32] + add.32 @x[2],@x[2],@d[1] + ldp @K[4],@K[5],[sp,#64] + add @x[3],@x[3],@d[1],lsr#32 + add $A0,$A0,@K[0] + add.32 @x[4],@x[4],@d[2] + add $A1,$A1,@K[0] + add @x[5],@x[5],@d[2],lsr#32 + add $A2,$A2,@K[0] + add.32 @x[6],@x[6],@d[3] + add $A3,$A3,@K[0] + add @x[7],@x[7],@d[3],lsr#32 + add $A4,$A4,@K[0] + add.32 
@x[8],@x[8],@d[4] + add $A5,$A5,@K[0] + add @x[9],@x[9],@d[4],lsr#32 + add $C0,$C0,@K[2] + add.32 @x[10],@x[10],@d[5] + add $C1,$C1,@K[2] + add @x[11],@x[11],@d[5],lsr#32 + add $C2,$C2,@K[2] + add.32 @x[12],@x[12],@d[6] + add $C3,$C3,@K[2] + add @x[13],@x[13],@d[6],lsr#32 + add $C4,$C4,@K[2] + add.32 @x[14],@x[14],@d[7] + add $C5,$C5,@K[2] + add @x[15],@x[15],@d[7],lsr#32 + add $D4,$D4,$ONE // +4 + add @x[0],@x[0],@x[1],lsl#32 // pack + add $D5,$D5,$ONE // +4 + add @x[2],@x[2],@x[3],lsl#32 + add $D0,$D0,@K[3] + ldp @x[1],@x[3],[$inp,#0] // load input + add $D1,$D1,@K[4] + add @x[4],@x[4],@x[5],lsl#32 + add $D2,$D2,@K[5] + add @x[6],@x[6],@x[7],lsl#32 + add $D3,$D3,@K[6] + ldp @x[5],@x[7],[$inp,#16] + add $D4,$D4,@K[3] + add @x[8],@x[8],@x[9],lsl#32 + add $D5,$D5,@K[4] + add @x[10],@x[10],@x[11],lsl#32 + add $B0,$B0,@K[1] + ldp @x[9],@x[11],[$inp,#32] + add $B1,$B1,@K[1] + add @x[12],@x[12],@x[13],lsl#32 + add $B2,$B2,@K[1] + add @x[14],@x[14],@x[15],lsl#32 + add $B3,$B3,@K[1] + ldp @x[13],@x[15],[$inp,#48] + add $B4,$B4,@K[1] + add $inp,$inp,#64 + add $B5,$B5,@K[1] + +# ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] + rev @x[6],@x[6] + rev @x[8],@x[8] + rev @x[10],@x[10] + rev @x[12],@x[12] + rev @x[14],@x[14] +# endif + ld1.8 {$T0-$T3},[$inp],#64 + eor @x[0],@x[0],@x[1] + eor @x[2],@x[2],@x[3] + eor @x[4],@x[4],@x[5] + eor @x[6],@x[6],@x[7] + eor @x[8],@x[8],@x[9] + eor $A0,$A0,$T0 + eor @x[10],@x[10],@x[11] + eor $B0,$B0,$T1 + eor @x[12],@x[12],@x[13] + eor $C0,$C0,$T2 + eor @x[14],@x[14],@x[15] + eor $D0,$D0,$T3 + ld1.8 {$T0-$T3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output + add @d[6],@d[6],#7 // increment counter + stp @x[4],@x[6],[$out,#16] + stp @x[8],@x[10],[$out,#32] + stp @x[12],@x[14],[$out,#48] + add $out,$out,#64 + st1.8 {$A0-$D0},[$out],#64 + + ld1.8 {$A0-$D0},[$inp],#64 + eor $A1,$A1,$T0 + eor $B1,$B1,$T1 + eor $C1,$C1,$T2 + eor $D1,$D1,$T3 + st1.8 {$A1-$D1},[$out],#64 + + ld1.8 {$A1-$D1},[$inp],#64 + eor $A2,$A2,$A0 + ldp @K[0],@K[1],[sp,#0] + eor $B2,$B2,$B0 + ldp @K[2],@K[3],[sp,#32] + eor $C2,$C2,$C0 + eor $D2,$D2,$D0 + st1.8 {$A2-$D2},[$out],#64 + + ld1.8 {$A2-$D2},[$inp],#64 + eor $A3,$A3,$A1 + eor $B3,$B3,$B1 + eor $C3,$C3,$C1 + eor $D3,$D3,$D1 + st1.8 {$A3-$D3},[$out],#64 + + ld1.8 {$A3-$D3},[$inp],#64 + eor $A4,$A4,$A2 + eor $B4,$B4,$B2 + eor $C4,$C4,$C2 + eor $D4,$D4,$D2 + st1.8 {$A4-$D4},[$out],#64 + + shl $A0,$ONE,#1 // 4 -> 8 + eor $A5,$A5,$A3 + eor $B5,$B5,$B3 + eor $C5,$C5,$C3 + eor $D5,$D5,$D3 + st1.8 {$A5-$D5},[$out],#64 + + add @K[3],@K[3],$A0 // += 8 + add @K[4],@K[4],$A0 + add @K[5],@K[5],$A0 + add @K[6],@K[6],$A0 + + b.hs .Loop_outer_512_neon + + adds $len,$len,#512 + ushr $A0,$ONE,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp @K[0],$ONE,[sp,#0] // wipe off-load area + stp @K[0],$ONE,[sp,#32] + stp @K[0],$ONE,[sp,#64] + + b.eq .Ldone_512_neon + + cmp $len,#192 + sub @K[3],@K[3],$A0 // -= 1 + sub @K[4],@K[4],$A0 + sub @K[5],@K[5],$A0 + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor @K[1],@K[1],@K[1] + eor @K[2],@K[2],@K[2] + eor @K[3],@K[3],@K[3] + eor @K[4],@K[4],@K[4] + eor @K[5],@K[5],@K[5] + eor @K[6],@K[6],@K[6] + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +#endif +___ +} +}}} + +open SELF,$0; +while() { + 
next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or + (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or + (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or + (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or + (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); + + #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + + print $_,"\n"; +} +close STDOUT; # flush diff --git a/src/crypto/zinc/chacha20/chacha20-unrolled-arm.S b/src/crypto/zinc/chacha20/chacha20-unrolled-arm.S new file mode 100644 index 0000000..2140319 --- /dev/null +++ b/src/crypto/zinc/chacha20/chacha20-unrolled-arm.S @@ -0,0 +1,461 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Google, Inc. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Design notes: + * + * 16 registers would be needed to hold the state matrix, but only 14 are + * available because 'sp' and 'pc' cannot be used. So we spill the elements + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one + * 'ldrd' and one 'strd' instruction per round. + * + * All rotates are performed using the implicit rotate operand accepted by the + * 'add' and 'eor' instructions. This is faster than using explicit rotate + * instructions. To make this work, we allow the values in the second and last + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the + * wrong rotation amount. The rotation amount is then fixed up just in time + * when the values are used. 'brot' is the number of bits the values in row 'b' + * need to be rotated right to arrive at the correct values, and 'drot' + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such + * that they end up as (25, 24) after every round.
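+ *
+ * A rough C model of one quarter-round under this scheme (an illustrative
+ * sketch only, with ror32() as in <linux/bitops.h> and brot/drot holding
+ * the incoming deferred rotation amounts):
+ *
+ *	a += ror32(b, brot);  d = a ^ ror32(d, drot);  // d now needs ror #16
+ *	c += ror32(d, 16);    b = c ^ ror32(b, brot);  // b now needs ror #20
+ *	a += ror32(b, 20);    d = a ^ ror32(d, 16);    // d now needs ror #24
+ *	c += ror32(d, 24);    b = c ^ ror32(b, 20);    // b now needs ror #25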
+ */ + + // ChaCha state registers + X0 .req r0 + X1 .req r1 + X2 .req r2 + X3 .req r3 + X4 .req r4 + X5 .req r5 + X6 .req r6 + X7 .req r7 + X8_X10 .req r8 // shared by x8 and x10 + X9_X11 .req r9 // shared by x9 and x11 + X12 .req r10 + X13 .req r11 + X14 .req r12 + X15 .req r14 + +.Lexpand_32byte_k: + // "expand 32-byte k" + .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + +#ifdef __thumb2__ +# define adrl adr +#endif + +.macro __rev out, in, t0, t1, t2 +.if __LINUX_ARM_ARCH__ >= 6 + rev \out, \in +.else + lsl \t0, \in, #24 + and \t1, \in, #0xff00 + and \t2, \in, #0xff0000 + orr \out, \t0, \in, lsr #24 + orr \out, \out, \t1, lsl #8 + orr \out, \out, \t2, lsr #8 +.endif +.endm + +.macro _le32_bswap x, t0, t1, t2 +#ifdef __ARMEB__ + __rev \x, \x, \t0, \t1, \t2 +#endif +.endm + +.macro _le32_bswap_4x a, b, c, d, t0, t1, t2 + _le32_bswap \a, \t0, \t1, \t2 + _le32_bswap \b, \t0, \t1, \t2 + _le32_bswap \c, \t0, \t1, \t2 + _le32_bswap \d, \t0, \t1, \t2 +.endm + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 + + // a += b; d ^= a; d = rol(d, 16); + add \a1, \a1, \b1, ror #brot + add \a2, \a2, \b2, ror #brot + eor \d1, \a1, \d1, ror #drot + eor \d2, \a2, \d2, ror #drot + // drot == 32 - 16 == 16 + + // c += d; b ^= c; b = rol(b, 12); + add \c1, \c1, \d1, ror #16 + add \c2, \c2, \d2, ror #16 + eor \b1, \c1, \b1, ror #brot + eor \b2, \c2, \b2, ror #brot + // brot == 32 - 12 == 20 + + // a += b; d ^= a; d = rol(d, 8); + add \a1, \a1, \b1, ror #20 + add \a2, \a2, \b2, ror #20 + eor \d1, \a1, \d1, ror #16 + eor \d2, \a2, \d2, ror #16 + // drot == 32 - 8 == 24 + + // c += d; b ^= c; b = rol(b, 7); + add \c1, \c1, \d1, ror #24 + add \c2, \c2, \d2, ror #24 + eor \b1, \c1, \b1, ror #20 + eor \b2, \c2, \b2, ror #20 + // brot == 32 - 7 == 25 +.endm + +.macro _doubleround + + // column round + + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 + + // save (x8, x9); restore (x10, x11) + __strd X8_X10, X9_X11, sp, 0 + __ldrd X8_X10, X9_X11, sp, 8 + + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 + + .set brot, 25 + .set drot, 24 + + // diagonal round + + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 + + // save (x10, x11); restore (x8, x9) + __strd X8_X10, X9_X11, sp, 8 + __ldrd X8_X10, X9_X11, sp, 0 + + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 +.endm + +.macro _chacha_permute nrounds + .set brot, 0 + .set drot, 0 + .rept \nrounds / 2 + _doubleround + .endr +.endm + +.macro _chacha nrounds + +.Lnext_block\@: + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + + // Do the core ChaCha permutation to update x0-x15. + _chacha_permute \nrounds + + add sp, #8 + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). + push {X8_X10, X9_X11, X12, X13, X14, X15} + + // Load (OUT, IN, LEN). 
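+	// They sit above 24 saved words (the 6 registers just pushed, the
+	// x10-x11 copy, and orig_x0-x15), hence the #96/#100/#104 offsets.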
+ ldr r14, [sp, #96] + ldr r12, [sp, #100] + ldr r11, [sp, #104] + + orr r10, r14, r12 + + // Use slow path if fewer than 64 bytes remain. + cmp r11, #64 + blt .Lxor_slowpath\@ + + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on + // ARMv6+, since ldmia and stmia (used below) still require alignment. + tst r10, #3 + bne .Lxor_slowpath\@ + + // Fast path: XOR 64 bytes of aligned data. + + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // x0-x3 + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + ldmia r12!, {r8-r11} + eor X0, X0, r8 + eor X1, X1, r9 + eor X2, X2, r10 + eor X3, X3, r11 + stmia r14!, {X0-X3} + + // x4-x7 + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + ldmia r12!, {X0-X3} + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + eor X4, X4, X0 + eor X5, X5, X1 + eor X6, X6, X2 + eor X7, X7, X3 + stmia r14!, {X4-X7} + + // x8-x15 + pop {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + ldmia r12!, {r8-r11} + eor r0, r0, r8 // x8 + eor r1, r1, r9 // x9 + eor r6, r6, r10 // x10 + eor r7, r7, r11 // x11 + stmia r14!, {r0,r1,r6,r7} + ldmia r12!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + ldr r9, [sp, #72] // load LEN + eor r2, r2, r0 // x12 + eor r3, r3, r1 // x13 + eor r4, r4, r6 // x14 + eor r5, r5, r7 // x15 + subs r9, #64 // decrement and check LEN + stmia r14!, {r2-r5} + + beq .Ldone\@ + +.Lprepare_for_next_block\@: + + // Stack: x0-x15 OUT IN LEN + + // Increment block counter (x12) + add r8, #1 + + // Store updated (OUT, IN, LEN) + str r14, [sp, #64] + str r12, [sp, #68] + str r9, [sp, #72] + + mov r14, sp + + // Store updated block counter (x12) + str r8, [sp, #48] + + sub sp, #16 + + // Reload state and do next block + ldmia r14!, {r0-r11} // load x0-x11 + __strd r10, r11, sp, 8 // store x10-x11 before state + ldmia r14, {r10-r12,r14} // load x12-x15 + b .Lnext_block\@ + +.Lxor_slowpath\@: + // Slow path: < 64 bytes remaining, or unaligned input or output buffer. + // We handle it by storing the 64 bytes of keystream to the stack, then + // XOR-ing the needed portion with the data. + + // Allocate keystream buffer + sub sp, #64 + mov r14, sp + + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. 
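+	// In rough C terms the rest of this path does (sketch only, not a
+	// drop-in):
+	//	u8 ks[64];			// keystream block built below
+	//	n = LEN < 64 ? LEN : 64;
+	//	for (i = 0; i < n; i++)
+	//		OUT[i] = IN[i] ^ ks[i];
+	// with the XOR done a word at a time where alignment allows and
+	// byte-by-byte otherwise.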
+ + // Save keystream for x0-x3 + __ldrd r8, r9, sp, 96 + __ldrd r10, r11, sp, 104 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10 + stmia r14!, {X0-X3} + + // Save keystream for x4-x7 + __ldrd r8, r9, sp, 112 + __ldrd r10, r11, sp, 120 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10 + add r8, sp, #64 + stmia r14!, {X4-X7} + + // Save keystream for x8-x15 + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 128 + __ldrd r10, r11, sp, 136 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10 + stmia r14!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 144 + __ldrd r10, r11, sp, 152 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11 + stmia r14, {r2-r5} + + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN + // Registers: r8 is block counter, r12 is IN. + + ldr r9, [sp, #168] // LEN + ldr r14, [sp, #160] // OUT + cmp r9, #64 + mov r0, sp + movle r1, r9 + movgt r1, #64 + // r1 is number of bytes to XOR, in range [1, 64] + +.if __LINUX_ARM_ARCH__ < 6 + orr r2, r12, r14 + tst r2, #3 // IN or OUT misaligned? + bne .Lxor_next_byte\@ +.endif + + // XOR a word at a time +.rept 16 + subs r1, #4 + blt .Lxor_words_done\@ + ldr r2, [r12], #4 + ldr r3, [r0], #4 + eor r2, r2, r3 + str r2, [r14], #4 +.endr + b .Lxor_slowpath_done\@ +.Lxor_words_done\@: + ands r1, r1, #3 + beq .Lxor_slowpath_done\@ + + // XOR a byte at a time +.Lxor_next_byte\@: + ldrb r2, [r12], #1 + ldrb r3, [r0], #1 + eor r2, r2, r3 + strb r2, [r14], #1 + subs r1, #1 + bne .Lxor_next_byte\@ + +.Lxor_slowpath_done\@: + subs r9, #64 + add sp, #96 + bgt .Lprepare_for_next_block\@ + +.Ldone\@: +.endm // _chacha + +/* + * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8], + * const u32 iv[4]); + */ +ENTRY(chacha20_arm) + cmp r2, #0 // len == 0? + reteq lr + + push {r0-r2,r4-r11,lr} + + // Push state x0-x15 onto stack. + // Also store an extra copy of x10-x11 just before the state. 
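+	// Final layout, low to high: 8 bytes of scratch, the x10-x11 copy,
+	// x0-x15, then the pushed OUT/IN/LEN (r0-r2) and callee-saved regs.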
+ + ldr r4, [sp, #48] // iv + mov r0, sp + sub sp, #80 + + // iv: x12-x15 + ldm r4, {X12,X13,X14,X15} + stmdb r0!, {X12,X13,X14,X15} + + // key: x4-x11 + __ldrd X8_X10, X9_X11, r3, 24 + __strd X8_X10, X9_X11, sp, 8 + stmdb r0!, {X8_X10, X9_X11} + ldm r3, {X4-X9_X11} + stmdb r0!, {X4-X9_X11} + + // constants: x0-x3 + adrl X3, .Lexpand_32byte_k + ldm X3, {X0-X3} + __strd X0, X1, sp, 16 + __strd X2, X3, sp, 24 + + _chacha 20 + + add sp, #76 + pop {r4-r11, pc} +ENDPROC(chacha20_arm) + +/* + * void hchacha20_arm(const u32 state[16], u32 out[8]); + */ +ENTRY(hchacha20_arm) + push {r1,r4-r11,lr} + + mov r14, r0 + ldmia r14!, {r0-r11} // load x0-x11 + push {r10-r11} // store x10-x11 to stack + ldm r14, {r10-r12,r14} // load x12-x15 + sub sp, #8 + + _chacha_permute 20 + + // Skip over (unused0-unused1, x10-x11) + add sp, #16 + + // Fix up rotations of x12-x15 + ror X12, X12, #drot + ror X13, X13, #drot + pop {r4} // load 'out' + ror X14, X14, #drot + ror X15, X15, #drot + + // Store (x0-x3,x12-x15) to 'out' + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} + + pop {r4-r11,pc} +ENDPROC(hchacha20_arm) diff --git a/src/crypto/zinc/poly1305/poly1305-arm.S b/src/crypto/zinc/poly1305/poly1305-arm.S deleted file mode 100644 index 4a0e9d4..0000000 --- a/src/crypto/zinc/poly1305/poly1305-arm.S +++ /dev/null @@ -1,1117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.align 5 -ENTRY(poly1305_init_arm) - stmdb sp!,{r4-r11} - - eor r3,r3,r3 - cmp r1,#0 - str r3,[r0,#0] @ zero hash value - str r3,[r0,#4] - str r3,[r0,#8] - str r3,[r0,#12] - str r3,[r0,#16] - str r3,[r0,#36] @ is_base2_26 - add r0,r0,#20 - -#ifdef __thumb2__ - it eq -#endif - moveq r0,#0 - beq .Lno_key - - ldrb r4,[r1,#0] - mov r10,#0x0fffffff - ldrb r5,[r1,#1] - and r3,r10,#-4 @ 0x0ffffffc - ldrb r6,[r1,#2] - ldrb r7,[r1,#3] - orr r4,r4,r5,lsl#8 - ldrb r5,[r1,#4] - orr r4,r4,r6,lsl#16 - ldrb r6,[r1,#5] - orr r4,r4,r7,lsl#24 - ldrb r7,[r1,#6] - and r4,r4,r10 - - ldrb r8,[r1,#7] - orr r5,r5,r6,lsl#8 - ldrb r6,[r1,#8] - orr r5,r5,r7,lsl#16 - ldrb r7,[r1,#9] - orr r5,r5,r8,lsl#24 - ldrb r8,[r1,#10] - and r5,r5,r3 - - ldrb r9,[r1,#11] - orr r6,r6,r7,lsl#8 - ldrb r7,[r1,#12] - orr r6,r6,r8,lsl#16 - ldrb r8,[r1,#13] - orr r6,r6,r9,lsl#24 - ldrb r9,[r1,#14] - and r6,r6,r3 - - ldrb r10,[r1,#15] - orr r7,r7,r8,lsl#8 - str r4,[r0,#0] - orr r7,r7,r9,lsl#16 - str r5,[r0,#4] - orr r7,r7,r10,lsl#24 - str r6,[r0,#8] - and r7,r7,r3 - str r7,[r0,#12] -.Lno_key: - ldmia sp!,{r4-r11} -#if __LINUX_ARM_ARCH__ >= 5 - bx lr @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -ENDPROC(poly1305_init_arm) - -.align 5 -ENTRY(poly1305_blocks_arm) -.Lpoly1305_blocks_arm: - stmdb sp!,{r3-r11,lr} - - ands r2,r2,#-16 - beq .Lno_data - - cmp r3,#0 - add r2,r2,r1 @ end pointer - sub sp,sp,#32 - - ldmia r0,{r4-r12} @ load context - - str r0,[sp,#12] @ offload stuff - mov lr,r1 - str r2,[sp,#16] - str r10,[sp,#20] - str r11,[sp,#24] - str r12,[sp,#28] - b .Loop - -.Loop: -#if __LINUX_ARM_ARCH__ < 7 - ldrb r0,[lr],#16 @ load input -#ifdef __thumb2__ - it hi -#endif - addhi r8,r8,#1 @ 1<<128 - ldrb r1,[lr,#-15] - ldrb r2,[lr,#-14] - ldrb r3,[lr,#-13] - orr r1,r0,r1,lsl#8 - 
ldrb r0,[lr,#-12] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-11] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-10] - adds r4,r4,r3 @ accumulate input - - ldrb r3,[lr,#-9] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-8] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-7] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-6] - adcs r5,r5,r3 - - ldrb r3,[lr,#-5] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-4] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-3] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-2] - adcs r6,r6,r3 - - ldrb r3,[lr,#-1] - orr r1,r0,r1,lsl#8 - str lr,[sp,#8] @ offload input pointer - orr r2,r1,r2,lsl#16 - add r10,r10,r10,lsr#2 - orr r3,r2,r3,lsl#24 -#else - ldr r0,[lr],#16 @ load input -#ifdef __thumb2__ - it hi -#endif - addhi r8,r8,#1 @ padbit - ldr r1,[lr,#-12] - ldr r2,[lr,#-8] - ldr r3,[lr,#-4] -#ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -#endif - adds r4,r4,r0 @ accumulate input - str lr,[sp,#8] @ offload input pointer - adcs r5,r5,r1 - add r10,r10,r10,lsr#2 - adcs r6,r6,r2 -#endif - add r11,r11,r11,lsr#2 - adcs r7,r7,r3 - add r12,r12,r12,lsr#2 - - umull r2,r3,r5,r9 - adc r8,r8,#0 - umull r0,r1,r4,r9 - umlal r2,r3,r8,r10 - umlal r0,r1,r7,r10 - ldr r10,[sp,#20] @ reload r10 - umlal r2,r3,r6,r12 - umlal r0,r1,r5,r12 - umlal r2,r3,r7,r11 - umlal r0,r1,r6,r11 - umlal r2,r3,r4,r10 - str r0,[sp,#0] @ future r4 - mul r0,r11,r8 - ldr r11,[sp,#24] @ reload r11 - adds r2,r2,r1 @ d1+=d0>>32 - eor r1,r1,r1 - adc lr,r3,#0 @ future r6 - str r2,[sp,#4] @ future r5 - - mul r2,r12,r8 - eor r3,r3,r3 - umlal r0,r1,r7,r12 - ldr r12,[sp,#28] @ reload r12 - umlal r2,r3,r7,r9 - umlal r0,r1,r6,r9 - umlal r2,r3,r6,r10 - umlal r0,r1,r5,r10 - umlal r2,r3,r5,r11 - umlal r0,r1,r4,r11 - umlal r2,r3,r4,r12 - ldr r4,[sp,#0] - mul r8,r9,r8 - ldr r5,[sp,#4] - - adds r6,lr,r0 @ d2+=d1>>32 - ldr lr,[sp,#8] @ reload input pointer - adc r1,r1,#0 - adds r7,r2,r1 @ d3+=d2>>32 - ldr r0,[sp,#16] @ reload end pointer - adc r3,r3,#0 - add r8,r8,r3 @ h4+=d3>>32 - - and r1,r8,#-4 - and r8,r8,#3 - add r1,r1,r1,lsr#2 @ *=5 - adds r4,r4,r1 - adcs r5,r5,#0 - adcs r6,r6,#0 - adcs r7,r7,#0 - adc r8,r8,#0 - - cmp r0,lr @ done yet? - bhi .Loop - - ldr r0,[sp,#12] - add sp,sp,#32 - stmia r0,{r4-r8} @ store the result - -.Lno_data: -#if __LINUX_ARM_ARCH__ >= 5 - ldmia sp!,{r3-r11,pc} -#else - ldmia sp!,{r3-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -ENDPROC(poly1305_blocks_arm) - -.align 5 -ENTRY(poly1305_emit_arm) - stmdb sp!,{r4-r11} -.Lpoly1305_emit_enter: - ldmia r0,{r3-r7} - adds r8,r3,#5 @ compare to modulus - adcs r9,r4,#0 - adcs r10,r5,#0 - adcs r11,r6,#0 - adc r7,r7,#0 - tst r7,#4 @ did it carry/borrow? 
- -#ifdef __thumb2__ - it ne -#endif - movne r3,r8 - ldr r8,[r2,#0] -#ifdef __thumb2__ - it ne -#endif - movne r4,r9 - ldr r9,[r2,#4] -#ifdef __thumb2__ - it ne -#endif - movne r5,r10 - ldr r10,[r2,#8] -#ifdef __thumb2__ - it ne -#endif - movne r6,r11 - ldr r11,[r2,#12] - - adds r3,r3,r8 - adcs r4,r4,r9 - adcs r5,r5,r10 - adc r6,r6,r11 - -#if __LINUX_ARM_ARCH__ >= 7 -#ifdef __ARMEB__ - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 -#endif - str r3,[r1,#0] - str r4,[r1,#4] - str r5,[r1,#8] - str r6,[r1,#12] -#else - strb r3,[r1,#0] - mov r3,r3,lsr#8 - strb r4,[r1,#4] - mov r4,r4,lsr#8 - strb r5,[r1,#8] - mov r5,r5,lsr#8 - strb r6,[r1,#12] - mov r6,r6,lsr#8 - - strb r3,[r1,#1] - mov r3,r3,lsr#8 - strb r4,[r1,#5] - mov r4,r4,lsr#8 - strb r5,[r1,#9] - mov r5,r5,lsr#8 - strb r6,[r1,#13] - mov r6,r6,lsr#8 - - strb r3,[r1,#2] - mov r3,r3,lsr#8 - strb r4,[r1,#6] - mov r4,r4,lsr#8 - strb r5,[r1,#10] - mov r5,r5,lsr#8 - strb r6,[r1,#14] - mov r6,r6,lsr#8 - - strb r3,[r1,#3] - strb r4,[r1,#7] - strb r5,[r1,#11] - strb r6,[r1,#15] -#endif - ldmia sp!,{r4-r11} -#if __LINUX_ARM_ARCH__ >= 5 - bx lr @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - .word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -ENDPROC(poly1305_emit_arm) - - -#ifdef CONFIG_KERNEL_MODE_NEON -.fpu neon - -.align 5 -ENTRY(poly1305_init_neon) -.Lpoly1305_init_neon: - ldr r4,[r0,#20] @ load key base 2^32 - ldr r5,[r0,#24] - ldr r6,[r0,#28] - ldr r7,[r0,#32] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - and r3,r3,#0x03ffffff - and r4,r4,#0x03ffffff - and r5,r5,#0x03ffffff - - vdup.32 d0,r2 @ r^1 in both lanes - add r2,r3,r3,lsl#2 @ *5 - vdup.32 d1,r3 - add r3,r4,r4,lsl#2 - vdup.32 d2,r2 - vdup.32 d3,r4 - add r4,r5,r5,lsl#2 - vdup.32 d4,r3 - vdup.32 d5,r5 - add r5,r6,r6,lsl#2 - vdup.32 d6,r4 - vdup.32 d7,r6 - vdup.32 d8,r5 - - mov r5,#2 @ counter - -.Lsquare_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - - vmull.u32 q5,d0,d0[1] - vmull.u32 q6,d1,d0[1] - vmull.u32 q7,d3,d0[1] - vmull.u32 q8,d5,d0[1] - vmull.u32 q9,d7,d0[1] - - vmlal.u32 q5,d7,d2[1] - vmlal.u32 q6,d0,d1[1] - vmlal.u32 q7,d1,d1[1] - vmlal.u32 q8,d3,d1[1] - vmlal.u32 q9,d5,d1[1] - - vmlal.u32 q5,d5,d4[1] - vmlal.u32 q6,d7,d4[1] - vmlal.u32 q8,d1,d3[1] - vmlal.u32 q7,d0,d3[1] - vmlal.u32 q9,d3,d3[1] - - vmlal.u32 q5,d3,d6[1] - vmlal.u32 q8,d0,d5[1] - vmlal.u32 q6,d5,d6[1] - vmlal.u32 q7,d7,d6[1] - vmlal.u32 q9,d1,d5[1] - - vmlal.u32 q8,d7,d8[1] - vmlal.u32 q5,d1,d8[1] - vmlal.u32 q6,d3,d8[1] - vmlal.u32 q7,d5,d8[1] - vmlal.u32 q9,d0,d7[1] - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - @ and P. Schwabe - @ - @ H0>>+H1>>+H2>>+H3>>+H4 - @ H3>>+H4>>*5+H0>>+H1 - @ - @ Trivia. - @ - @ Result of multiplication of n-bit number by m-bit number is - @ n+m bits wide. However! Even though 2^n is a n+1-bit number, - @ m-bit number multiplied by 2^n is still n+m bits wide. - @ - @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, - @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit - @ one is n+1 bits wide. 
- @ - @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that - @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 - @ can be 27. However! In cases when their width exceeds 26 bits - @ they are limited by 2^26+2^6. This in turn means that *sum* - @ of the products with these values can still be viewed as sum - @ of 52-bit numbers as long as the amount of addends is not a - @ power of 2. For example, - @ - @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, - @ - @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or - @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than - @ 8 * (2^52) or 2^55. However, the value is then multiplied by - @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), - @ which is less than 32 * (2^52) or 2^57. And when processing - @ data we are looking at triple as many addends... - @ - @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and - @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the - @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while - @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 - @ instruction accepts 2x32-bit input and writes 2x64-bit result. - @ This means that result of reduction have to be compressed upon - @ loop wrap-around. This can be done in the process of reduction - @ to minimize amount of instructions [as well as amount of - @ 128-bit instructions, which benefits low-end processors], but - @ one has to watch for H2 (which is narrower than H0) and 5*H4 - @ not being wider than 58 bits, so that result of right shift - @ by 26 bits fits in 32 bits. This is also useful on x86, - @ because it allows to use paddd in place for paddq, which - @ benefits Atom, where paddq is ridiculously slow. - - vshr.u64 q15,q8,#26 - vmovn.i64 d16,q8 - vshr.u64 q4,q5,#26 - vmovn.i64 d10,q5 - vadd.i64 q9,q9,q15 @ h3 -> h4 - vbic.i32 d16,#0xfc000000 @ &=0x03ffffff - vadd.i64 q6,q6,q4 @ h0 -> h1 - vbic.i32 d10,#0xfc000000 - - vshrn.u64 d30,q9,#26 - vmovn.i64 d18,q9 - vshr.u64 q4,q6,#26 - vmovn.i64 d12,q6 - vadd.i64 q7,q7,q4 @ h1 -> h2 - vbic.i32 d18,#0xfc000000 - vbic.i32 d12,#0xfc000000 - - vadd.i32 d10,d10,d30 - vshl.u32 d30,d30,#2 - vshrn.u64 d8,q7,#26 - vmovn.i64 d14,q7 - vadd.i32 d10,d10,d30 @ h4 -> h0 - vadd.i32 d16,d16,d8 @ h2 -> h3 - vbic.i32 d14,#0xfc000000 - - vshr.u32 d30,d10,#26 - vbic.i32 d10,#0xfc000000 - vshr.u32 d8,d16,#26 - vbic.i32 d16,#0xfc000000 - vadd.i32 d12,d12,d30 @ h0 -> h1 - vadd.i32 d18,d18,d8 @ h3 -> h4 - - subs r5,r5,#1 - beq .Lsquare_break_neon - - add r6,r0,#(48+0*9*4) - add r7,r0,#(48+1*9*4) - - vtrn.32 d0,d10 @ r^2:r^1 - vtrn.32 d3,d14 - vtrn.32 d5,d16 - vtrn.32 d1,d12 - vtrn.32 d7,d18 - - vshl.u32 d4,d3,#2 @ *5 - vshl.u32 d6,d5,#2 - vshl.u32 d2,d1,#2 - vshl.u32 d8,d7,#2 - vadd.i32 d4,d4,d3 - vadd.i32 d2,d2,d1 - vadd.i32 d6,d6,d5 - vadd.i32 d8,d8,d7 - - vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! - vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! - vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! - vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! - vst1.32 {d8[0]},[r6,:32] - vst1.32 {d8[1]},[r7,:32] - - b .Lsquare_neon - -.align 4 -.Lsquare_break_neon: - add r6,r0,#(48+2*4*9) - add r7,r0,#(48+3*4*9) - - vmov d0,d10 @ r^4:r^3 - vshl.u32 d2,d12,#2 @ *5 - vmov d1,d12 - vshl.u32 d4,d14,#2 - vmov d3,d14 - vshl.u32 d6,d16,#2 - vmov d5,d16 - vshl.u32 d8,d18,#2 - vmov d7,d18 - vadd.i32 d2,d2,d12 - vadd.i32 d4,d4,d14 - vadd.i32 d6,d6,d16 - vadd.i32 d8,d8,d18 - - vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! - vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! 
- vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! - vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! - vst1.32 {d8[0]},[r6] - vst1.32 {d8[1]},[r7] - - bx lr @ bx lr -ENDPROC(poly1305_init_neon) - -.align 5 -ENTRY(poly1305_blocks_neon) - ldr ip,[r0,#36] @ is_base2_26 - ands r2,r2,#-16 - beq .Lno_data_neon - - cmp r2,#64 - bhs .Lenter_neon - tst ip,ip @ is_base2_26? - beq .Lpoly1305_blocks_arm - -.Lenter_neon: - stmdb sp!,{r4-r7} - vstmdb sp!,{d8-d15} @ ABI specification says so - - tst ip,ip @ is_base2_26? - bne .Lbase2_26_neon - - stmdb sp!,{r1-r3,lr} - bl .Lpoly1305_init_neon - - ldr r4,[r0,#0] @ load hash value base 2^32 - ldr r5,[r0,#4] - ldr r6,[r0,#8] - ldr r7,[r0,#12] - ldr ip,[r0,#16] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - veor d10,d10,d10 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - veor d12,d12,d12 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - veor d14,d14,d14 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - veor d16,d16,d16 - and r3,r3,#0x03ffffff - orr r6,r6,ip,lsl#24 - veor d18,d18,d18 - and r4,r4,#0x03ffffff - mov r1,#1 - and r5,r5,#0x03ffffff - str r1,[r0,#36] @ is_base2_26 - - vmov.32 d10[0],r2 - vmov.32 d12[0],r3 - vmov.32 d14[0],r4 - vmov.32 d16[0],r5 - vmov.32 d18[0],r6 - adr r5,.Lzeros - - ldmia sp!,{r1-r3,lr} - b .Lbase2_32_neon - -.align 4 -.Lbase2_26_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ load hash value - - veor d10,d10,d10 - veor d12,d12,d12 - veor d14,d14,d14 - veor d16,d16,d16 - veor d18,d18,d18 - vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! - adr r5,.Lzeros - vld1.32 {d18[0]},[r0] - sub r0,r0,#16 @ rewind - -.Lbase2_32_neon: - add r4,r1,#32 - mov r3,r3,lsl#24 - tst r2,#31 - beq .Leven - - vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]! - vmov.32 d28[0],r3 - sub r2,r2,#16 - add r4,r1,#32 - -#ifdef __ARMEB__ - vrev32.8 q10,q10 - vrev32.8 q13,q13 - vrev32.8 q11,q11 - vrev32.8 q12,q12 -#endif - vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26 - vshl.u32 d26,d26,#18 - - vsri.u32 d26,d24,#14 - vshl.u32 d24,d24,#12 - vadd.i32 d29,d28,d18 @ add hash value and move to #hi - - vbic.i32 d26,#0xfc000000 - vsri.u32 d24,d22,#20 - vshl.u32 d22,d22,#6 - - vbic.i32 d24,#0xfc000000 - vsri.u32 d22,d20,#26 - vadd.i32 d27,d26,d16 - - vbic.i32 d20,#0xfc000000 - vbic.i32 d22,#0xfc000000 - vadd.i32 d25,d24,d14 - - vadd.i32 d21,d20,d10 - vadd.i32 d23,d22,d12 - - mov r7,r5 - add r6,r0,#48 - - cmp r2,r2 - b .Long_tail - -.align 4 -.Leven: - subs r2,r2,#64 - it lo - movlo r4,r5 - - vmov.i32 q14,#1<<24 @ padbit, yes, always - vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] - add r1,r1,#64 - vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) - add r4,r4,#64 - itt hi - addhi r7,r0,#(48+1*9*4) - addhi r6,r0,#(48+3*9*4) - -#ifdef __ARMEB__ - vrev32.8 q10,q10 - vrev32.8 q13,q13 - vrev32.8 q11,q11 - vrev32.8 q12,q12 -#endif - vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 - vshl.u32 q13,q13,#18 - - vsri.u32 q13,q12,#14 - vshl.u32 q12,q12,#12 - - vbic.i32 q13,#0xfc000000 - vsri.u32 q12,q11,#20 - vshl.u32 q11,q11,#6 - - vbic.i32 q12,#0xfc000000 - vsri.u32 q11,q10,#26 - - vbic.i32 q10,#0xfc000000 - vbic.i32 q11,#0xfc000000 - - bls .Lskip_loop - - vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2 - vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 - vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! - vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 
- b .Loop_neon - -.align 5 -.Loop_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - @ ___________________/ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - @ ___________________/ ____________________/ - @ - @ Note that we start with inp[2:3]*r^2. This is because it - @ doesn't depend on reduction in previous iteration. - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ inp[2:3]*r^2 - - vadd.i32 d24,d24,d14 @ accumulate inp[0:1] - vmull.u32 q7,d25,d0[1] - vadd.i32 d20,d20,d10 - vmull.u32 q5,d21,d0[1] - vadd.i32 d26,d26,d16 - vmull.u32 q8,d27,d0[1] - vmlal.u32 q7,d23,d1[1] - vadd.i32 d22,d22,d12 - vmull.u32 q6,d23,d0[1] - - vadd.i32 d28,d28,d18 - vmull.u32 q9,d29,d0[1] - subs r2,r2,#64 - vmlal.u32 q5,d29,d2[1] - it lo - movlo r4,r5 - vmlal.u32 q8,d25,d1[1] - vld1.32 d8[1],[r7,:32] - vmlal.u32 q6,d21,d1[1] - vmlal.u32 q9,d27,d1[1] - - vmlal.u32 q5,d27,d4[1] - vmlal.u32 q8,d23,d3[1] - vmlal.u32 q9,d25,d3[1] - vmlal.u32 q6,d29,d4[1] - vmlal.u32 q7,d21,d3[1] - - vmlal.u32 q8,d21,d5[1] - vmlal.u32 q5,d25,d6[1] - vmlal.u32 q9,d23,d5[1] - vmlal.u32 q6,d27,d6[1] - vmlal.u32 q7,d29,d6[1] - - vmlal.u32 q8,d29,d8[1] - vmlal.u32 q5,d23,d8[1] - vmlal.u32 q9,d21,d7[1] - vmlal.u32 q6,d25,d8[1] - vmlal.u32 q7,d27,d8[1] - - vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) - add r4,r4,#64 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4 and accumulate - - vmlal.u32 q8,d26,d0[0] - vmlal.u32 q5,d20,d0[0] - vmlal.u32 q9,d28,d0[0] - vmlal.u32 q6,d22,d0[0] - vmlal.u32 q7,d24,d0[0] - vld1.32 d8[0],[r6,:32] - - vmlal.u32 q8,d24,d1[0] - vmlal.u32 q5,d28,d2[0] - vmlal.u32 q9,d26,d1[0] - vmlal.u32 q6,d20,d1[0] - vmlal.u32 q7,d22,d1[0] - - vmlal.u32 q8,d22,d3[0] - vmlal.u32 q5,d26,d4[0] - vmlal.u32 q9,d24,d3[0] - vmlal.u32 q6,d28,d4[0] - vmlal.u32 q7,d20,d3[0] - - vmlal.u32 q8,d20,d5[0] - vmlal.u32 q5,d24,d6[0] - vmlal.u32 q9,d22,d5[0] - vmlal.u32 q6,d26,d6[0] - vmlal.u32 q8,d28,d8[0] - - vmlal.u32 q7,d28,d6[0] - vmlal.u32 q5,d22,d8[0] - vmlal.u32 q9,d20,d7[0] - vmov.i32 q14,#1<<24 @ padbit, yes, always - vmlal.u32 q6,d24,d8[0] - vmlal.u32 q7,d26,d8[0] - - vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] - add r1,r1,#64 -#ifdef __ARMEB__ - vrev32.8 q10,q10 - vrev32.8 q11,q11 - vrev32.8 q12,q12 - vrev32.8 q13,q13 -#endif - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction interleaved with base 2^32 -> base 2^26 of - @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14. 
- - vshr.u64 q15,q8,#26 - vmovn.i64 d16,q8 - vshr.u64 q4,q5,#26 - vmovn.i64 d10,q5 - vadd.i64 q9,q9,q15 @ h3 -> h4 - vbic.i32 d16,#0xfc000000 - vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 - vadd.i64 q6,q6,q4 @ h0 -> h1 - vshl.u32 q13,q13,#18 - vbic.i32 d10,#0xfc000000 - - vshrn.u64 d30,q9,#26 - vmovn.i64 d18,q9 - vshr.u64 q4,q6,#26 - vmovn.i64 d12,q6 - vadd.i64 q7,q7,q4 @ h1 -> h2 - vsri.u32 q13,q12,#14 - vbic.i32 d18,#0xfc000000 - vshl.u32 q12,q12,#12 - vbic.i32 d12,#0xfc000000 - - vadd.i32 d10,d10,d30 - vshl.u32 d30,d30,#2 - vbic.i32 q13,#0xfc000000 - vshrn.u64 d8,q7,#26 - vmovn.i64 d14,q7 - vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec] - vsri.u32 q12,q11,#20 - vadd.i32 d16,d16,d8 @ h2 -> h3 - vshl.u32 q11,q11,#6 - vbic.i32 d14,#0xfc000000 - vbic.i32 q12,#0xfc000000 - - vshrn.u64 d30,q5,#26 @ re-narrow - vmovn.i64 d10,q5 - vsri.u32 q11,q10,#26 - vbic.i32 q10,#0xfc000000 - vshr.u32 d8,d16,#26 - vbic.i32 d16,#0xfc000000 - vbic.i32 d10,#0xfc000000 - vadd.i32 d12,d12,d30 @ h0 -> h1 - vadd.i32 d18,d18,d8 @ h3 -> h4 - vbic.i32 q11,#0xfc000000 - - bhi .Loop_neon - -.Lskip_loop: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - add r7,r0,#(48+0*9*4) - add r6,r0,#(48+1*9*4) - adds r2,r2,#32 - it ne - movne r2,#0 - bne .Long_tail - - vadd.i32 d25,d24,d14 @ add hash value and move to #hi - vadd.i32 d21,d20,d10 - vadd.i32 d27,d26,d16 - vadd.i32 d23,d22,d12 - vadd.i32 d29,d28,d18 - -.Long_tail: - vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1 - vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2 - - vadd.i32 d24,d24,d14 @ can be redundant - vmull.u32 q7,d25,d0 - vadd.i32 d20,d20,d10 - vmull.u32 q5,d21,d0 - vadd.i32 d26,d26,d16 - vmull.u32 q8,d27,d0 - vadd.i32 d22,d22,d12 - vmull.u32 q6,d23,d0 - vadd.i32 d28,d28,d18 - vmull.u32 q9,d29,d0 - - vmlal.u32 q5,d29,d2 - vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! - vmlal.u32 q8,d25,d1 - vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! - vmlal.u32 q6,d21,d1 - vmlal.u32 q9,d27,d1 - vmlal.u32 q7,d23,d1 - - vmlal.u32 q8,d23,d3 - vld1.32 d8[1],[r7,:32] - vmlal.u32 q5,d27,d4 - vld1.32 d8[0],[r6,:32] - vmlal.u32 q9,d25,d3 - vmlal.u32 q6,d29,d4 - vmlal.u32 q7,d21,d3 - - vmlal.u32 q8,d21,d5 - it ne - addne r7,r0,#(48+2*9*4) - vmlal.u32 q5,d25,d6 - it ne - addne r6,r0,#(48+3*9*4) - vmlal.u32 q9,d23,d5 - vmlal.u32 q6,d27,d6 - vmlal.u32 q7,d29,d6 - - vmlal.u32 q8,d29,d8 - vorn q0,q0,q0 @ all-ones, can be redundant - vmlal.u32 q5,d23,d8 - vshr.u64 q0,q0,#38 - vmlal.u32 q9,d21,d7 - vmlal.u32 q6,d25,d8 - vmlal.u32 q7,d27,d8 - - beq .Lshort_tail - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4:r^3 and accumulate - - vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3 - vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 - - vmlal.u32 q7,d24,d0 - vmlal.u32 q5,d20,d0 - vmlal.u32 q8,d26,d0 - vmlal.u32 q6,d22,d0 - vmlal.u32 q9,d28,d0 - - vmlal.u32 q5,d28,d2 - vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! - vmlal.u32 q8,d24,d1 - vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! 
- vmlal.u32 q6,d20,d1 - vmlal.u32 q9,d26,d1 - vmlal.u32 q7,d22,d1 - - vmlal.u32 q8,d22,d3 - vld1.32 d8[1],[r7,:32] - vmlal.u32 q5,d26,d4 - vld1.32 d8[0],[r6,:32] - vmlal.u32 q9,d24,d3 - vmlal.u32 q6,d28,d4 - vmlal.u32 q7,d20,d3 - - vmlal.u32 q8,d20,d5 - vmlal.u32 q5,d24,d6 - vmlal.u32 q9,d22,d5 - vmlal.u32 q6,d26,d6 - vmlal.u32 q7,d28,d6 - - vmlal.u32 q8,d28,d8 - vorn q0,q0,q0 @ all-ones - vmlal.u32 q5,d22,d8 - vshr.u64 q0,q0,#38 - vmlal.u32 q9,d20,d7 - vmlal.u32 q6,d24,d8 - vmlal.u32 q7,d26,d8 - -.Lshort_tail: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ horizontal addition - - vadd.i64 d16,d16,d17 - vadd.i64 d10,d10,d11 - vadd.i64 d18,d18,d19 - vadd.i64 d12,d12,d13 - vadd.i64 d14,d14,d15 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction, but without narrowing - - vshr.u64 q15,q8,#26 - vand.i64 q8,q8,q0 - vshr.u64 q4,q5,#26 - vand.i64 q5,q5,q0 - vadd.i64 q9,q9,q15 @ h3 -> h4 - vadd.i64 q6,q6,q4 @ h0 -> h1 - - vshr.u64 q15,q9,#26 - vand.i64 q9,q9,q0 - vshr.u64 q4,q6,#26 - vand.i64 q6,q6,q0 - vadd.i64 q7,q7,q4 @ h1 -> h2 - - vadd.i64 q5,q5,q15 - vshl.u64 q15,q15,#2 - vshr.u64 q4,q7,#26 - vand.i64 q7,q7,q0 - vadd.i64 q5,q5,q15 @ h4 -> h0 - vadd.i64 q8,q8,q4 @ h2 -> h3 - - vshr.u64 q15,q5,#26 - vand.i64 q5,q5,q0 - vshr.u64 q4,q8,#26 - vand.i64 q8,q8,q0 - vadd.i64 q6,q6,q15 @ h0 -> h1 - vadd.i64 q9,q9,q4 @ h3 -> h4 - - cmp r2,#0 - bne .Leven - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ store hash value - - vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! - vst1.32 {d18[0]},[r0] - - vldmia sp!,{d8-d15} @ epilogue - ldmia sp!,{r4-r7} -.Lno_data_neon: - bx lr @ bx lr -ENDPROC(poly1305_blocks_neon) - -.align 5 -ENTRY(poly1305_emit_neon) - ldr ip,[r0,#36] @ is_base2_26 - - stmdb sp!,{r4-r11} - - tst ip,ip - beq .Lpoly1305_emit_enter - - ldmia r0,{r3-r7} - eor r8,r8,r8 - - adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32 - mov r4,r4,lsr#6 - adcs r4,r4,r5,lsl#20 - mov r5,r5,lsr#12 - adcs r5,r5,r6,lsl#14 - mov r6,r6,lsr#18 - adcs r6,r6,r7,lsl#8 - adc r7,r8,r7,lsr#24 @ can be partially reduced ... - - and r8,r7,#-4 @ ... so reduce - and r7,r6,#3 - add r8,r8,r8,lsr#2 @ *= 5 - adds r3,r3,r8 - adcs r4,r4,#0 - adcs r5,r5,#0 - adcs r6,r6,#0 - adc r7,r7,#0 - - adds r8,r3,#5 @ compare to modulus - adcs r9,r4,#0 - adcs r10,r5,#0 - adcs r11,r6,#0 - adc r7,r7,#0 - tst r7,#4 @ did it carry/borrow? - - it ne - movne r3,r8 - ldr r8,[r2,#0] - it ne - movne r4,r9 - ldr r9,[r2,#4] - it ne - movne r5,r10 - ldr r10,[r2,#8] - it ne - movne r6,r11 - ldr r11,[r2,#12] - - adds r3,r3,r8 @ accumulate nonce - adcs r4,r4,r9 - adcs r5,r5,r10 - adc r6,r6,r11 - -#ifdef __ARMEB__ - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 -#endif - str r3,[r1,#0] @ store the result - str r4,[r1,#4] - str r5,[r1,#8] - str r6,[r1,#12] - - ldmia sp!,{r4-r11} - bx lr @ bx lr -ENDPROC(poly1305_emit_neon) - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -#endif diff --git a/src/crypto/zinc/poly1305/poly1305-arm.pl b/src/crypto/zinc/poly1305/poly1305-arm.pl new file mode 100644 index 0000000..88a4260 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-arm.pl @@ -0,0 +1,1272 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. 
+# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_init_arm +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arm +.globl poly1305_emit_neon +.globl poly1305_blocks_neon +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# ifdef __APPLE__ + ldr r12,[r12] +# endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __APPLE__ + adr r9,poly1305_blocks_neon + adr r11,poly1305_blocks +# ifdef __thumb2__ + it ne +# endif + movne r11,r9 + adr r12,poly1305_emit + adr r10,poly1305_emit_neon +# ifdef __thumb2__ + it ne +# endif + movne r12,r10 +# else +# ifdef __thumb2__ + itete eq +# endif + addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) + addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) + addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) 
+# endif +# ifdef __thumb2__ + orr r12,r12,#1 @ thumb-ify address + orr r11,r11,#1 +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + cmp $padbit,#0 + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + + ldmia $ctx,{$h0-$r3} @ load context + + str $ctx,[sp,#12] @ offload stuff + mov lr,$inp + str $len,[sp,#16] + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add 
r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmia $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$h4; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: + stmdb sp!,{r4-r11} +.Lpoly1305_emit_enter: + + ldmia $ctx,{$h0-$h4} + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? + +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = 
h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. 
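The carry chain that this bound analysis justifies is easier to follow in scalar form. Below is a minimal C sketch of the same lazy reduction over five unsaturated base-2^26 limbs; the function name and the uint64_t limb array are illustrative, not part of the patch. Each carry is folded into the next limb, and the carry out of h4 wraps back into h0 multiplied by 5, since 2^130 is congruent to 5 modulo 2^130 - 5. The order of the folds mirrors the vshr/vbic/vadd sequence that follows.

#include <stdint.h>

#define MASK26 ((1ULL << 26) - 1)

/* Illustrative sketch: lazily reduce five base-2^26 limbs held in 64-bit
 * accumulators so that each result limb is 26 bits wide again, except h1
 * and h4, which may be one bit wider (the invariant described above). */
void poly1305_lazy_reduce(uint64_t h[5])
{
	uint64_t c;

	c = h[3] >> 26;  h[3] &= MASK26;  h[4] += c;      /* h3 -> h4 */
	c = h[0] >> 26;  h[0] &= MASK26;  h[1] += c;      /* h0 -> h1 */
	c = h[4] >> 26;  h[4] &= MASK26;  h[0] += c * 5;  /* h4 -> h0, 2^130 = 5 */
	c = h[1] >> 26;  h[1] &= MASK26;  h[2] += c;      /* h1 -> h2 */
	c = h[2] >> 26;  h[2] &= MASK26;  h[3] += c;      /* h2 -> h3 */
	c = h[0] >> 26;  h[0] &= MASK26;  h[1] += c;      /* h0 -> h1 */
	c = h[3] >> 26;  h[3] &= MASK26;  h[4] += c;      /* h3 -> h4 */
}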
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + ands $len,$len,#-16 + beq .Lno_data_neon + + cmp $len,#64 + bhs .Lenter_neon + tst ip,ip @ is_base2_26? + beq .Lpoly1305_blocks + +.Lenter_neon: + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? 
+ bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lbase2_32_neon + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lbase2_32_neon: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
+ b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
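The radix-splitting half of that interleaving is the same computation each time: four little-endian 32-bit words of a block, plus the pad bit, become five 26-bit limbs. The following scalar C sketch shows the split that the vsri/vshl pairs below perform; the function name and parameters are illustrative, and the NEON code does this for two blocks at once in each q register.

#include <stdint.h>

/* Illustrative sketch: convert one 16-byte block, given as little-endian
 * 32-bit words w[0..3], plus the pad bit, into five base-2^26 limbs. */
void block_to_base2_26(uint32_t h[5], const uint32_t w[4], uint32_t padbit)
{
	h[0] =   w[0]                        & 0x03ffffff;
	h[1] = ((w[0] >> 26) | (w[1] <<  6)) & 0x03ffffff;
	h[2] = ((w[1] >> 20) | (w[2] << 12)) & 0x03ffffff;
	h[3] = ((w[2] >> 14) | (w[3] << 18)) & 0x03ffffff;
	h[4] =  (w[3] >>  8) | (padbit << 24);
}

poly1305_emit_neon performs the inverse conversion, base 2^26 back to base 2^32, before the final reduction.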
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
+ vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} +.Lno_data_neon: + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.type poly1305_emit_neon,%function +.align 5 +poly1305_emit_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + stmdb sp!,{r4-r11} + + tst ip,ip + beq .Lpoly1305_emit_enter + + ldmia $ctx,{$h0-$h4} + eor $g0,$g0,$g0 + + adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $h1,$h1,lsr#6 + adcs $h1,$h1,$h2,lsl#20 + mov $h2,$h2,lsr#12 + adcs $h2,$h2,$h3,lsl#14 + mov $h3,$h3,lsr#18 + adcs $h3,$h3,$h4,lsl#8 + adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ... + + and $g0,$h4,#-4 @ ... so reduce + and $h4,$h3,#3 + add $g0,$g0,$g0,lsr#2 @ *= 5 + adds $h0,$h0,$g0 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? 
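The "compare to modulus" step just above is the standard Poly1305 freeze: compute g = h + 5 and, if that sum reaches 2^130 (the bit tested by tst $g4,#4), then h >= 2^130 - 5 and the reduced value is g with bit 130 dropped; otherwise h is already reduced. Below is a hedged scalar C sketch with illustrative names, holding the accumulator as two 64-bit limbs plus a small top limb, and using a mask in place of the it/movne predication used here.

#include <stdint.h>

/* Illustrative sketch: select h mod (2^130 - 5), assuming h < 2*(2^130 - 5),
 * with h[0],h[1] the low 128 bits and h[2] the few bits above them. */
void poly1305_final_select(uint64_t h[3])
{
	uint64_t g0, g1, g2, mask;

	g0 = h[0] + 5;               /* g = h + 5 */
	g1 = h[1] + (g0 < h[0]);     /* propagate the carry ... */
	g2 = h[2] + (g1 < h[1]);     /* ... into the top limb */

	/* all-ones if bit 130 of g is set, i.e. h >= 2^130 - 5 */
	mask = 0 - ((g2 >> 2) & 1);

	h[0] = (g0 & mask) | (h[0] & ~mask);
	h[1] = (g1 & mask) | (h[1] & ~mask);
	h[2] = ((g2 & 3) & mask) | (h[2] & ~mask);
}

The nonce is then added modulo 2^128, so only the low 128 bits end up in the tag.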
+ + it ne + movne $h0,$g0 + ldr $g0,[$nonce,#0] + it ne + movne $h1,$g1 + ldr $g1,[$nonce,#4] + it ne + movne $h2,$g2 + ldr $g2,[$nonce,#8] + it ne + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 @ accumulate nonce + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] @ store the result + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] + + ldmia sp!,{r4-r11} + ret @ bx lr +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +# ifndef __KERNEL__ +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +#endif +___ +} } +$code.=<<___; +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/src/crypto/zinc/poly1305/poly1305-arm64.S b/src/crypto/zinc/poly1305/poly1305-arm64.S deleted file mode 100644 index 5f4e7fb..0000000 --- a/src/crypto/zinc/poly1305/poly1305-arm64.S +++ /dev/null @@ -1,824 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#include -.text - -.align 5 -ENTRY(poly1305_init_arm) - cmp x1,xzr - stp xzr,xzr,[x0] // zero hash value - stp xzr,xzr,[x0,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - - ldp x7,x8,[x1] // load key - mov x9,#0xfffffffc0fffffff - movk x9,#0x0fff,lsl#48 -#ifdef __AARCH64EB__ - rev x7,x7 // flip bytes - rev x8,x8 -#endif - and x7,x7,x9 // &=0ffffffc0fffffff - and x9,x9,#-4 - and x8,x8,x9 // &=0ffffffc0ffffffc - stp x7,x8,[x0,#32] // save key value - -.Lno_key: - ret -ENDPROC(poly1305_init_arm) - -.align 5 -ENTRY(poly1305_blocks_arm) - ands x2,x2,#-16 - b.eq .Lno_data - - ldp x4,x5,[x0] // load hash value - ldp x7,x8,[x0,#32] // load key value - ldr x6,[x0,#16] - add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) - b .Loop - -.align 5 -.Loop: - ldp x10,x11,[x1],#16 // load input - sub x2,x2,#16 -#ifdef __AARCH64EB__ - rev x10,x10 - rev x11,x11 -#endif - adds x4,x4,x10 // accumulate input - adcs x5,x5,x11 - - mul x12,x4,x7 // h0*r0 - adc x6,x6,x3 - umulh x13,x4,x7 - - mul x10,x5,x9 // h1*5*r1 - umulh x11,x5,x9 - - adds x12,x12,x10 - mul x10,x4,x8 // h0*r1 - adc x13,x13,x11 - umulh x14,x4,x8 - - adds x13,x13,x10 - mul x10,x5,x7 // h1*r0 - adc x14,x14,xzr - umulh x11,x5,x7 - - adds x13,x13,x10 - mul x10,x6,x9 // h2*5*r1 - adc x14,x14,x11 - mul x11,x6,x7 // h2*r0 - - adds x13,x13,x10 - adc x14,x14,x11 - - and x10,x14,#-4 // final reduction - and x6,x14,#3 - add x10,x10,x14,lsr#2 - adds x4,x12,x10 - adcs x5,x13,xzr - adc x6,x6,xzr - - cbnz x2,.Loop - - stp x4,x5,[x0] // store hash value - str x6,[x0,#16] - -.Lno_data: - ret -ENDPROC(poly1305_blocks_arm) - -.align 5 -ENTRY(poly1305_emit_arm) - ldp x4,x5,[x0] // load hash base 2^64 - ldr x6,[x0,#16] - ldp x10,x11,[x2] // load nonce - - adds x12,x4,#5 // compare to modulus - adcs x13,x5,xzr - adc x14,x6,xzr - - tst x14,#-4 // 
see if it's carried/borrowed - - csel x4,x4,x12,eq - csel x5,x5,x13,eq - -#ifdef __AARCH64EB__ - ror x10,x10,#32 // flip nonce words - ror x11,x11,#32 -#endif - adds x4,x4,x10 // accumulate nonce - adc x5,x5,x11 -#ifdef __AARCH64EB__ - rev x4,x4 // flip output bytes - rev x5,x5 -#endif - stp x4,x5,[x1] // write result - - ret -ENDPROC(poly1305_emit_arm) - -.align 5 -__poly1305_mult: - mul x12,x4,x7 // h0*r0 - umulh x13,x4,x7 - - mul x10,x5,x9 // h1*5*r1 - umulh x11,x5,x9 - - adds x12,x12,x10 - mul x10,x4,x8 // h0*r1 - adc x13,x13,x11 - umulh x14,x4,x8 - - adds x13,x13,x10 - mul x10,x5,x7 // h1*r0 - adc x14,x14,xzr - umulh x11,x5,x7 - - adds x13,x13,x10 - mul x10,x6,x9 // h2*5*r1 - adc x14,x14,x11 - mul x11,x6,x7 // h2*r0 - - adds x13,x13,x10 - adc x14,x14,x11 - - and x10,x14,#-4 // final reduction - and x6,x14,#3 - add x10,x10,x14,lsr#2 - adds x4,x12,x10 - adcs x5,x13,xzr - adc x6,x6,xzr - - ret - -__poly1305_splat: - and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,x4,#26,#26 - extr x14,x5,x4,#52 - and x14,x14,#0x03ffffff - ubfx x15,x5,#14,#26 - extr x16,x6,x5,#40 - - str w12,[x0,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[x0,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[x0,#16*2] // s1 - str w14,[x0,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[x0,#16*4] // s2 - str w15,[x0,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[x0,#16*6] // s3 - str w16,[x0,#16*7] // r4 - str w15,[x0,#16*8] // s4 - - ret - -#ifdef CONFIG_KERNEL_MODE_NEON -.align 5 -ENTRY(poly1305_blocks_neon) - ldr x17,[x0,#24] - cmp x2,#128 - b.hs .Lblocks_neon - cbz x17,poly1305_blocks_arm - -.Lblocks_neon: - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - ands x2,x2,#-16 - b.eq .Lno_data_neon - - cbz x17,.Lbase2_64_neon - - ldp w10,w11,[x0] // load hash value base 2^26 - ldp w12,w13,[x0,#8] - ldr w14,[x0,#16] - - tst x2,#31 - b.eq .Leven_neon - - ldp x7,x8,[x0,#32] // load key value - - add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr x5,x12,#12 - adds x4,x4,x12,lsl#52 - add x5,x5,x13,lsl#14 - adc x5,x5,xzr - lsr x6,x14,#24 - adds x5,x5,x14,lsl#40 - adc x14,x6,xzr // can be partially reduced... - - ldp x12,x13,[x1],#16 // load input - sub x2,x2,#16 - add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) - - and x10,x14,#-4 // ... 
so reduce - and x6,x14,#3 - add x10,x10,x14,lsr#2 - adds x4,x4,x10 - adcs x5,x5,xzr - adc x6,x6,xzr - -#ifdef __AARCH64EB__ - rev x12,x12 - rev x13,x13 -#endif - adds x4,x4,x12 // accumulate input - adcs x5,x5,x13 - adc x6,x6,x3 - - bl __poly1305_mult - ldr x30,[sp,#8] - - cbz x3,.Lstore_base2_64_neon - - and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,x4,#26,#26 - extr x12,x5,x4,#52 - and x12,x12,#0x03ffffff - ubfx x13,x5,#14,#26 - extr x14,x6,x5,#40 - - cbnz x2,.Leven_neon - - stp w10,w11,[x0] // store hash value base 2^26 - stp w12,w13,[x0,#8] - str w14,[x0,#16] - b .Lno_data_neon - -.align 4 -.Lstore_base2_64_neon: - stp x4,x5,[x0] // store hash value base 2^64 - stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed - b .Lno_data_neon - -.align 4 -.Lbase2_64_neon: - ldp x7,x8,[x0,#32] // load key value - - ldp x4,x5,[x0] // load hash value base 2^64 - ldr x6,[x0,#16] - - tst x2,#31 - b.eq .Linit_neon - - ldp x12,x13,[x1],#16 // load input - sub x2,x2,#16 - add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __AARCH64EB__ - rev x12,x12 - rev x13,x13 -#endif - adds x4,x4,x12 // accumulate input - adcs x5,x5,x13 - adc x6,x6,x3 - - bl __poly1305_mult - -.Linit_neon: - and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,x4,#26,#26 - extr x12,x5,x4,#52 - and x12,x12,#0x03ffffff - ubfx x13,x5,#14,#26 - extr x14,x6,x5,#40 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - fmov d24,x10 - fmov d25,x11 - fmov d26,x12 - fmov d27,x13 - fmov d28,x14 - - ////////////////////////////////// initialize r^n table - mov x4,x7 // r^1 - add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) - mov x5,x8 - mov x6,xzr - add x0,x0,#48+12 - bl __poly1305_splat - - bl __poly1305_mult // r^2 - sub x0,x0,#4 - bl __poly1305_splat - - bl __poly1305_mult // r^3 - sub x0,x0,#4 - bl __poly1305_splat - - bl __poly1305_mult // r^4 - sub x0,x0,#4 - bl __poly1305_splat - ldr x30,[sp,#8] - - add x16,x1,#32 - adr x17,.Lzeros - subs x2,x2,#64 - csel x16,x17,x16,lo - - mov x4,#1 - str x4,[x0,#-24] // set is_base2_26 - sub x0,x0,#48 // restore original x0 - b .Ldo_neon - -.align 4 -.Leven_neon: - add x16,x1,#32 - adr x17,.Lzeros - subs x2,x2,#64 - csel x16,x17,x16,lo - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - fmov d24,x10 - fmov d25,x11 - fmov d26,x12 - fmov d27,x13 - fmov d28,x14 - -.Ldo_neon: - ldp x8,x12,[x16],#16 // inp[2:3] (or zero) - ldp x9,x13,[x16],#48 - - lsl x3,x3,#24 - add x15,x0,#48 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov d14,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,x3,x12,lsr#40 - add x13,x3,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov d15,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov d16,x8 - fmov d17,x10 - fmov d18,x12 - - ldp x8,x12,[x1],#16 // inp[0:1] - ldp x9,x13,[x1],#48 - - ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 - ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 - ld1 {v8.4s},[x15] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and 
x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov d9,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,x3,x12,lsr#40 - add x13,x3,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov d10,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi v31.2d,#-1 - fmov d11,x8 - fmov d12,x10 - fmov d13,x12 - ushr v31.2d,v31.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // ___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // ___________________/ ____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. - //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs x2,x2,#64 - umull v23.2d,v14.2s,v7.s[2] - csel x16,x17,x16,lo - umull v22.2d,v14.2s,v5.s[2] - umull v21.2d,v14.2s,v3.s[2] - ldp x8,x12,[x16],#16 // inp[2:3] (or zero) - umull v20.2d,v14.2s,v1.s[2] - ldp x9,x13,[x16],#48 - umull v19.2d,v14.2s,v0.s[2] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal v23.2d,v15.2s,v5.s[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal v22.2d,v15.2s,v3.s[2] - and x5,x9,#0x03ffffff - umlal v21.2d,v15.2s,v1.s[2] - ubfx x6,x8,#26,#26 - umlal v20.2d,v15.2s,v0.s[2] - ubfx x7,x9,#26,#26 - umlal v19.2d,v15.2s,v8.s[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal v23.2d,v16.2s,v3.s[2] - extr x8,x12,x8,#52 - umlal v22.2d,v16.2s,v1.s[2] - extr x9,x13,x9,#52 - umlal v21.2d,v16.2s,v0.s[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal v20.2d,v16.2s,v8.s[2] - fmov d14,x4 - umlal v19.2d,v16.2s,v6.s[2] - and x8,x8,#0x03ffffff - - umlal v23.2d,v17.2s,v1.s[2] - and x9,x9,#0x03ffffff - umlal v22.2d,v17.2s,v0.s[2] - ubfx x10,x12,#14,#26 - umlal v21.2d,v17.2s,v8.s[2] - ubfx x11,x13,#14,#26 - umlal v20.2d,v17.2s,v6.s[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal v19.2d,v17.2s,v4.s[2] - fmov d15,x6 - - add v11.2s,v11.2s,v26.2s - add x12,x3,x12,lsr#40 - umlal v23.2d,v18.2s,v0.s[2] - add x13,x3,x13,lsr#40 - umlal v22.2d,v18.2s,v8.s[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal v21.2d,v18.2s,v6.s[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal v20.2d,v18.2s,v4.s[2] - fmov d16,x8 - umlal v19.2d,v18.2s,v2.s[2] - fmov d17,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add v9.2s,v9.2s,v24.2s - fmov d18,x12 - umlal v22.2d,v11.2s,v1.s[0] - ldp x8,x12,[x1],#16 // inp[0:1] - umlal v19.2d,v11.2s,v6.s[0] - ldp x9,x13,[x1],#48 - umlal v23.2d,v11.2s,v3.s[0] - umlal v20.2d,v11.2s,v8.s[0] - umlal v21.2d,v11.2s,v0.s[0] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add v10.2s,v10.2s,v25.2s - umlal v22.2d,v9.2s,v5.s[0] - umlal v23.2d,v9.2s,v7.s[0] - and 
x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal v21.2d,v9.2s,v3.s[0] - and x5,x9,#0x03ffffff - umlal v19.2d,v9.2s,v0.s[0] - ubfx x6,x8,#26,#26 - umlal v20.2d,v9.2s,v1.s[0] - ubfx x7,x9,#26,#26 - - add v12.2s,v12.2s,v27.2s - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal v22.2d,v10.2s,v3.s[0] - extr x8,x12,x8,#52 - umlal v23.2d,v10.2s,v5.s[0] - extr x9,x13,x9,#52 - umlal v19.2d,v10.2s,v8.s[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal v21.2d,v10.2s,v1.s[0] - fmov d9,x4 - umlal v20.2d,v10.2s,v0.s[0] - and x8,x8,#0x03ffffff - - add v13.2s,v13.2s,v28.2s - and x9,x9,#0x03ffffff - umlal v22.2d,v12.2s,v0.s[0] - ubfx x10,x12,#14,#26 - umlal v19.2d,v12.2s,v4.s[0] - ubfx x11,x13,#14,#26 - umlal v23.2d,v12.2s,v1.s[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal v20.2d,v12.2s,v6.s[0] - fmov d10,x6 - umlal v21.2d,v12.2s,v8.s[0] - add x12,x3,x12,lsr#40 - - umlal v22.2d,v13.2s,v8.s[0] - add x13,x3,x13,lsr#40 - umlal v19.2d,v13.2s,v2.s[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal v23.2d,v13.2s,v0.s[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal v20.2d,v13.2s,v4.s[0] - fmov d11,x8 - umlal v21.2d,v13.2s,v6.s[0] - fmov d12,x10 - fmov d13,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr v29.2d,v22.2d,#26 - xtn v27.2s,v22.2d - ushr v30.2d,v19.2d,#26 - and v19.16b,v19.16b,v31.16b - add v23.2d,v23.2d,v29.2d // h3 -> h4 - bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff - add v20.2d,v20.2d,v30.2d // h0 -> h1 - - ushr v29.2d,v23.2d,#26 - xtn v28.2s,v23.2d - ushr v30.2d,v20.2d,#26 - xtn v25.2s,v20.2d - bic v28.2s,#0xfc,lsl#24 - add v21.2d,v21.2d,v30.2d // h1 -> h2 - - add v19.2d,v19.2d,v29.2d - shl v29.2d,v29.2d,#2 - shrn v30.2s,v21.2d,#26 - xtn v26.2s,v21.2d - add v19.2d,v19.2d,v29.2d // h4 -> h0 - bic v25.2s,#0xfc,lsl#24 - add v27.2s,v27.2s,v30.2s // h2 -> h3 - bic v26.2s,#0xfc,lsl#24 - - shrn v29.2s,v19.2d,#26 - xtn v24.2s,v19.2d - ushr v30.2s,v27.2s,#26 - bic v27.2s,#0xfc,lsl#24 - bic v24.2s,#0xfc,lsl#24 - add v25.2s,v25.2s,v29.2s // h0 -> h1 - add v28.2s,v28.2s,v30.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup v16.2d,v16.d[0] - add v11.2s,v11.2s,v26.2s - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds x2,x2,#32 - b.ne .Long_tail - - dup v16.2d,v11.d[0] - add v14.2s,v9.2s,v24.2s - add v17.2s,v12.2s,v27.2s - add v15.2s,v10.2s,v25.2s - add v18.2s,v13.2s,v28.2s - -.Long_tail: - dup v14.2d,v14.d[0] - umull2 v19.2d,v16.4s,v6.4s - umull2 v22.2d,v16.4s,v1.4s - umull2 v23.2d,v16.4s,v3.4s - umull2 v21.2d,v16.4s,v0.4s - umull2 v20.2d,v16.4s,v8.4s - - dup v15.2d,v15.d[0] - umlal2 v19.2d,v14.4s,v0.4s - umlal2 v21.2d,v14.4s,v3.4s - umlal2 v22.2d,v14.4s,v5.4s - umlal2 v23.2d,v14.4s,v7.4s - umlal2 v20.2d,v14.4s,v1.4s - - dup v17.2d,v17.d[0] - umlal2 v19.2d,v15.4s,v8.4s - umlal2 v22.2d,v15.4s,v3.4s - umlal2 v21.2d,v15.4s,v1.4s - umlal2 v23.2d,v15.4s,v5.4s - umlal2 v20.2d,v15.4s,v0.4s - - dup v18.2d,v18.d[0] - umlal2 v22.2d,v17.4s,v0.4s - umlal2 v23.2d,v17.4s,v1.4s - umlal2 v19.2d,v17.4s,v4.4s - umlal2 v20.2d,v17.4s,v6.4s - umlal2 v21.2d,v17.4s,v8.4s - - umlal2 v22.2d,v18.4s,v8.4s - umlal2 v19.2d,v18.4s,v2.4s - umlal2 v23.2d,v18.4s,v0.4s - umlal2 v20.2d,v18.4s,v4.4s - umlal2 v21.2d,v18.4s,v6.4s - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - 
add v9.2s,v9.2s,v24.2s - umlal v22.2d,v11.2s,v1.2s - umlal v19.2d,v11.2s,v6.2s - umlal v23.2d,v11.2s,v3.2s - umlal v20.2d,v11.2s,v8.2s - umlal v21.2d,v11.2s,v0.2s - - add v10.2s,v10.2s,v25.2s - umlal v22.2d,v9.2s,v5.2s - umlal v19.2d,v9.2s,v0.2s - umlal v23.2d,v9.2s,v7.2s - umlal v20.2d,v9.2s,v1.2s - umlal v21.2d,v9.2s,v3.2s - - add v12.2s,v12.2s,v27.2s - umlal v22.2d,v10.2s,v3.2s - umlal v19.2d,v10.2s,v8.2s - umlal v23.2d,v10.2s,v5.2s - umlal v20.2d,v10.2s,v0.2s - umlal v21.2d,v10.2s,v1.2s - - add v13.2s,v13.2s,v28.2s - umlal v22.2d,v12.2s,v0.2s - umlal v19.2d,v12.2s,v4.2s - umlal v23.2d,v12.2s,v1.2s - umlal v20.2d,v12.2s,v6.2s - umlal v21.2d,v12.2s,v8.2s - - umlal v22.2d,v13.2s,v8.2s - umlal v19.2d,v13.2s,v2.2s - umlal v23.2d,v13.2s,v0.2s - umlal v20.2d,v13.2s,v4.2s - umlal v21.2d,v13.2s,v6.2s - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp v22.2d,v22.2d,v22.2d - ldp d8,d9,[sp,#16] // meet ABI requirements - addp v19.2d,v19.2d,v19.2d - ldp d10,d11,[sp,#32] - addp v23.2d,v23.2d,v23.2d - ldp d12,d13,[sp,#48] - addp v20.2d,v20.2d,v20.2d - ldp d14,d15,[sp,#64] - addp v21.2d,v21.2d,v21.2d - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr v29.2d,v22.2d,#26 - and v22.16b,v22.16b,v31.16b - ushr v30.2d,v19.2d,#26 - and v19.16b,v19.16b,v31.16b - - add v23.2d,v23.2d,v29.2d // h3 -> h4 - add v20.2d,v20.2d,v30.2d // h0 -> h1 - - ushr v29.2d,v23.2d,#26 - and v23.16b,v23.16b,v31.16b - ushr v30.2d,v20.2d,#26 - and v20.16b,v20.16b,v31.16b - add v21.2d,v21.2d,v30.2d // h1 -> h2 - - add v19.2d,v19.2d,v29.2d - shl v29.2d,v29.2d,#2 - ushr v30.2d,v21.2d,#26 - and v21.16b,v21.16b,v31.16b - add v19.2d,v19.2d,v29.2d // h4 -> h0 - add v22.2d,v22.2d,v30.2d // h2 -> h3 - - ushr v29.2d,v19.2d,#26 - and v19.16b,v19.16b,v31.16b - ushr v30.2d,v22.2d,#26 - and v22.16b,v22.16b,v31.16b - add v20.2d,v20.2d,v29.2d // h0 -> h1 - add v23.2d,v23.2d,v30.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 - st1 {v23.s}[0],[x0] - -.Lno_data_neon: - ldr x29,[sp],#80 - ret -ENDPROC(poly1305_blocks_neon) - -.align 5 -ENTRY(poly1305_emit_neon) - ldr x17,[x0,#24] - cbz x17,poly1305_emit_arm - - ldp w10,w11,[x0] // load hash value base 2^26 - ldp w12,w13,[x0,#8] - ldr w14,[x0,#16] - - add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr x5,x12,#12 - adds x4,x4,x12,lsl#52 - add x5,x5,x13,lsl#14 - adc x5,x5,xzr - lsr x6,x14,#24 - adds x5,x5,x14,lsl#40 - adc x6,x6,xzr // can be partially reduced... - - ldp x10,x11,[x2] // load nonce - - and x12,x6,#-4 // ... 
so reduce - add x12,x12,x6,lsr#2 - and x6,x6,#3 - adds x4,x4,x12 - adcs x5,x5,xzr - adc x6,x6,xzr - - adds x12,x4,#5 // compare to modulus - adcs x13,x5,xzr - adc x14,x6,xzr - - tst x14,#-4 // see if it's carried/borrowed - - csel x4,x4,x12,eq - csel x5,x5,x13,eq - -#ifdef __AARCH64EB__ - ror x10,x10,#32 // flip nonce words - ror x11,x11,#32 -#endif - adds x4,x4,x10 // accumulate nonce - adc x5,x5,x11 -#ifdef __AARCH64EB__ - rev x4,x4 // flip output bytes - rev x5,x5 -#endif - stp x4,x5,[x1] // write result - - ret -ENDPROC(poly1305_emit_neon) - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -#endif diff --git a/src/crypto/zinc/poly1305/poly1305-arm64.pl b/src/crypto/zinc/poly1305/poly1305-arm64.pl new file mode 100644 index 0000000..cf0ce9d --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-arm64.pl @@ -0,0 +1,972 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. 
measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#else +# define poly1305_init poly1305_init_arm +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arm +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +.globl poly1305_emit_neon +#endif + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw $t1,.LOPENSSL_armcap_P +# else + ldr $t1,.LOPENSSL_armcap_P +# endif + adr $t0,.LOPENSSL_armcap_P + ldr w17,[$t0,$t1] +#endif + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + stp $r0,$r1,[$ctx,#32] // save key value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr $d0,poly1305_blocks + adr $r0,poly1305_blocks_neon + adr $d1,poly1305_emit + adr $r1,poly1305_emit_neon + + csel $d0,$d0,$r0,eq + csel $d1,$d1,$r1,eq + +# ifdef __ILP32__ + stp w12,w13,[$len] +# else + stp $d0,$d1,[$len] +# endif + + mov x0,#1 +#else + mov x0,#0 +#endif +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $r0,$r1,[$ctx,#32] // load key value + ldr $h2,[$ctx,#16] + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + b .Loop + +.align 5 +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __AARCH64EB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + str $h2,[$ctx,#16] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldr $h2,[$ctx,#16] + ldp $t0,$t1,[$nonce] // 
load nonce + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type __poly1305_mult,%function +.align 5 +__poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size __poly1305_mult,.-__poly1305_mult + +.type __poly1305_splat,%function +.align 5 +__poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size __poly1305_splat,.-__poly1305_splat + +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.hs .Lblocks_neon + cbz $is_base2_26,poly1305_blocks + +.Lblocks_neon: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + ands $len,$len,#-16 + b.eq .Lno_data_neon + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + + and $t0,$d2,#-4 // ... 
so reduce + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$h0,$t0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl __poly1305_mult + ldr x30,[sp,#8] + + cbz $padbit,.Lstore_base2_64_neon + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cbnz $len,.Leven_neon + + stp w10,w11,[$ctx] // store hash value base 2^26 + stp w12,w13,[$ctx,#8] + str w14,[$ctx,#16] + b .Lno_data_neon + +.align 4 +.Lstore_base2_64_neon: + stp $h0,$h1,[$ctx] // store hash value base 2^64 + stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed + b .Lno_data_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl __poly1305_mult + +.Linit_neon: + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl __poly1305_splat + + bl __poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl __poly1305_splat + + bl __poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl __poly1305_splat + + bl __poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl __poly1305_splat + ldr x30,[sp,#8] + + add $in2,$inp,#32 + adr $zeros,.Lzeros + subs $len,$len,#64 + csel $in2,$zeros,$in2,lo + + mov x4,#1 + str x4,[$ctx,#-24] // set is_base2_26 + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + add $in2,$inp,#32 + adr $zeros,.Lzeros + subs $len,$len,#64 + csel $in2,$zeros,$in2,lo + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + ldp x9,x13,[$in2],#48 + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 
{$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp 
x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 
$ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + st1 {$ACC4}[0],[$ctx] + +.Lno_data_neon: + ldr x29,[sp],#80 + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.type poly1305_emit_neon,%function +.align 5 +poly1305_emit_neon: + ldr $is_base2_26,[$ctx,#24] + cbz $is_base2_26,poly1305_emit + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $h2,$h2,xzr // can be partially reduced... + + ldp $t0,$t1,[$nonce] // load nonce + + and $d0,$h2,#-4 // ... 
so reduce + add $d0,$d0,$h2,lsr#2 + and $h2,$h2,#3 + adds $h0,$h0,$d0 + adcs $h1,$h1,xzr + adc $h2,$h2,xzr + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit_neon,.-poly1305_emit_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else +.quad OPENSSL_armcap_P-. +#endif +#endif +.align 2 +___ + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + + print $_,"\n"; +} +close STDOUT; diff --git a/src/crypto/zinc/poly1305/poly1305-mips64.S b/src/crypto/zinc/poly1305/poly1305-mips64.S deleted file mode 100644 index 272a86c..0000000 --- a/src/crypto/zinc/poly1305/poly1305-mips64.S +++ /dev/null @@ -1,360 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \ - defined(_MIPS_ARCH_MIPS64R6)) && !defined(_MIPS_ARCH_MIPS64R2) -#define _MIPS_ARCH_MIPS64R2 -#endif - -#ifdef __MIPSEB__ -#define MSB 0 -#define LSB 7 -#else -#define MSB 7 -#define LSB 0 -#endif - -#if defined(_MIPS_ARCH_MIPS64R6) -#define dmultu(rs,rt) -#define mflo(rd,rs,rt) dmulu rd,rs,rt -#define mfhi(rd,rs,rt) dmuhu rd,rs,rt -#else -#define dmultu(rs,rt) dmultu rs,rt -#define multu(rs,rt) multu rs,rt -#define mflo(rd,rs,rt) mflo rd -#define mfhi(rd,rs,rt) mfhi rd -#endif - -.text -.set noat -.set noreorder - -/* While most of the assembly in the kernel prefers ENTRY() and ENDPROC(), - * there is no existing MIPS assembly that uses it, and MIPS assembler seems - * to like its own .ent/.end notation, which the MIPS include files don't - * provide in a MIPS-specific ENTRY/ENDPROC definition. So, we skip these - * for now, until somebody complains. 
*/ - -.align 5 -.globl poly1305_init_mips -.ent poly1305_init_mips -poly1305_init_mips: - .frame $29,0,$31 - .set reorder - - sd $0,0($4) - sd $0,8($4) - sd $0,16($4) - - beqz $5,.Lno_key - -#if defined(_MIPS_ARCH_MIPS64R6) - ld $8,0($5) - ld $9,8($5) -#else - ldl $8,0+MSB($5) - ldl $9,8+MSB($5) - ldr $8,0+LSB($5) - ldr $9,8+LSB($5) -#endif -#ifdef __MIPSEB__ -#if defined(_MIPS_ARCH_MIPS64R2) - dsbh $8,$8 # byte swap - dsbh $9,$9 - dshd $8,$8 - dshd $9,$9 -#else - ori $10,$0,0xFF - dsll $1,$10,32 - or $10,$1 # 0x000000FF000000FF - - and $11,$8,$10 # byte swap - and $2,$9,$10 - dsrl $1,$8,24 - dsrl $24,$9,24 - dsll $11,24 - dsll $2,24 - and $1,$10 - and $24,$10 - dsll $10,8 # 0x0000FF000000FF00 - or $11,$1 - or $2,$24 - and $1,$8,$10 - and $24,$9,$10 - dsrl $8,8 - dsrl $9,8 - dsll $1,8 - dsll $24,8 - and $8,$10 - and $9,$10 - or $11,$1 - or $2,$24 - or $8,$11 - or $9,$2 - dsrl $11,$8,32 - dsrl $2,$9,32 - dsll $8,32 - dsll $9,32 - or $8,$11 - or $9,$2 -#endif -#endif - li $10,1 - dsll $10,32 - daddiu $10,-63 - dsll $10,28 - daddiu $10,-1 # 0ffffffc0fffffff - - and $8,$10 - daddiu $10,-3 # 0ffffffc0ffffffc - and $9,$10 - - sd $8,24($4) - dsrl $10,$9,2 - sd $9,32($4) - daddu $10,$9 # s1 = r1 + (r1 >> 2) - sd $10,40($4) - -.Lno_key: - li $2,0 # return 0 - jr $31 -.end poly1305_init_mips - -.align 5 -.globl poly1305_blocks_mips -.ent poly1305_blocks_mips -poly1305_blocks_mips: - .set noreorder - dsrl $6,4 # number of complete blocks - bnez $6,poly1305_blocks_internal - nop - jr $31 - nop -.end poly1305_blocks_mips - -.align 5 -.ent poly1305_blocks_internal -poly1305_blocks_internal: - .frame $29,6*8,$31 - .mask 0x00030000,-8 - .set noreorder - dsubu $29,6*8 - sd $17,40($29) - sd $16,32($29) - .set reorder - - ld $12,0($4) # load hash value - ld $13,8($4) - ld $14,16($4) - - ld $15,24($4) # load key - ld $16,32($4) - ld $17,40($4) - -.Loop: -#if defined(_MIPS_ARCH_MIPS64R6) - ld $8,0($5) # load input - ld $9,8($5) -#else - ldl $8,0+MSB($5) # load input - ldl $9,8+MSB($5) - ldr $8,0+LSB($5) - ldr $9,8+LSB($5) -#endif - daddiu $6,-1 - daddiu $5,16 -#ifdef __MIPSEB__ -#if defined(_MIPS_ARCH_MIPS64R2) - dsbh $8,$8 # byte swap - dsbh $9,$9 - dshd $8,$8 - dshd $9,$9 -#else - ori $10,$0,0xFF - dsll $1,$10,32 - or $10,$1 # 0x000000FF000000FF - - and $11,$8,$10 # byte swap - and $2,$9,$10 - dsrl $1,$8,24 - dsrl $24,$9,24 - dsll $11,24 - dsll $2,24 - and $1,$10 - and $24,$10 - dsll $10,8 # 0x0000FF000000FF00 - or $11,$1 - or $2,$24 - and $1,$8,$10 - and $24,$9,$10 - dsrl $8,8 - dsrl $9,8 - dsll $1,8 - dsll $24,8 - and $8,$10 - and $9,$10 - or $11,$1 - or $2,$24 - or $8,$11 - or $9,$2 - dsrl $11,$8,32 - dsrl $2,$9,32 - dsll $8,32 - dsll $9,32 - or $8,$11 - or $9,$2 -#endif -#endif - daddu $12,$8 # accumulate input - daddu $13,$9 - sltu $10,$12,$8 - sltu $11,$13,$9 - daddu $13,$10 - - dmultu ($15,$12) # h0*r0 - daddu $14,$7 - sltu $10,$13,$10 - mflo ($8,$15,$12) - mfhi ($9,$15,$12) - - dmultu ($17,$13) # h1*5*r1 - daddu $10,$11 - daddu $14,$10 - mflo ($10,$17,$13) - mfhi ($11,$17,$13) - - dmultu ($16,$12) # h0*r1 - daddu $8,$10 - daddu $9,$11 - mflo ($1,$16,$12) - mfhi ($25,$16,$12) - sltu $10,$8,$10 - daddu $9,$10 - - dmultu ($15,$13) # h1*r0 - daddu $9,$1 - sltu $1,$9,$1 - mflo ($10,$15,$13) - mfhi ($11,$15,$13) - daddu $25,$1 - - dmultu ($17,$14) # h2*5*r1 - daddu $9,$10 - daddu $25,$11 - mflo ($1,$17,$14) - - dmultu ($15,$14) # h2*r0 - sltu $10,$9,$10 - daddu $25,$10 - mflo ($2,$15,$14) - - daddu $9,$1 - daddu $25,$2 - sltu $1,$9,$1 - daddu $25,$1 - - li $10,-4 # final reduction - and $10,$25 - dsrl 
$11,$25,2 - andi $14,$25,3 - daddu $10,$11 - daddu $12,$8,$10 - sltu $10,$12,$10 - daddu $13,$9,$10 - sltu $10,$13,$10 - daddu $14,$14,$10 - - bnez $6,.Loop - - sd $12,0($4) # store hash value - sd $13,8($4) - sd $14,16($4) - - .set noreorder - ld $17,40($29) # epilogue - ld $16,32($29) - jr $31 - daddu $29,6*8 -.end poly1305_blocks_internal - -.align 5 -.globl poly1305_emit_mips -.ent poly1305_emit_mips -poly1305_emit_mips: - .frame $29,0,$31 - .set reorder - - ld $10,0($4) - ld $11,8($4) - ld $1,16($4) - - daddiu $8,$10,5 # compare to modulus - sltiu $2,$8,5 - daddu $9,$11,$2 - sltu $2,$9,$2 - daddu $1,$1,$2 - - dsrl $1,2 # see if it carried/borrowed - dsubu $1,$0,$1 - nor $2,$0,$1 - - and $8,$1 - and $10,$2 - and $9,$1 - and $11,$2 - or $8,$10 - or $9,$11 - - lwu $10,0($6) # load nonce - lwu $11,4($6) - lwu $1,8($6) - lwu $2,12($6) - dsll $11,32 - dsll $2,32 - or $10,$11 - or $1,$2 - - daddu $8,$10 # accumulate nonce - daddu $9,$1 - sltu $10,$8,$10 - daddu $9,$10 - - dsrl $10,$8,8 # write mac value - dsrl $11,$8,16 - dsrl $1,$8,24 - sb $8,0($5) - dsrl $2,$8,32 - sb $10,1($5) - dsrl $10,$8,40 - sb $11,2($5) - dsrl $11,$8,48 - sb $1,3($5) - dsrl $1,$8,56 - sb $2,4($5) - dsrl $2,$9,8 - sb $10,5($5) - dsrl $10,$9,16 - sb $11,6($5) - dsrl $11,$9,24 - sb $1,7($5) - - sb $9,8($5) - dsrl $1,$9,32 - sb $2,9($5) - dsrl $2,$9,40 - sb $10,10($5) - dsrl $10,$9,48 - sb $11,11($5) - dsrl $11,$9,56 - sb $1,12($5) - sb $2,13($5) - sb $10,14($5) - sb $11,15($5) - - jr $31 -.end poly1305_emit_mips diff --git a/src/crypto/zinc/poly1305/poly1305-mips64.pl b/src/crypto/zinc/poly1305/poly1305-mips64.pl new file mode 100644 index 0000000..d30a03d --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-mips64.pl @@ -0,0 +1,467 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Poly1305 hash for MIPS64. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 5.64/+120% (big-endian) +# Octeon II 3.80/+280% (little-endian) + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. 
Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +die "MIPS64 only" unless ($flavour =~ /64|n32/i); + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_init_mips +# define poly1305_blocks poly1305_blocks_mips +# define poly1305_emit poly1305_emit_mips +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) + ld $in1,8($inp) +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 + daddiu $tmp0,-63 + dsll $tmp0,28 + daddiu $tmp0,-1 # 0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); + 
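
For reference, the key setup performed by the poly1305_init routines above can be summarized in a short C sketch. This is a sketch, not part of the patch: the struct layout, field names and function name are assumptions chosen only to mirror the MIPS64 context slots filled above (hash zeroed at offsets 0/8/16, clamped r0/r1 stored at 24/32, s1 at 40), and a little-endian host is assumed where the assembly byte-swaps on big-endian.

    #include <stdint.h>
    #include <string.h>

    struct poly1305_sketch_state {
            uint64_t h[3];   /* hash value, zeroed at init (offsets 0/8/16) */
            uint64_t r[2];   /* clamped key limbs r0, r1 (offsets 24/32)    */
            uint64_t s1;     /* r1 + (r1 >> 2), stored at offset 40         */
    };

    static void poly1305_init_sketch(struct poly1305_sketch_state *st,
                                     const uint8_t key[32])
    {
            uint64_t r0, r1;

            memset(st->h, 0, sizeof(st->h));   /* zero hash value */

            memcpy(&r0, key, 8);               /* little-endian host assumed */
            memcpy(&r1, key + 8, 8);

            r0 &= 0x0ffffffc0fffffffULL;       /* &= 0ffffffc0fffffff */
            r1 &= 0x0ffffffc0ffffffcULL;       /* &= 0ffffffc0ffffffc */

            st->r[0] = r0;
            st->r[1] = r1;
            st->s1   = r1 + (r1 >> 2);         /* s1 = r1 + (r1 >> 2) */
    }

Because the clamp forces the low two bits of r1 to zero, s1 is exactly 5*(r1/4), which is what lets the block loop fold the 2^130 = 5 reduction into the "h1*5*r1" and "h2*5*r1" products without a separate multiplication by 5.
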
+$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + .set noreorder + dsubu $sp,6*8 + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $s1,40($ctx) + +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $len,-1 + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + daddu $h0,$in0 # accumulate input + daddu $h1,$in1 + sltu $tmp0,$h0,$in0 + sltu $tmp1,$h1,$in1 + daddu $h1,$tmp0 + + dmultu ($r0,$h0) # h0*r0 + daddu $h2,$padbit + sltu $tmp0,$h1,$tmp0 + mflo ($d0,$r0,$h0) + mfhi ($d1,$r0,$h0) + + dmultu ($s1,$h1) # h1*5*r1 + daddu $tmp0,$tmp1 + daddu $h2,$tmp0 + mflo ($tmp0,$s1,$h1) + mfhi ($tmp1,$s1,$h1) + + dmultu ($r1,$h0) # h0*r1 + daddu $d0,$tmp0 + daddu $d1,$tmp1 + mflo ($tmp2,$r1,$h0) + mfhi ($d2,$r1,$h0) + sltu $tmp0,$d0,$tmp0 + daddu $d1,$tmp0 + + dmultu ($r0,$h1) # h1*r0 + daddu $d1,$tmp2 + sltu $tmp2,$d1,$tmp2 + mflo ($tmp0,$r0,$h1) + mfhi ($tmp1,$r0,$h1) + daddu $d2,$tmp2 + + dmultu ($s1,$h2) # h2*5*r1 + daddu $d1,$tmp0 + daddu $d2,$tmp1 + mflo ($tmp2,$s1,$h2) + + dmultu ($r0,$h2) # h2*r0 + sltu $tmp0,$d1,$tmp0 + daddu $d2,$tmp0 + mflo ($tmp3,$r0,$h2) + + daddu $d1,$tmp2 + daddu $d2,$tmp3 + sltu $tmp2,$d1,$tmp2 + daddu $d2,$tmp2 + + li $tmp0,-4 # final reduction + and $tmp0,$d2 + dsrl $tmp1,$d2,2 + andi $h2,$d2,3 + daddu $tmp0,$tmp1 + daddu $h0,$d0,$tmp0 + sltu $tmp0,$h0,$tmp0 + daddu $h1,$d1,$tmp0 + sltu $tmp0,$h1,$tmp0 + daddu $h2,$h2,$tmp0 + + bnez $len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra + daddu $sp,6*8 +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + ld $tmp2,16($ctx) + + daddiu $in0,$tmp0,5 # compare to modulus + sltiu $tmp3,$in0,5 + daddu 
$in1,$tmp1,$tmp3 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + nor $tmp3,$zero,$tmp2 + + and $in0,$tmp2 + and $tmp0,$tmp3 + and $in1,$tmp2 + and $tmp1,$tmp3 + or $in0,$tmp0 + or $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.align 2 +___ +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; + diff --git a/src/tests/qemu/Makefile b/src/tests/qemu/Makefile index e511b69..e20509f 100644 --- a/src/tests/qemu/Makefile +++ b/src/tests/qemu/Makefile @@ -23,7 +23,7 @@ NR_CPUS ?= 4 MIRROR := https://download.wireguard.com/qemu-test/distfiles/ rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) -WIREGUARD_SOURCES := ../../Kbuild ../../Kconfig $(filter-out ../../tools/% ../../tests/%,$(call rwildcard,../../,*.c *.h *.S *.include)) +WIREGUARD_SOURCES := ../../Kbuild ../../Kconfig $(filter-out ../../tools/% ../../tests/%,$(call rwildcard,../../,*.c *.h *.S *.pl *.include)) TOOLS_SOURCES := $(wildcard ../../tools/*.c ../../tools/*.h ../../uapi/*.h ../../crypto/zinc/curve25519/curve25519-hacl64.c ../../crypto/zinc/curve25519/curve25519-fiat32.c) default: qemu -- cgit v1.2.3-59-g8ed1b
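
A closing sketch of the tag-generation path shared by poly1305_emit, poly1305_emit_neon and the MIPS64 poly1305_emit above may also help: add 5 to the accumulator, keep the sum only if it reached 2^130 (the "tst ...,#-4 / see if it's carried" test on ARM64, or the dsrl/dsubu/nor mask on MIPS), then accumulate the 128-bit nonce and write the low 128 bits. The C below is a sketch rather than the patch's code: the function name and parameter shapes are illustrative, a little-endian host is assumed, it takes h as already fully carried (as after the scalar block loop; the NEON emit path first folds a partial carry, the "... so reduce" step, before this comparison), and it branches where the assembly uses csel or mask-and-or sequences to stay branch-free.

    #include <stdint.h>
    #include <string.h>

    /* h[0..2] is the accumulator in base 2^64 (h0, h1, h2 as in the assembly). */
    static void poly1305_emit_sketch(const uint64_t h[3], uint8_t mac[16],
                                     const uint8_t nonce[16])
    {
            uint64_t h0 = h[0], h1 = h[1], h2 = h[2];
            uint64_t g0, g1, g2, c, n0, n1;

            g0 = h0 + 5;                  /* compare to modulus 2^130 - 5 */
            c  = g0 < h0;                 /* carry out of the low limb */
            g1 = h1 + c;
            c  = g1 < c;
            g2 = h2 + c;

            if (g2 & ~3ULL) {             /* like "tst #-4": h + 5 reached 2^130 */
                    h0 = g0;              /* so keep the reduced value */
                    h1 = g1;
            }

            memcpy(&n0, nonce, 8);        /* load nonce (little-endian host) */
            memcpy(&n1, nonce + 8, 8);

            h0 += n0;                     /* accumulate nonce */
            h1 += n1 + (h0 < n0);         /* result is taken mod 2^128 */

            memcpy(mac, &h0, 8);          /* write result */
            memcpy(mac + 8, &h1, 8);
    }

On a little-endian machine this should produce the same 16-byte tag as the routines above for an already-reduced h, which makes it a convenient cross-check when porting.
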