author     Jason A. Donenfeld <Jason@zx2c4.com>    2018-08-28 23:50:35 -0600
committer  Jason A. Donenfeld <Jason@zx2c4.com>    2018-09-03 23:52:11 -0600
commit     4a0e319af86c0d38304535293f6fc32fe436ef1d (patch)
tree       6fca1e89becf3ff1afdcec7b6bc725e256af5811 /src/crypto/zinc
parent     uapi: reformat (diff)
download   wireguard-monolithic-historical-4a0e319af86c0d38304535293f6fc32fe436ef1d.tar.xz
           wireguard-monolithic-historical-4a0e319af86c0d38304535293f6fc32fe436ef1d.zip
crypto: import zinc
Diffstat (limited to 'src/crypto/zinc')
-rw-r--r--  src/crypto/zinc/blake2s/blake2s-x86_64-glue.h          64
-rw-r--r--  src/crypto/zinc/blake2s/blake2s-x86_64.S              685
-rw-r--r--  src/crypto/zinc/blake2s/blake2s.c                     274
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm-glue.h           50
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm.S              1473
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm64.S            1942
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-mips-glue.h          28
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-mips.S              474
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-x86_64-glue.h       104
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-x86_64.S           2632
-rw-r--r--  src/crypto/zinc/chacha20/chacha20.c                   168
-rw-r--r--  src/crypto/zinc/chacha20poly1305.c                    333
-rw-r--r--  src/crypto/zinc/curve25519/curve25519-arm-glue.h       46
-rw-r--r--  src/crypto/zinc/curve25519/curve25519-arm.S          2095
-rw-r--r--  src/crypto/zinc/curve25519/curve25519-fiat32.h        862
-rw-r--r--  src/crypto/zinc/curve25519/curve25519-hacl64.h        785
-rw-r--r--  src/crypto/zinc/curve25519/curve25519-x86_64-glue.h    50
-rw-r--r--  src/crypto/zinc/curve25519/curve25519-x86_64.h       2333
-rw-r--r--  src/crypto/zinc/curve25519/curve25519.c                83
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-arm-glue.h           69
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-arm.S              1117
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-arm64.S             822
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-mips-glue.h          40
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-mips.S              417
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-mips64.S            357
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-x86_64-glue.h       111
-rw-r--r--  src/crypto/zinc/poly1305/poly1305-x86_64.S           2792
-rw-r--r--  src/crypto/zinc/poly1305/poly1305.c                   289
-rw-r--r--  src/crypto/zinc/selftest/blake2s.h                   2095
-rw-r--r--  src/crypto/zinc/selftest/chacha20poly1305.h          7848
-rw-r--r--  src/crypto/zinc/selftest/curve25519.h                1321
-rw-r--r--  src/crypto/zinc/selftest/poly1305.h                  1572
32 files changed, 33331 insertions, 0 deletions
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64-glue.h b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.h
new file mode 100644
index 0000000..9dadaa3
--- /dev/null
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/blake2s.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void blake2s_compress_avx(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+#endif
+
+static bool blake2s_use_avx __ro_after_init;
+static bool blake2s_use_avx512 __ro_after_init;
+
+void __init blake2s_fpu_init(void)
+{
+ blake2s_use_avx =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+ blake2s_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL);
+#endif
+}
+
+static inline bool blake2s_arch(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
+{
+#ifdef CONFIG_AS_AVX512
+ if (blake2s_use_avx512 && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ blake2s_compress_avx512(state, block, nblocks, inc);
+ kernel_fpu_end();
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_AVX
+ if (blake2s_use_avx && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ blake2s_compress_avx(state, block, nblocks, inc);
+ kernel_fpu_end();
+ return true;
+ }
+#endif
+ return false;
+}
+
+#define HAVE_BLAKE2S_ARCH_IMPLEMENTATION
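
A brief aside on how this glue is presumably wired up: the __init annotation on blake2s_fpu_init() and the __ro_after_init flags imply the CPU-feature probe runs exactly once, at load time, before any call reaches blake2s_arch(). A minimal sketch of that wiring, with mod_init() as a placeholder name that is not part of this commit:

/* Hypothetical module-init wiring, for illustration only: probe CPU
 * features once so the __ro_after_init booleans above are settled before
 * the first block is compressed.  mod_init() is a placeholder name. */
static int __init mod_init(void)
{
	blake2s_fpu_init();
	return 0;
}
module_init(mod_init);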
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64.S b/src/crypto/zinc/blake2s/blake2s-x86_64.S
new file mode 100644
index 0000000..ee683e4
--- /dev/null
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64.S
@@ -0,0 +1,685 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
+.align 32
+IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
+.section .rodata.cst16.ROR328, "aM", @progbits, 16
+.align 16
+ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
+#ifdef CONFIG_AS_AVX512
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
+.align 64
+SIGMA:
+.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
+.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
+.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
+.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
+.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
+.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
+.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
+.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
+.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
+.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
+#endif /* CONFIG_AS_AVX512 */
+
+.text
+#ifdef CONFIG_AS_AVX
+ENTRY(blake2s_compress_avx)
+ movl %ecx, %ecx
+ testq %rdx, %rdx
+ je .Lendofloop
+ .align 32
+.Lbeginofloop:
+ addq %rcx, 32(%rdi)
+ vmovdqu IV+16(%rip), %xmm1
+ vmovdqu (%rsi), %xmm4
+ vpxor 32(%rdi), %xmm1, %xmm1
+ vmovdqu 16(%rsi), %xmm3
+ vshufps $136, %xmm3, %xmm4, %xmm6
+ vmovdqa ROT16(%rip), %xmm7
+ vpaddd (%rdi), %xmm6, %xmm6
+ vpaddd 16(%rdi), %xmm6, %xmm6
+ vpxor %xmm6, %xmm1, %xmm1
+ vmovdqu IV(%rip), %xmm8
+ vpshufb %xmm7, %xmm1, %xmm1
+ vmovdqu 48(%rsi), %xmm5
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor 16(%rdi), %xmm8, %xmm9
+ vmovdqu 32(%rsi), %xmm2
+ vpblendw $12, %xmm3, %xmm5, %xmm13
+ vshufps $221, %xmm5, %xmm2, %xmm12
+ vpunpckhqdq %xmm2, %xmm4, %xmm14
+ vpslld $20, %xmm9, %xmm0
+ vpsrld $12, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm0
+ vshufps $221, %xmm3, %xmm4, %xmm9
+ vpaddd %xmm9, %xmm6, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vmovdqa ROR328(%rip), %xmm6
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm8, %xmm8
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vshufps $136, %xmm5, %xmm2, %xmm10
+ vpshufd $57, %xmm0, %xmm0
+ vpaddd %xmm10, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpaddd %xmm12, %xmm9, %xmm9
+ vpblendw $12, %xmm2, %xmm3, %xmm12
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm10
+ vpslld $20, %xmm10, %xmm0
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm0, %xmm10, %xmm0
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm8, %xmm8
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpslldq $4, %xmm5, %xmm10
+ vpblendw $240, %xmm10, %xmm12, %xmm12
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $147, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm12
+ vpaddd %xmm0, %xmm12, %xmm12
+ vpxor %xmm12, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm11
+ vpslld $20, %xmm11, %xmm9
+ vpsrld $12, %xmm11, %xmm11
+ vpxor %xmm9, %xmm11, %xmm0
+ vpshufd $8, %xmm2, %xmm9
+ vpblendw $192, %xmm5, %xmm3, %xmm11
+ vpblendw $240, %xmm11, %xmm9, %xmm9
+ vpshufd $177, %xmm9, %xmm9
+ vpaddd %xmm12, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm11
+ vpxor %xmm11, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm8
+ vpxor %xmm8, %xmm0, %xmm9
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm8, %xmm8
+ vpslld $25, %xmm9, %xmm0
+ vpsrld $7, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm0
+ vpslldq $4, %xmm3, %xmm9
+ vpblendw $48, %xmm9, %xmm2, %xmm9
+ vpblendw $240, %xmm9, %xmm4, %xmm9
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $177, %xmm9, %xmm9
+ vpaddd %xmm11, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm8, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpslld $20, %xmm0, %xmm8
+ vpsrld $12, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm0
+ vpunpckhdq %xmm3, %xmm4, %xmm8
+ vpblendw $12, %xmm10, %xmm8, %xmm12
+ vpshufd $177, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm11, %xmm11
+ vpslld $25, %xmm0, %xmm12
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm12, %xmm0, %xmm0
+ vpunpckhdq %xmm5, %xmm2, %xmm12
+ vpshufd $147, %xmm0, %xmm0
+ vpblendw $15, %xmm13, %xmm12, %xmm12
+ vpslldq $8, %xmm5, %xmm13
+ vpshufd $210, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpslld $20, %xmm0, %xmm12
+ vpsrld $12, %xmm0, %xmm0
+ vpxor %xmm12, %xmm0, %xmm0
+ vpunpckldq %xmm4, %xmm2, %xmm12
+ vpblendw $240, %xmm4, %xmm12, %xmm12
+ vpblendw $192, %xmm13, %xmm12, %xmm12
+ vpsrldq $12, %xmm3, %xmm13
+ vpaddd %xmm12, %xmm9, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm11, %xmm11
+ vpslld $25, %xmm0, %xmm12
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm12, %xmm0, %xmm0
+ vpblendw $60, %xmm2, %xmm4, %xmm12
+ vpblendw $3, %xmm13, %xmm12, %xmm12
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm12, %xmm12
+ vpaddd %xmm9, %xmm12, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm12
+ vpslld $20, %xmm12, %xmm13
+ vpsrld $12, %xmm12, %xmm0
+ vpblendw $51, %xmm3, %xmm4, %xmm12
+ vpxor %xmm13, %xmm0, %xmm0
+ vpblendw $192, %xmm10, %xmm12, %xmm10
+ vpslldq $8, %xmm2, %xmm12
+ vpshufd $27, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm11, %xmm11
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpunpckhdq %xmm2, %xmm8, %xmm10
+ vpshufd $147, %xmm0, %xmm0
+ vpblendw $12, %xmm5, %xmm10, %xmm10
+ vpshufd $210, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm11
+ vpxor %xmm11, %xmm0, %xmm10
+ vpslld $20, %xmm10, %xmm0
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm0, %xmm10, %xmm0
+ vpblendw $12, %xmm4, %xmm5, %xmm10
+ vpblendw $192, %xmm12, %xmm10, %xmm10
+ vpunpckldq %xmm2, %xmm4, %xmm12
+ vpshufd $135, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm9
+ vpaddd %xmm0, %xmm9, %xmm9
+ vpxor %xmm9, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm11, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpblendw $15, %xmm3, %xmm4, %xmm10
+ vpblendw $192, %xmm5, %xmm10, %xmm10
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $198, %xmm10, %xmm10
+ vpaddd %xmm9, %xmm10, %xmm10
+ vpaddd %xmm0, %xmm10, %xmm10
+ vpxor %xmm10, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm9
+ vpslld $20, %xmm9, %xmm0
+ vpsrld $12, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm0
+ vpunpckhdq %xmm2, %xmm3, %xmm9
+ vpunpcklqdq %xmm12, %xmm9, %xmm15
+ vpunpcklqdq %xmm12, %xmm8, %xmm12
+ vpblendw $15, %xmm5, %xmm8, %xmm8
+ vpaddd %xmm15, %xmm10, %xmm15
+ vpaddd %xmm0, %xmm15, %xmm15
+ vpxor %xmm15, %xmm1, %xmm1
+ vpshufd $141, %xmm8, %xmm8
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpshufd $57, %xmm1, %xmm1
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm10, %xmm0, %xmm0
+ vpunpcklqdq %xmm2, %xmm3, %xmm10
+ vpshufd $147, %xmm0, %xmm0
+ vpblendw $51, %xmm14, %xmm10, %xmm14
+ vpshufd $135, %xmm14, %xmm14
+ vpaddd %xmm15, %xmm14, %xmm14
+ vpaddd %xmm0, %xmm14, %xmm14
+ vpxor %xmm14, %xmm1, %xmm1
+ vpunpcklqdq %xmm3, %xmm4, %xmm15
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpslld $20, %xmm0, %xmm11
+ vpsrld $12, %xmm0, %xmm0
+ vpxor %xmm11, %xmm0, %xmm0
+ vpunpckhqdq %xmm5, %xmm3, %xmm11
+ vpblendw $51, %xmm15, %xmm11, %xmm11
+ vpunpckhqdq %xmm3, %xmm5, %xmm15
+ vpaddd %xmm11, %xmm14, %xmm11
+ vpaddd %xmm0, %xmm11, %xmm11
+ vpxor %xmm11, %xmm1, %xmm1
+ vpshufb %xmm6, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm0, %xmm0
+ vpshufd $147, %xmm1, %xmm1
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm0, %xmm14
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm14, %xmm0, %xmm14
+ vpunpckhqdq %xmm4, %xmm2, %xmm0
+ vpshufd $57, %xmm14, %xmm14
+ vpblendw $51, %xmm15, %xmm0, %xmm15
+ vpaddd %xmm15, %xmm11, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm1, %xmm1
+ vpshufb %xmm7, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm11
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm11, %xmm14, %xmm14
+ vpblendw $3, %xmm2, %xmm4, %xmm11
+ vpslldq $8, %xmm11, %xmm0
+ vpblendw $15, %xmm5, %xmm0, %xmm0
+ vpshufd $99, %xmm0, %xmm0
+ vpaddd %xmm15, %xmm0, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm1, %xmm0
+ vpaddd %xmm12, %xmm15, %xmm15
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm14, %xmm1
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm1, %xmm14, %xmm14
+ vpblendw $3, %xmm5, %xmm4, %xmm1
+ vpshufd $147, %xmm14, %xmm14
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm12
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpsrldq $4, %xmm2, %xmm12
+ vpblendw $60, %xmm12, %xmm1, %xmm1
+ vpaddd %xmm1, %xmm15, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpblendw $12, %xmm4, %xmm3, %xmm1
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm13
+ vpxor %xmm13, %xmm14, %xmm14
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $78, %xmm13, %xmm13
+ vpslld $25, %xmm14, %xmm12
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpsrldq $4, %xmm5, %xmm12
+ vpblendw $48, %xmm12, %xmm1, %xmm1
+ vpshufd $33, %xmm5, %xmm12
+ vpshufd $57, %xmm14, %xmm14
+ vpshufd $108, %xmm1, %xmm1
+ vpblendw $51, %xmm12, %xmm10, %xmm12
+ vpaddd %xmm15, %xmm1, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpaddd %xmm12, %xmm15, %xmm15
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm13, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm13
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm13, %xmm14, %xmm14
+ vpslldq $12, %xmm3, %xmm13
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm12
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpblendw $51, %xmm5, %xmm4, %xmm12
+ vpshufd $147, %xmm14, %xmm14
+ vpblendw $192, %xmm13, %xmm12, %xmm12
+ vpaddd %xmm12, %xmm15, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpsrldq $4, %xmm3, %xmm12
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm13
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm13, %xmm14, %xmm14
+ vpblendw $48, %xmm2, %xmm5, %xmm13
+ vpblendw $3, %xmm12, %xmm13, %xmm13
+ vpshufd $156, %xmm13, %xmm13
+ vpaddd %xmm15, %xmm13, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm13
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm13, %xmm14, %xmm14
+ vpunpcklqdq %xmm2, %xmm4, %xmm13
+ vpshufd $57, %xmm14, %xmm14
+ vpblendw $12, %xmm12, %xmm13, %xmm12
+ vpshufd $180, %xmm12, %xmm12
+ vpaddd %xmm15, %xmm12, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm12
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpunpckhqdq %xmm9, %xmm4, %xmm12
+ vpshufd $198, %xmm12, %xmm12
+ vpaddd %xmm15, %xmm12, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpaddd %xmm15, %xmm8, %xmm15
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $57, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm12
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm12, %xmm14, %xmm14
+ vpsrldq $4, %xmm4, %xmm12
+ vpshufd $147, %xmm14, %xmm14
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm15, %xmm0, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpslld $20, %xmm14, %xmm8
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm14, %xmm8, %xmm14
+ vpblendw $48, %xmm5, %xmm2, %xmm8
+ vpblendw $3, %xmm12, %xmm8, %xmm8
+ vpunpckhqdq %xmm5, %xmm4, %xmm12
+ vpshufd $75, %xmm8, %xmm8
+ vpblendw $60, %xmm10, %xmm12, %xmm10
+ vpaddd %xmm15, %xmm8, %xmm15
+ vpaddd %xmm14, %xmm15, %xmm15
+ vpxor %xmm0, %xmm15, %xmm0
+ vpshufd $45, %xmm10, %xmm10
+ vpshufb %xmm6, %xmm0, %xmm0
+ vpaddd %xmm15, %xmm10, %xmm15
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm1, %xmm14, %xmm14
+ vpshufd $147, %xmm0, %xmm0
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm14, %xmm8
+ vpsrld $7, %xmm14, %xmm14
+ vpxor %xmm14, %xmm8, %xmm8
+ vpshufd $57, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm15, %xmm15
+ vpxor %xmm0, %xmm15, %xmm0
+ vpshufb %xmm7, %xmm0, %xmm0
+ vpaddd %xmm0, %xmm1, %xmm1
+ vpxor %xmm8, %xmm1, %xmm8
+ vpslld $20, %xmm8, %xmm10
+ vpsrld $12, %xmm8, %xmm8
+ vpxor %xmm8, %xmm10, %xmm10
+ vpunpckldq %xmm3, %xmm4, %xmm8
+ vpunpcklqdq %xmm9, %xmm8, %xmm9
+ vpaddd %xmm9, %xmm15, %xmm9
+ vpaddd %xmm10, %xmm9, %xmm9
+ vpxor %xmm0, %xmm9, %xmm8
+ vpshufb %xmm6, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm1, %xmm10, %xmm10
+ vpshufd $57, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm10, %xmm12
+ vpsrld $7, %xmm10, %xmm10
+ vpxor %xmm10, %xmm12, %xmm10
+ vpblendw $48, %xmm4, %xmm3, %xmm12
+ vpshufd $147, %xmm10, %xmm0
+ vpunpckhdq %xmm5, %xmm3, %xmm10
+ vpshufd $78, %xmm12, %xmm12
+ vpunpcklqdq %xmm4, %xmm10, %xmm10
+ vpblendw $192, %xmm2, %xmm10, %xmm10
+ vpshufhw $78, %xmm10, %xmm10
+ vpaddd %xmm10, %xmm9, %xmm10
+ vpaddd %xmm0, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm8
+ vpshufb %xmm7, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm0, %xmm1, %xmm9
+ vpslld $20, %xmm9, %xmm0
+ vpsrld $12, %xmm9, %xmm9
+ vpxor %xmm9, %xmm0, %xmm0
+ vpunpckhdq %xmm5, %xmm4, %xmm9
+ vpblendw $240, %xmm9, %xmm2, %xmm13
+ vpshufd $39, %xmm13, %xmm13
+ vpaddd %xmm10, %xmm13, %xmm10
+ vpaddd %xmm0, %xmm10, %xmm10
+ vpxor %xmm8, %xmm10, %xmm8
+ vpblendw $12, %xmm4, %xmm2, %xmm13
+ vpshufb %xmm6, %xmm8, %xmm8
+ vpslldq $4, %xmm13, %xmm13
+ vpblendw $15, %xmm5, %xmm13, %xmm13
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm1, %xmm0, %xmm0
+ vpaddd %xmm13, %xmm10, %xmm13
+ vpshufd $147, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm0, %xmm14
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm0, %xmm14, %xmm14
+ vpshufd $57, %xmm14, %xmm14
+ vpaddd %xmm14, %xmm13, %xmm13
+ vpxor %xmm8, %xmm13, %xmm8
+ vpaddd %xmm13, %xmm12, %xmm12
+ vpshufb %xmm7, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm14, %xmm1, %xmm14
+ vpslld $20, %xmm14, %xmm10
+ vpsrld $12, %xmm14, %xmm14
+ vpxor %xmm14, %xmm10, %xmm10
+ vpaddd %xmm10, %xmm12, %xmm12
+ vpxor %xmm8, %xmm12, %xmm8
+ vpshufb %xmm6, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm1, %xmm10, %xmm0
+ vpshufd $57, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpslld $25, %xmm0, %xmm10
+ vpsrld $7, %xmm0, %xmm0
+ vpxor %xmm0, %xmm10, %xmm10
+ vpblendw $48, %xmm2, %xmm3, %xmm0
+ vpblendw $15, %xmm11, %xmm0, %xmm0
+ vpshufd $147, %xmm10, %xmm10
+ vpshufd $114, %xmm0, %xmm0
+ vpaddd %xmm12, %xmm0, %xmm0
+ vpaddd %xmm10, %xmm0, %xmm0
+ vpxor %xmm8, %xmm0, %xmm8
+ vpshufb %xmm7, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm1, %xmm1
+ vpxor %xmm10, %xmm1, %xmm10
+ vpslld $20, %xmm10, %xmm11
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm10, %xmm11, %xmm10
+ vpslldq $4, %xmm4, %xmm11
+ vpblendw $192, %xmm11, %xmm3, %xmm3
+ vpunpckldq %xmm5, %xmm4, %xmm4
+ vpshufd $99, %xmm3, %xmm3
+ vpaddd %xmm0, %xmm3, %xmm3
+ vpaddd %xmm10, %xmm3, %xmm3
+ vpxor %xmm8, %xmm3, %xmm11
+ vpunpckldq %xmm5, %xmm2, %xmm0
+ vpblendw $192, %xmm2, %xmm5, %xmm2
+ vpshufb %xmm6, %xmm11, %xmm11
+ vpunpckhqdq %xmm0, %xmm9, %xmm0
+ vpblendw $15, %xmm4, %xmm2, %xmm4
+ vpaddd %xmm11, %xmm1, %xmm1
+ vpxor %xmm1, %xmm10, %xmm10
+ vpshufd $147, %xmm11, %xmm11
+ vpshufd $201, %xmm0, %xmm0
+ vpslld $25, %xmm10, %xmm8
+ vpsrld $7, %xmm10, %xmm10
+ vpxor %xmm10, %xmm8, %xmm10
+ vpshufd $78, %xmm1, %xmm1
+ vpaddd %xmm3, %xmm0, %xmm0
+ vpshufd $27, %xmm4, %xmm4
+ vpshufd $57, %xmm10, %xmm10
+ vpaddd %xmm10, %xmm0, %xmm0
+ vpxor %xmm11, %xmm0, %xmm11
+ vpaddd %xmm0, %xmm4, %xmm0
+ vpshufb %xmm7, %xmm11, %xmm7
+ vpaddd %xmm7, %xmm1, %xmm1
+ vpxor %xmm10, %xmm1, %xmm10
+ vpslld $20, %xmm10, %xmm8
+ vpsrld $12, %xmm10, %xmm10
+ vpxor %xmm10, %xmm8, %xmm8
+ vpaddd %xmm8, %xmm0, %xmm0
+ vpxor %xmm7, %xmm0, %xmm7
+ vpshufb %xmm6, %xmm7, %xmm6
+ vpaddd %xmm6, %xmm1, %xmm1
+ vpxor %xmm1, %xmm8, %xmm8
+ vpshufd $78, %xmm1, %xmm1
+ vpshufd $57, %xmm6, %xmm6
+ vpslld $25, %xmm8, %xmm2
+ vpsrld $7, %xmm8, %xmm8
+ vpxor %xmm8, %xmm2, %xmm8
+ vpxor (%rdi), %xmm1, %xmm1
+ vpshufd $147, %xmm8, %xmm8
+ vpxor %xmm0, %xmm1, %xmm0
+ vmovups %xmm0, (%rdi)
+ vpxor 16(%rdi), %xmm8, %xmm0
+ vpxor %xmm6, %xmm0, %xmm6
+ vmovups %xmm6, 16(%rdi)
+ addq $64, %rsi
+ decq %rdx
+ jnz .Lbeginofloop
+.Lendofloop:
+ ret
+ENDPROC(blake2s_compress_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX512
+ENTRY(blake2s_compress_avx512)
+ vmovdqu (%rdi),%xmm0
+ vmovdqu 0x10(%rdi),%xmm1
+ vmovdqu 0x20(%rdi),%xmm4
+ vmovq %rcx,%xmm5
+ vmovdqa IV(%rip),%xmm14
+ vmovdqa IV+16(%rip),%xmm15
+ jmp .Lblake2s_compress_avx512_mainloop
+.align 32
+.Lblake2s_compress_avx512_mainloop:
+ vmovdqa %xmm0,%xmm10
+ vmovdqa %xmm1,%xmm11
+ vpaddq %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm14,%xmm2
+ vpxor %xmm15,%xmm4,%xmm3
+ vmovdqu (%rsi),%ymm6
+ vmovdqu 0x20(%rsi),%ymm7
+ addq $0x40,%rsi
+ leaq SIGMA(%rip),%rax
+ movb $0xa,%cl
+.Lblake2s_compress_avx512_roundloop:
+ addq $0x40,%rax
+ vmovdqa -0x40(%rax),%ymm8
+ vmovdqa -0x20(%rax),%ymm9
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm1,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpshufd $0x93,%xmm3,%xmm3
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm1,%xmm1
+ vpshufd $0x4e,%xmm2,%xmm2
+ vpshufd $0x39,%xmm3,%xmm3
+ decb %cl
+ jne .Lblake2s_compress_avx512_roundloop
+ vpxor %xmm10,%xmm0,%xmm0
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm2,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ decq %rdx
+ jne .Lblake2s_compress_avx512_mainloop
+ vmovdqu %xmm0,(%rdi)
+ vmovdqu %xmm1,0x10(%rdi)
+ vmovdqu %xmm4,0x20(%rdi)
+ vzeroupper
+ retq
+ENDPROC(blake2s_compress_avx512)
+#endif /* CONFIG_AS_AVX512 */
diff --git a/src/crypto/zinc/blake2s/blake2s.c b/src/crypto/zinc/blake2s/blake2s.c
new file mode 100644
index 0000000..76232a3
--- /dev/null
+++ b/src/crypto/zinc/blake2s/blake2s.c
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2012 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the BLAKE2s hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ *
+ */
+
+#include <zinc/blake2s.h>
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+typedef union {
+ struct {
+ u8 digest_length;
+ u8 key_length;
+ u8 fanout;
+ u8 depth;
+ u32 leaf_length;
+ u32 node_offset;
+ u16 xof_length;
+ u8 node_depth;
+ u8 inner_length;
+ u8 salt[8];
+ u8 personal[8];
+ };
+ __le32 words[8];
+} __packed blake2s_param;
+
+static const u32 blake2s_iv[8] = {
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const u8 blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_set_lastblock(struct blake2s_state *state)
+{
+ if (state->last_node)
+ state->f[1] = -1;
+ state->f[0] = -1;
+}
+
+static inline void blake2s_increment_counter(struct blake2s_state *state,
+ const u32 inc)
+{
+ state->t[0] += inc;
+ state->t[1] += (state->t[0] < inc);
+}
+
+static inline void blake2s_init_param(struct blake2s_state *state,
+ const blake2s_param *param)
+{
+ int i;
+
+ memset(state, 0, sizeof(struct blake2s_state));
+ for (i = 0; i < 8; ++i)
+ state->h[i] = blake2s_iv[i] ^ le32_to_cpu(param->words[i]);
+}
+
+void blake2s_init(struct blake2s_state *state, const size_t outlen)
+{
+ blake2s_param param __aligned(__alignof__(u32)) = {
+ .digest_length = outlen,
+ .fanout = 1,
+ .depth = 1
+ };
+
+#ifdef DEBUG
+ BUG_ON(!outlen || outlen > BLAKE2S_OUTBYTES);
+#endif
+ blake2s_init_param(state, &param);
+}
+EXPORT_SYMBOL(blake2s_init);
+
+void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
+ const void *key, const size_t keylen)
+{
+ blake2s_param param = { .digest_length = outlen,
+ .key_length = keylen,
+ .fanout = 1,
+ .depth = 1 };
+ u8 block[BLAKE2S_BLOCKBYTES] = { 0 };
+
+#ifdef DEBUG
+ BUG_ON(!outlen || outlen > BLAKE2S_OUTBYTES || !key || !keylen ||
+ keylen > BLAKE2S_KEYBYTES);
+#endif
+ blake2s_init_param(state, &param);
+ memcpy(block, key, keylen);
+ blake2s_update(state, block, BLAKE2S_BLOCKBYTES);
+ memzero_explicit(block, BLAKE2S_BLOCKBYTES);
+}
+EXPORT_SYMBOL(blake2s_init_key);
+
+#ifndef HAVE_BLAKE2S_ARCH_IMPLEMENTATION
+void __init blake2s_fpu_init(void)
+{
+}
+static inline bool blake2s_arch(struct blake2s_state *state, const u8 *block,
+ const size_t nblocks, const u32 inc)
+{
+ return false;
+}
+#endif
+
+static inline void blake2s_compress(struct blake2s_state *state,
+ const u8 *block, size_t nblocks,
+ const u32 inc)
+{
+ u32 m[16];
+ u32 v[16];
+ int i;
+
+#ifdef DEBUG
+ BUG_ON(nblocks > 1 && inc != BLAKE2S_BLOCKBYTES);
+#endif
+
+ if (blake2s_arch(state, block, nblocks, inc))
+ return;
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(state, inc);
+
+#ifdef __LITTLE_ENDIAN
+ memcpy(m, block, BLAKE2S_BLOCKBYTES);
+#else
+ for (i = 0; i < 16; ++i)
+ m[i] = get_unaligned_le32(block + i * sizeof(m[i]));
+#endif
+ memcpy(v, state->h, 32);
+ v[ 8] = blake2s_iv[0];
+ v[ 9] = blake2s_iv[1];
+ v[10] = blake2s_iv[2];
+ v[11] = blake2s_iv[3];
+ v[12] = blake2s_iv[4] ^ state->t[0];
+ v[13] = blake2s_iv[5] ^ state->t[1];
+ v[14] = blake2s_iv[6] ^ state->f[0];
+ v[15] = blake2s_iv[7] ^ state->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ state->h[i] ^= v[i] ^ v[i + 8];
+
+ block += BLAKE2S_BLOCKBYTES;
+ --nblocks;
+ }
+}
+
+void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2S_BLOCKBYTES - state->buflen;
+
+ if (unlikely(!inlen))
+ return;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCKBYTES);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCKBYTES) {
+ const size_t nblocks =
+ (inlen + BLAKE2S_BLOCKBYTES - 1) / BLAKE2S_BLOCKBYTES;
+ /* Hash one less (full) block than strictly possible */
+ blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCKBYTES);
+ in += BLAKE2S_BLOCKBYTES * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCKBYTES * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2s_update);
+
+void __blake2s_final(struct blake2s_state *state)
+{
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCKBYTES - state->buflen); /* Padding */
+ blake2s_compress(state, state->buf, 1, state->buflen);
+}
+EXPORT_SYMBOL(__blake2s_final);
+
+void blake2s_hmac(u8 *out, const u8 *in, const u8 *key, const size_t outlen,
+ const size_t inlen, const size_t keylen)
+{
+ struct blake2s_state state;
+ u8 x_key[BLAKE2S_BLOCKBYTES] __aligned(__alignof__(u32)) = { 0 };
+ u8 i_hash[BLAKE2S_OUTBYTES] __aligned(__alignof__(u32));
+ int i;
+
+ if (keylen > BLAKE2S_BLOCKBYTES) {
+ blake2s_init(&state, BLAKE2S_OUTBYTES);
+ blake2s_update(&state, key, keylen);
+ blake2s_final(&state, x_key, BLAKE2S_OUTBYTES);
+ } else
+ memcpy(x_key, key, keylen);
+
+ for (i = 0; i < BLAKE2S_BLOCKBYTES; ++i)
+ x_key[i] ^= 0x36;
+
+ blake2s_init(&state, BLAKE2S_OUTBYTES);
+ blake2s_update(&state, x_key, BLAKE2S_BLOCKBYTES);
+ blake2s_update(&state, in, inlen);
+ blake2s_final(&state, i_hash, BLAKE2S_OUTBYTES);
+
+ for (i = 0; i < BLAKE2S_BLOCKBYTES; ++i)
+ x_key[i] ^= 0x5c ^ 0x36;
+
+ blake2s_init(&state, BLAKE2S_OUTBYTES);
+ blake2s_update(&state, x_key, BLAKE2S_BLOCKBYTES);
+ blake2s_update(&state, i_hash, BLAKE2S_OUTBYTES);
+ blake2s_final(&state, i_hash, BLAKE2S_OUTBYTES);
+
+ memcpy(out, i_hash, outlen);
+ memzero_explicit(x_key, BLAKE2S_BLOCKBYTES);
+ memzero_explicit(i_hash, BLAKE2S_OUTBYTES);
+}
+EXPORT_SYMBOL(blake2s_hmac);
+
+#include "../selftest/blake2s.h"
diff --git a/src/crypto/zinc/chacha20/chacha20-arm-glue.h b/src/crypto/zinc/chacha20/chacha20-arm-glue.h
new file mode 100644
index 0000000..d323615
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-arm-glue.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && \
+ (defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_neon __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ chacha20_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+ if (simd_context == HAVE_FULL_SIMD && chacha20_use_neon) {
+ chacha20_neon(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+ chacha20_arm(dst, src, len, key, counter);
+ return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
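
For context, a hedged sketch of the caller side this glue anticipates: the generic entry point (presumably in chacha20.c, which includes this header) tries chacha20_arch() first and falls back to a portable implementation only when it returns false — which on ARM never happens, since chacha20_arm() handles every length. The chacha20_generic() name below is an assumption for illustration, not part of this diff:

/* Hypothetical dispatcher, for illustration only: chacha20_generic() is a
 * placeholder for the portable fallback used when no arch path applies. */
static void chacha20(u8 *dst, const u8 *src, const size_t len,
		     const u32 key[8], const u32 counter[4],
		     simd_context_t simd_context)
{
	if (!chacha20_arch(dst, src, len, key, counter, simd_context))
		chacha20_generic(dst, src, len, key, counter);
}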
diff --git a/src/crypto/zinc/chacha20/chacha20-arm.S b/src/crypto/zinc/chacha20/chacha20-arm.S
new file mode 100644
index 0000000..4b2090f
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-arm.S
@@ -0,0 +1,1473 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from OpenSSL.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+.word -1
+
+#if __LINUX_ARM_ARCH__ >= 7 && IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
+.arch armv7-a
+.fpu neon
+
+.align 5
+ENTRY(chacha20_neon)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_neon
+ cmp r2,#192 @ test len
+ bls .Lshort
+.Lchacha20_neon_begin:
+ adr r14,.Lsigma
+ vstmdb sp!,{d8-d15} @ ABI spec says so
+ stmdb sp!,{r0-r3}
+
+ vld1.32 {q1-q2},[r3] @ load key
+ ldmia r3,{r4-r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0-r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14] @ one
+ vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ vmov q5,q1
+ vmov q9,q1
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0-r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+.Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ vadd.i32 q11,q7,q12 @ counter+2
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ add r12,r12,#3 @ counter+3
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4
+ vadd.i32 q4,q4,q5
+ mov r12,r12,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r5
+ veor q3,q3,q0
+ mov r10,r10,ror#16
+ veor q7,q7,q4
+ eor r12,r12,r0,ror#16
+ veor q11,q11,q8
+ eor r10,r10,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r12
+ vrev32.16 q7,q7
+ mov r4,r4,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r10
+ vadd.i32 q2,q2,q3
+ mov r5,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r4,r4,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r5,r5,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r4
+ veor q13,q5,q6
+ mov r12,r12,ror#24
+ veor q14,q9,q10
+ add r1,r1,r5
+ vshr.u32 q1,q12,#20
+ mov r10,r10,ror#24
+ vshr.u32 q5,q13,#20
+ eor r12,r12,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r10,r10,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r12
+ vsli.32 q5,q13,#12
+ mov r4,r4,ror#25
+ vsli.32 q9,q14,#12
+ add r9,r9,r10
+ vadd.i32 q0,q0,q1
+ mov r5,r5,ror#25
+ vadd.i32 q4,q4,q5
+ str r10,[sp,#4*(16+13)]
+ vadd.i32 q8,q8,q9
+ ldr r10,[sp,#4*(16+15)]
+ veor q12,q3,q0
+ eor r4,r4,r8,ror#25
+ veor q13,q7,q4
+ eor r5,r5,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+10)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r6
+ vshr.u32 q11,q14,#24
+ mov r14,r14,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+11)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r7
+ vadd.i32 q2,q2,q3
+ mov r10,r10,ror#16
+ vadd.i32 q6,q6,q7
+ eor r14,r14,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r10,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r14
+ veor q13,q5,q6
+ mov r6,r6,ror#20
+ veor q14,q9,q10
+ add r9,r9,r10
+ vshr.u32 q1,q12,#25
+ mov r7,r7,ror#20
+ vshr.u32 q5,q13,#25
+ eor r6,r6,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r7,r7,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r6
+ vsli.32 q5,q13,#7
+ mov r14,r14,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r7
+ vext.8 q2,q2,q2,#8
+ mov r10,r10,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r14,r14,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r10,r10,r3,ror#24
+ vext.8 q1,q1,q1,#4
+ add r8,r8,r14
+ vext.8 q5,q5,q5,#4
+ mov r6,r6,ror#25
+ vext.8 q9,q9,q9,#4
+ add r9,r9,r10
+ vext.8 q3,q3,q3,#12
+ mov r7,r7,ror#25
+ vext.8 q7,q7,q7,#12
+ eor r6,r6,r8,ror#25
+ vext.8 q11,q11,q11,#12
+ eor r7,r7,r9,ror#25
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5
+ vadd.i32 q4,q4,q5
+ mov r10,r10,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r6
+ veor q3,q3,q0
+ mov r12,r12,ror#16
+ veor q7,q7,q4
+ eor r10,r10,r0,ror#16
+ veor q11,q11,q8
+ eor r12,r12,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r10
+ vrev32.16 q7,q7
+ mov r5,r5,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r12
+ vadd.i32 q2,q2,q3
+ mov r6,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r5,r5,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r6,r6,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r5
+ veor q13,q5,q6
+ mov r10,r10,ror#24
+ veor q14,q9,q10
+ add r1,r1,r6
+ vshr.u32 q1,q12,#20
+ mov r12,r12,ror#24
+ vshr.u32 q5,q13,#20
+ eor r10,r10,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r12,r12,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r10
+ vsli.32 q5,q13,#12
+ mov r5,r5,ror#25
+ vsli.32 q9,q14,#12
+ str r10,[sp,#4*(16+15)]
+ vadd.i32 q0,q0,q1
+ ldr r10,[sp,#4*(16+13)]
+ vadd.i32 q4,q4,q5
+ add r9,r9,r12
+ vadd.i32 q8,q8,q9
+ mov r6,r6,ror#25
+ veor q12,q3,q0
+ eor r5,r5,r8,ror#25
+ veor q13,q7,q4
+ eor r6,r6,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+8)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r7
+ vshr.u32 q11,q14,#24
+ mov r10,r10,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+9)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r4
+ vadd.i32 q2,q2,q3
+ mov r14,r14,ror#16
+ vadd.i32 q6,q6,q7
+ eor r10,r10,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r14,r14,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r10
+ veor q13,q5,q6
+ mov r7,r7,ror#20
+ veor q14,q9,q10
+ add r9,r9,r14
+ vshr.u32 q1,q12,#25
+ mov r4,r4,ror#20
+ vshr.u32 q5,q13,#25
+ eor r7,r7,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r4,r4,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r7
+ vsli.32 q5,q13,#7
+ mov r10,r10,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r4
+ vext.8 q2,q2,q2,#8
+ mov r14,r14,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r10,r10,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r14,r14,r3,ror#24
+ vext.8 q1,q1,q1,#12
+ add r8,r8,r10
+ vext.8 q5,q5,q5,#12
+ mov r7,r7,ror#25
+ vext.8 q9,q9,q9,#12
+ add r9,r9,r14
+ vext.8 q3,q3,q3,#4
+ mov r4,r4,ror#25
+ vext.8 q7,q7,q7,#4
+ eor r7,r7,r8,ror#25
+ vext.8 q11,q11,q11,#4
+ eor r4,r4,r9,ror#25
+ bne .Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12-q13},[sp] @ load key material
+ vld1.32 {q14-q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo .Ltail_neon
+
+ vld1.8 {q12-q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0-q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0-q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2-q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4-q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6-q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8-q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10-q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r5,r9
+ ldr r9,[r12,#-12]
+ add r6,r6,r10
+ ldr r10,[r12,#-8]
+ add r7,r7,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+#ifdef __thumb2__
+ it hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+#ifdef __thumb2__
+ it hi
+#endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r5,r9
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r6,r10
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r7,r11
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ eor r4,r4,r8
+#ifdef __thumb2__
+ it hi
+#endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8-d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0-q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2-q3},[r11]
+ mov r11,#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp r11,#64*3
+ bhs .L192_or_more_neon
+ cmp r11,#64*2
+ bhs .L128_or_more_neon
+ cmp r11,#64*1
+ bhs .L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0-q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2-q3},[r8]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0-q1},[r14]!
+ vst1.8 {q2-q3},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4-q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6-q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0-q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4-q5},[r14]!
+ vst1.8 {q6-q7},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8-q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10-q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0-q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2-q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4-q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6-q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8-q9},[r14]!
+ vst1.8 {q10-q11},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ stmia sp,{r0-r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r4,r4,#3 @ counter+3
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ stmia r8,{r0-r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8-d15}
+ add sp,sp,#4*(16+3)
+.Lno_data_neon:
+ ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)
+#endif
+
+.align 5
+.Lsigma2:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone2:
+.long 1,0,0,0
+.word -1
+
+.align 5
+ENTRY(chacha20_arm)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_arm
+.Lshort:
+ ldmia r12,{r4-r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
+ sub r14,pc,#100 @ .Lsigma2
+#else
+ adr r14,.Lsigma2 @ .Lsigma2
+#endif
+ stmdb sp!,{r4-r7} @ copy counter and nonce
+ ldmia r3,{r4-r11} @ load key
+ ldmia r14,{r0-r3} @ load sigma
+ stmdb sp!,{r4-r11} @ copy key
+ stmdb sp!,{r0-r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0-r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs r11,r11,#1
+ add r0,r0,r4
+ mov r12,r12,ror#16
+ add r1,r1,r5
+ mov r10,r10,ror#16
+ eor r12,r12,r0,ror#16
+ eor r10,r10,r1,ror#16
+ add r8,r8,r12
+ mov r4,r4,ror#20
+ add r9,r9,r10
+ mov r5,r5,ror#20
+ eor r4,r4,r8,ror#20
+ eor r5,r5,r9,ror#20
+ add r0,r0,r4
+ mov r12,r12,ror#24
+ add r1,r1,r5
+ mov r10,r10,ror#24
+ eor r12,r12,r0,ror#24
+ eor r10,r10,r1,ror#24
+ add r8,r8,r12
+ mov r4,r4,ror#25
+ add r9,r9,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+13)]
+ ldr r10,[sp,#4*(16+15)]
+ eor r4,r4,r8,ror#25
+ eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)]
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6
+ mov r14,r14,ror#16
+ str r9,[sp,#4*(16+9)]
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7
+ mov r10,r10,ror#16
+ eor r14,r14,r2,ror#16
+ eor r10,r10,r3,ror#16
+ add r8,r8,r14
+ mov r6,r6,ror#20
+ add r9,r9,r10
+ mov r7,r7,ror#20
+ eor r6,r6,r8,ror#20
+ eor r7,r7,r9,ror#20
+ add r2,r2,r6
+ mov r14,r14,ror#24
+ add r3,r3,r7
+ mov r10,r10,ror#24
+ eor r14,r14,r2,ror#24
+ eor r10,r10,r3,ror#24
+ add r8,r8,r14
+ mov r6,r6,ror#25
+ add r9,r9,r10
+ mov r7,r7,ror#25
+ eor r6,r6,r8,ror#25
+ eor r7,r7,r9,ror#25
+ add r0,r0,r5
+ mov r10,r10,ror#16
+ add r1,r1,r6
+ mov r12,r12,ror#16
+ eor r10,r10,r0,ror#16
+ eor r12,r12,r1,ror#16
+ add r8,r8,r10
+ mov r5,r5,ror#20
+ add r9,r9,r12
+ mov r6,r6,ror#20
+ eor r5,r5,r8,ror#20
+ eor r6,r6,r9,ror#20
+ add r0,r0,r5
+ mov r10,r10,ror#24
+ add r1,r1,r6
+ mov r12,r12,ror#24
+ eor r10,r10,r0,ror#24
+ eor r12,r12,r1,ror#24
+ add r8,r8,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+15)]
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12
+ mov r6,r6,ror#25
+ eor r5,r5,r8,ror#25
+ eor r6,r6,r9,ror#25
+ str r8,[sp,#4*(16+10)]
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7
+ mov r10,r10,ror#16
+ str r9,[sp,#4*(16+11)]
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4
+ mov r14,r14,ror#16
+ eor r10,r10,r2,ror#16
+ eor r14,r14,r3,ror#16
+ add r8,r8,r10
+ mov r7,r7,ror#20
+ add r9,r9,r14
+ mov r4,r4,ror#20
+ eor r7,r7,r8,ror#20
+ eor r4,r4,r9,ror#20
+ add r2,r2,r7
+ mov r10,r10,ror#24
+ add r3,r3,r4
+ mov r14,r14,ror#24
+ eor r10,r10,r2,ror#24
+ eor r14,r14,r3,ror#24
+ add r8,r8,r10
+ mov r7,r7,ror#25
+ add r9,r9,r14
+ mov r4,r4,ror#25
+ eor r7,r7,r8,ror#25
+ eor r4,r4,r9,ror#25
+ bne .Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ < 7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne .Lunaligned
+ cmp r11,#64 @ restore flags
+#else
+ ldr r10,[sp,#4*(2)]
+#endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+#ifdef __thumb2__
+ itt hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+#ifdef __thumb2__
+ itt hi
+#endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+#ifdef __thumb2__
+ it ne
+#endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+#ifdef __thumb2__
+ it hs
+#endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+#if __LINUX_ARM_ARCH__ < 7
+ b .Ltail
+
+.align 4
+.Lunaligned: @ unaligned endian-neutral path
+ cmp r11,#64 @ restore flags
+#endif
+#endif
+#if __LINUX_ARM_ARCH__ < 7
+ ldr r11,[sp,#4*(3)]
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8-r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+ add r6,r6,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8-r11} @ load key material
+ ldmia r0,{r0-r7} @ load second half
+#ifdef __thumb2__
+ itt hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8-r11} @ load key material
+ add r4,r4,r8 @ accumulate key material
+#ifdef __thumb2__
+ itt hi
+#endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r5,r9
+ add r6,r6,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+#ifdef __thumb2__
+ it ne
+#endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+#ifdef __thumb2__
+ it hs
+#endif
+ subhs r11,r8,#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+.Lno_data_arm:
+ ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)
diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.S b/src/crypto/zinc/chacha20/chacha20-arm64.S
new file mode 100644
index 0000000..a70df6b
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-arm64.S
@@ -0,0 +1,1942 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from OpenSSL.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+
+.align 5
+ENTRY(chacha20_arm)
+ cbz x2,.Labort
+.Lshort:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __ARMEB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer:
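+	// each outer iteration produces one 64-byte block using only general-purpose registers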
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+.Loop:
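+	// one column round plus one diagonal round per iteration; x4 = 10 gives 20 rounds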
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort:
+ ret
+
+.align 4
+.Ltail:
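+	// x2 went negative in the earlier subs; restore the actual remaining length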
+ add x2,x2,#64
+.Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+ENDPROC(chacha20_arm)
+
+.align 5
+ENTRY(chacha20_neon)
+ cbz x2,.Labort_neon
+ cmp x2,#192
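+	// fall back to the scalar path above for inputs shorter than 192 bytes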
+ b.lo .Lshort
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon:
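+	// 256 bytes per iteration: three blocks in NEON registers interleaved with one in general registers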
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+.Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo .Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+.Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
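+	// 512 bytes per iteration: six blocks in NEON registers interleaved with two in general registers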
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort_neon:
+ ret
+ENDPROC(chacha20_neon)
diff --git a/src/crypto/zinc/chacha20/chacha20-mips-glue.h b/src/crypto/zinc/chacha20/chacha20-mips-glue.h
new file mode 100644
index 0000000..5b2c8ce
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-mips-glue.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+void __init chacha20_fpu_init(void)
+{
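+	/* Nothing to set up: the MIPS implementation uses only integer registers. */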
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+ chacha20_mips(dst, src, len, key, counter);
+ return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/src/crypto/zinc/chacha20/chacha20-mips.S b/src/crypto/zinc/chacha20/chacha20-mips.S
new file mode 100644
index 0000000..77da2c2
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-mips.S
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define MASK_BYTES 0x03
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 4*16
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $v1
+#define X9 $fp
+#define X10 $s7
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define OUT $a0
+#define IN $a1
+#define BYTES $a2
+/* KEY and NONCE arguments must be u32-aligned */
+#define KEY $a3
+/* NONCE pointer is given via stack */
+#define NONCE $t9
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * It must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used for handling the last bytes, which are not a multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $ra
+
+#define PTR_LAST_ROUND $t8
+
+/* ChaCha20 constants and stack location */
+#define CONSTANT_OFS_SP 48
+#define UNALIGNED_OFS_SP 40
+
+#define CONSTANT_1 0x61707865
+#define CONSTANT_2 0x3320646e
+#define CONSTANT_3 0x79622d32
+#define CONSTANT_4 0x6b206574
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+ wsbh n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define STORE_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \
+ .if ((s != NONCE) || (o != 0)); \
+ lw T0, o(s); \
+ .endif; \
+ lwl T1, x-4+MSB ## (IN); \
+ lwr T1, x-4+LSB ## (IN); \
+ .if ((s == NONCE) && (o == 0)); \
+ addu X ## a, NONCE_0; \
+ .else; \
+ addu X ## a, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## a); \
+ xor X ## a, T1; \
+ swl X ## a, x-4+MSB ## (OUT); \
+ swr X ## a, x-4+LSB ## (OUT);
+
+#define STORE_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \
+ .if ((s != NONCE) || (o != 0)); \
+ lw T0, o(s); \
+ .endif; \
+ lw T1, x-4 ## (IN); \
+ .if ((s == NONCE) && (o == 0)); \
+ addu X ## a, NONCE_0; \
+ .else; \
+ addu X ## a, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## a); \
+ xor X ## a, T1; \
+ sw X ## a, x-4 ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and for handling the last bytes, which are not a multiple of 4.
+ * X15 is free to store Xn.
+ * Every jump table entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_aligned_ ## a: ; \
+ .if ((s == NONCE) && (o == 0)); \
+ move SAVED_CA, NONCE_0; \
+ .else; \
+ lw SAVED_CA, o(s);\
+ .endif; \
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ move SAVED_X, X ## a;
+
+#define JMPTBL_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \
+ .if ((s == NONCE) && (o == 0)); \
+ move SAVED_CA, NONCE_0; \
+ .else; \
+ lw SAVED_CA, o(s);\
+ .endif; \
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ move SAVED_X, X ## a;
+
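+/* AXR is one add/xor/rotate step of four parallel ChaCha quarter-rounds:
+ * X[A..D] += X[K..N]; X[V..Z] ^= X[A..D]; X[V..Z] = rotl(X[V..Z], S)
+ */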
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotl X(V), S; \
+ rotl X(W), S; \
+ rotl X(Y), S; \
+ rotl X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent chacha20_mips
+chacha20_mips:
+ .frame $sp, STACK_SIZE, $ra
+	/* The NONCE pointer is the fifth argument, passed on the stack */
+ lw NONCE, 16($sp)
+
+	/* Return early if BYTES == 0. */
+ .set noreorder
+ beqz BYTES, .Lchacha20_mips_end
+ addiu $sp, -STACK_SIZE
+ .set reorder
+
+ /* Calculate PTR_LAST_ROUND */
+ addiu PTR_LAST_ROUND, BYTES, -1
+ ins PTR_LAST_ROUND, $zero, 0, 6
+ addu PTR_LAST_ROUND, OUT
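+	/* PTR_LAST_ROUND = OUT + ((BYTES - 1) & ~63), the start of the final block */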
+
+ /* Save s0-s7, fp, ra. */
+ sw $ra, 0($sp)
+ sw $fp, 4($sp)
+ sw $s0, 8($sp)
+ sw $s1, 12($sp)
+ sw $s2, 16($sp)
+ sw $s3, 20($sp)
+ sw $s4, 24($sp)
+ sw $s5, 28($sp)
+ sw $s6, 32($sp)
+ sw $s7, 36($sp)
+
+ lw NONCE_0, 0(NONCE)
+	/* Test whether IN or OUT is unaligned.
+ * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003
+ */
+ or T1, IN, OUT
+ andi T1, 0x3
+
+ /* Load constant */
+ lui X0, %hi(CONSTANT_1)
+ lui X1, %hi(CONSTANT_2)
+ lui X2, %hi(CONSTANT_3)
+ lui X3, %hi(CONSTANT_4)
+ ori X0, %lo(CONSTANT_1)
+ ori X1, %lo(CONSTANT_2)
+ ori X2, %lo(CONSTANT_3)
+ ori X3, %lo(CONSTANT_4)
+
+ /* Store constant on stack. */
+ sw X0, 0+CONSTANT_OFS_SP($sp)
+ sw X1, 4+CONSTANT_OFS_SP($sp)
+ sw X2, 8+CONSTANT_OFS_SP($sp)
+ sw X3, 12+CONSTANT_OFS_SP($sp)
+
+ sw T1, UNALIGNED_OFS_SP($sp)
+
+ .set noreorder
+ b .Lchacha20_rounds_start
+ andi BYTES, (CHACHA20_BLOCK_SIZE-1)
+ .set reorder
+
+.align 4
+.Loop_chacha20_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+ lw X0, 0+CONSTANT_OFS_SP($sp)
+ lw X1, 4+CONSTANT_OFS_SP($sp)
+ lw X2, 8+CONSTANT_OFS_SP($sp)
+ lw X3, 12+CONSTANT_OFS_SP($sp)
+ lw T1, UNALIGNED_OFS_SP($sp)
+
+.Lchacha20_rounds_start:
+ lw X4, 0(KEY)
+ lw X5, 4(KEY)
+ lw X6, 8(KEY)
+ lw X7, 12(KEY)
+ lw X8, 16(KEY)
+ lw X9, 20(KEY)
+ lw X10, 24(KEY)
+ lw X11, 28(KEY)
+
+ move X12, NONCE_0
+ lw X13, 4(NONCE)
+ lw X14, 8(NONCE)
+ lw X15, 12(NONCE)
+
+ li $at, 9
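+	/* The loop body below is one double round; it runs 10 times for 20 rounds */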
+.Loop_chacha20_xor_rounds:
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ .set noreorder
+ bnez $at, .Loop_chacha20_xor_rounds
+ addiu $at, -1
+
+ /* Unaligned? Jump */
+ bnez T1, .Loop_chacha20_unaligned
+ andi $at, BYTES, MASK_U32
+
+ /* Last round? No jump */
+ bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Full block? Jump */
+ beqz BYTES, .Lchacha20_mips_xor_aligned_64_b
+ /* Calculate lower half jump table addr and offset */
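+	/* Each jump table entry is 3 instructions (12 bytes) and $at holds
+	 * 4 * (whole words left), so the entry offset is ($at << 2) - $at.
+	 */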
+ ins T0, $at, 2, 6
+
+ subu T0, $at
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+ jr T0
+ /* Delay slot */
+ nop
+
+ .set reorder
+
+.Loop_chacha20_unaligned:
+ .set noreorder
+
+	/* Last round? No jump */
+ bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Full block? Jump */
+ beqz BYTES, .Lchacha20_mips_xor_unaligned_64_b
+
+ /* Calculate lower half jump table addr and offset */
+ ins T0, $at, 2, 6
+ subu T0, $at
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ jr T0
+ /* Delay slot */
+ nop
+
+ .set reorder
+
+/* Aligned code path
+ */
+.align 4
+ STORE_ALIGNED(64, 15, NONCE,12)
+ STORE_ALIGNED(60, 14, NONCE, 8)
+ STORE_ALIGNED(56, 13, NONCE, 4)
+ STORE_ALIGNED(52, 12, NONCE, 0)
+ STORE_ALIGNED(48, 11, KEY, 28)
+ STORE_ALIGNED(44, 10, KEY, 24)
+ STORE_ALIGNED(40, 9, KEY, 20)
+ STORE_ALIGNED(36, 8, KEY, 16)
+ STORE_ALIGNED(32, 7, KEY, 12)
+ STORE_ALIGNED(28, 6, KEY, 8)
+ STORE_ALIGNED(24, 5, KEY, 4)
+ STORE_ALIGNED(20, 4, KEY, 0)
+ STORE_ALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP)
+ STORE_ALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP)
+ STORE_ALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_aligned_4_b:
+ /* STORE_ALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */
+ lw T0, 0+CONSTANT_OFS_SP($sp)
+ lw T1, 0(IN)
+ addu X0, T0
+ CPU_TO_LE32(X0)
+ xor X0, T1
+ .set noreorder
+ bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ sw X0, 0(OUT)
+ .set reorder
+
+ .set noreorder
+ bne $at, BYTES, .Lchacha20_mips_xor_bytes
+	/* Delay slot: increase NONCE_0, return NONCE_0 value */
+ addiu NONCE_0, 1
+	.set reorder
+
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ lw $ra, 0($sp)
+ lw $fp, 4($sp)
+ lw $s0, 8($sp)
+ lw $s1, 12($sp)
+ lw $s2, 16($sp)
+ lw $s3, 20($sp)
+ lw $s4, 24($sp)
+ lw $s5, 28($sp)
+ lw $s6, 32($sp)
+ lw $s7, 36($sp)
+.Lchacha20_mips_end:
+ .set noreorder
+ jr $ra
+ addiu $sp, STACK_SIZE
+ .set reorder
+
+ .set noreorder
+ /* Start jump table */
+ JMPTBL_ALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED(16, 4, KEY, 0)
+ JMPTBL_ALIGNED(20, 5, KEY, 4)
+ JMPTBL_ALIGNED(24, 6, KEY, 8)
+ JMPTBL_ALIGNED(28, 7, KEY, 12)
+ JMPTBL_ALIGNED(32, 8, KEY, 16)
+ JMPTBL_ALIGNED(36, 9, KEY, 20)
+ JMPTBL_ALIGNED(40, 10, KEY, 24)
+ JMPTBL_ALIGNED(44, 11, KEY, 28)
+ JMPTBL_ALIGNED(48, 12, NONCE, 0)
+ JMPTBL_ALIGNED(52, 13, NONCE, 4)
+ JMPTBL_ALIGNED(56, 14, NONCE, 8)
+ JMPTBL_ALIGNED(60, 15, NONCE,12)
+ /* End jump table */
+ .set reorder
+
+/* Unaligned code path
+ */
+ STORE_UNALIGNED(64, 15, NONCE,12)
+ STORE_UNALIGNED(60, 14, NONCE, 8)
+ STORE_UNALIGNED(56, 13, NONCE, 4)
+ STORE_UNALIGNED(52, 12, NONCE, 0)
+ STORE_UNALIGNED(48, 11, KEY, 28)
+ STORE_UNALIGNED(44, 10, KEY, 24)
+ STORE_UNALIGNED(40, 9, KEY, 20)
+ STORE_UNALIGNED(36, 8, KEY, 16)
+ STORE_UNALIGNED(32, 7, KEY, 12)
+ STORE_UNALIGNED(28, 6, KEY, 8)
+ STORE_UNALIGNED(24, 5, KEY, 4)
+ STORE_UNALIGNED(20, 4, KEY, 0)
+ STORE_UNALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP)
+ STORE_UNALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP)
+ STORE_UNALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_unaligned_4_b:
+ /* STORE_UNALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */
+ lw T0, 0+CONSTANT_OFS_SP($sp)
+ lwl T1, 0+MSB(IN)
+ lwr T1, 0+LSB(IN)
+ addu X0, T0
+ CPU_TO_LE32(X0)
+ xor X0, T1
+ swl X0, 0+MSB(OUT)
+ .set noreorder
+ bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ swr X0, 0+LSB(OUT)
+ .set reorder
+
+ /* Fall through to byte handling */
+ .set noreorder
+ beq $at, BYTES, .Lchacha20_mips_xor_done
+	/* Delay slot: increase NONCE_0, return NONCE_0 value */
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+ addiu NONCE_0, 1
+ .set reorder
+
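+/* XOR the final one to three bytes using the saved partial state word. */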
+.Lchacha20_mips_xor_bytes:
+ addu OUT, $at
+ addu IN, $at
+ addu SAVED_X, SAVED_CA
+ /* First byte */
+ lbu T1, 0(IN)
+ andi $at, BYTES, 2
+ CPU_TO_LE32(SAVED_X)
+ ROTR(SAVED_X)
+ xor T1, SAVED_X
+ .set noreorder
+ beqz $at, .Lchacha20_mips_xor_done
+ sb T1, 0(OUT)
+ .set reorder
+ /* Second byte */
+ lbu T1, 1(IN)
+ andi $at, BYTES, 1
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ .set noreorder
+ beqz $at, .Lchacha20_mips_xor_done
+ sb T1, 1(OUT)
+ .set reorder
+ /* Third byte */
+ lbu T1, 2(IN)
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ .set noreorder
+ b .Lchacha20_mips_xor_done
+ sb T1, 2(OUT)
+ .set reorder
+.set noreorder
+
+.Lchacha20_mips_jmptbl_unaligned:
+ /* Start jump table */
+ JMPTBL_UNALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED(16, 4, KEY, 0)
+ JMPTBL_UNALIGNED(20, 5, KEY, 4)
+ JMPTBL_UNALIGNED(24, 6, KEY, 8)
+ JMPTBL_UNALIGNED(28, 7, KEY, 12)
+ JMPTBL_UNALIGNED(32, 8, KEY, 16)
+ JMPTBL_UNALIGNED(36, 9, KEY, 20)
+ JMPTBL_UNALIGNED(40, 10, KEY, 24)
+ JMPTBL_UNALIGNED(44, 11, KEY, 28)
+ JMPTBL_UNALIGNED(48, 12, NONCE, 0)
+ JMPTBL_UNALIGNED(52, 13, NONCE, 4)
+ JMPTBL_UNALIGNED(56, 14, NONCE, 8)
+ JMPTBL_UNALIGNED(60, 15, NONCE,12)
+ /* End jump table */
+.set reorder
+
+.end chacha20_mips
+.set at
diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64-glue.h b/src/crypto/zinc/chacha20/chacha20-x86_64-glue.h
new file mode 100644
index 0000000..616813d
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-x86_64-glue.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce,
+ const u8 *key);
+asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_ssse3 __ro_after_init;
+static bool chacha20_use_avx2 __ro_after_init;
+static bool chacha20_use_avx512 __ro_after_init;
+static bool chacha20_use_avx512vl __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+ chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+ chacha20_use_avx2 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+ chacha20_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL) &&
+		/* Skylake downclocks unacceptably when using zmm registers. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+ chacha20_use_avx512vl =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL);
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+ if (simd_context != HAVE_FULL_SIMD)
+ return false;
+
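+	/* Dispatch to the widest SIMD implementation detected in
+	 * chacha20_fpu_init(); returning false lets the caller fall back to
+	 * the generic ChaCha20 code.
+	 */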
+#ifdef CONFIG_AS_AVX512
+ if (chacha20_use_avx512) {
+ chacha20_avx512(dst, src, len, key, counter);
+ return true;
+ }
+ if (chacha20_use_avx512vl) {
+ chacha20_avx512vl(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (chacha20_use_avx2) {
+ chacha20_avx2(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_SSSE3
+ if (chacha20_use_ssse3) {
+ chacha20_ssse3(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+#if defined(CONFIG_AS_SSSE3)
+ if (simd_context == HAVE_FULL_SIMD && chacha20_use_ssse3) {
+ hchacha20_ssse3(derived_key, nonce, key);
+ return true;
+ }
+#endif
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.S b/src/crypto/zinc/chacha20/chacha20-x86_64.S
new file mode 100644
index 0000000..2451feb
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20-x86_64.S
@@ -0,0 +1,2632 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from OpenSSL.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.Lzero, "aM", @progbits, 16
+.align 16
+.Lzero:
+.long 0,0,0,0
+.section .rodata.cst16.Lone, "aM", @progbits, 16
+.align 16
+.Lone:
+.long 1,0,0,0
+.section .rodata.cst16.Linc, "aM", @progbits, 16
+.align 16
+.Linc:
+.long 0,1,2,3
+.section .rodata.cst16.Lfour, "aM", @progbits, 16
+.align 16
+.Lfour:
+.long 4,4,4,4
+.section .rodata.cst32.Lincy, "aM", @progbits, 32
+.align 32
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.section .rodata.cst32.Leight, "aM", @progbits, 32
+.align 32
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.section .rodata.cst16.Lrot16, "aM", @progbits, 16
+.align 16
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.section .rodata.cst16.Lrot24, "aM", @progbits, 16
+.align 16
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.section .rodata.cst16.Lsigma, "aM", @progbits, 16
+.align 16
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.section .rodata.cst64.Lfourz, "aM", @progbits, 64
+.align 64
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.section .rodata.cst64.Lincz, "aM", @progbits, 64
+.align 64
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
+.align 64
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
+.align 64
+.Ltwoy:
+.long 2,0,0,0, 2,0,0,0
+
+.text
+
+#ifdef CONFIG_AS_SSSE3
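+/* HChaCha20: run the 20 ChaCha rounds on (sigma, key, nonce) and emit only
+ * the first and fourth rows of the final state, without the feed-forward
+ * addition of the input state.
+ */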
+.align 32
+ENTRY(hchacha20_ssse3)
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rdx),%xmm1
+ movdqu 16(%rdx),%xmm2
+ movdqu (%rsi),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+ movq $10,%r8
+ .align 32
+.Loop_hssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_hssse3
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ ret
+ENDPROC(hchacha20_ssse3)
+
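+/* One-block SSSE3 path: inputs larger than 128 bytes are diverted to the
+ * four-way path below; otherwise a single 64-byte block is generated per
+ * outer-loop iteration.
+ */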
+.align 32
+ENTRY(chacha20_ssse3)
+.Lchacha20_ssse3:
+ cmpq $0,%rdx
+ je .Lssse3_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_4x
+
+.Ldo_sse3_after_all:
+ subq $64+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq -8(%r10),%rsp
+
+.Lssse3_epilogue:
+ ret
+
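+/* Four-way SSSE3 path: each of the 16 state words is broadcast across a full
+ * xmm register (one 32-bit lane per block), producing 256 bytes of keystream
+ * per outer-loop iteration.
+ */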
+.align 32
+.Lchacha20_4x:
+ leaq 8(%rsp),%r10
+
+.Lproceed4x:
+ subq $0x140+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r9),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm7,%xmm0
+ pshufb %xmm7,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm7,%xmm2
+ pshufb %xmm7,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm6,%xmm2
+ pshufb %xmm6,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm7,%xmm3
+ pshufb %xmm7,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm6,%xmm3
+ pshufb %xmm6,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm7,%xmm1
+ pshufb %xmm7,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r9,%r9
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq -8(%r10),%rsp
+
+.L4x_epilogue:
+ ret
+ENDPROC(chacha20_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX2
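+/* Eight-way AVX2 path: state words are broadcast across the ymm registers
+ * (one 32-bit lane per block), producing 512 bytes of keystream per
+ * outer-loop iteration.
+ */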
+.align 32
+ENTRY(chacha20_avx2)
+.Lchacha20_avx2:
+ cmpq $0,%rdx
+ je .L8x_epilogue
+ leaq 8(%rsp),%r10
+
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r9),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r9,%r9
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L8x_epilogue:
+ ret
+ENDPROC(chacha20_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
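+/* AVX-512 entry point: lengths of up to 512 bytes are handled here with the
+ * whole state replicated four times across zmm0-zmm3 (four blocks per
+ * outer-loop iteration); longer inputs jump to the sixteen-way path below.
+ */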
+.align 32
+ENTRY(chacha20_avx512)
+.Lchacha20_avx512:
+ cmpq $0,%rdx
+ je .Lavx512_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $512,%rdx
+ ja .Lchacha20_16x
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti32x4 .Lsigma(%rip),%zmm0
+ vbroadcasti32x4 (%rcx),%zmm1
+ vbroadcasti32x4 16(%rcx),%zmm2
+ vbroadcasti32x4 (%r8),%zmm3
+
+ vmovdqa32 %zmm0,%zmm16
+ vmovdqa32 %zmm1,%zmm17
+ vmovdqa32 %zmm2,%zmm18
+ vpaddd .Lzeroz(%rip),%zmm3,%zmm3
+ vmovdqa32 .Lfourz(%rip),%zmm20
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+ vmovdqa32 %zmm16,%zmm0
+ vmovdqa32 %zmm17,%zmm1
+ vmovdqa32 %zmm18,%zmm2
+ vpaddd %zmm20,%zmm19,%zmm3
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $57,%zmm1,%zmm1
+ vpshufd $147,%zmm3,%zmm3
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $147,%zmm1,%zmm1
+ vpshufd $57,%zmm3,%zmm3
+ decq %r8
+ jnz .Loop_avx512
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $1,%zmm0,%xmm4
+ vextracti32x4 $1,%zmm1,%xmm5
+ vextracti32x4 $1,%zmm2,%xmm6
+ vextracti32x4 $1,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $2,%zmm0,%xmm4
+ vextracti32x4 $2,%zmm1,%xmm5
+ vextracti32x4 $2,%zmm2,%xmm6
+ vextracti32x4 $2,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $3,%zmm0,%xmm4
+ vextracti32x4 $3,%zmm1,%xmm5
+ vextracti32x4 $3,%zmm2,%xmm6
+ vextracti32x4 $3,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jnz .Loop_outer_avx512
+
+ jmp .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512
+
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone_avx512:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.Lavx512_epilogue:
+ ret
+
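+/* Sixteen-way AVX-512 path: each of the 16 state words is broadcast across a
+ * full zmm register, producing 1024 bytes of keystream per outer-loop
+ * iteration.
+ */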
+.align 32
+.Lchacha20_16x:
+ leaq 8(%rsp),%r10
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti32x4 (%r9),%zmm3
+ vbroadcasti32x4 (%rcx),%zmm7
+ vbroadcasti32x4 16(%rcx),%zmm11
+ vbroadcasti32x4 (%r8),%zmm15
+
+ vpshufd $0x00,%zmm3,%zmm0
+ vpshufd $0x55,%zmm3,%zmm1
+ vpshufd $0xaa,%zmm3,%zmm2
+ vpshufd $0xff,%zmm3,%zmm3
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ vpshufd $0x00,%zmm7,%zmm4
+ vpshufd $0x55,%zmm7,%zmm5
+ vpshufd $0xaa,%zmm7,%zmm6
+ vpshufd $0xff,%zmm7,%zmm7
+ vmovdqa64 %zmm4,%zmm20
+ vmovdqa64 %zmm5,%zmm21
+ vmovdqa64 %zmm6,%zmm22
+ vmovdqa64 %zmm7,%zmm23
+
+ vpshufd $0x00,%zmm11,%zmm8
+ vpshufd $0x55,%zmm11,%zmm9
+ vpshufd $0xaa,%zmm11,%zmm10
+ vpshufd $0xff,%zmm11,%zmm11
+ vmovdqa64 %zmm8,%zmm24
+ vmovdqa64 %zmm9,%zmm25
+ vmovdqa64 %zmm10,%zmm26
+ vmovdqa64 %zmm11,%zmm27
+
+ vpshufd $0x00,%zmm15,%zmm12
+ vpshufd $0x55,%zmm15,%zmm13
+ vpshufd $0xaa,%zmm15,%zmm14
+ vpshufd $0xff,%zmm15,%zmm15
+ vpaddd .Lincz(%rip),%zmm12,%zmm12
+ vmovdqa64 %zmm12,%zmm28
+ vmovdqa64 %zmm13,%zmm29
+ vmovdqa64 %zmm14,%zmm30
+ vmovdqa64 %zmm15,%zmm31
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop_outer16x:
+ vpbroadcastd 0(%r9),%zmm0
+ vpbroadcastd 4(%r9),%zmm1
+ vpbroadcastd 8(%r9),%zmm2
+ vpbroadcastd 12(%r9),%zmm3
+ vpaddd .Lsixteen(%rip),%zmm28,%zmm28
+ vmovdqa64 %zmm20,%zmm4
+ vmovdqa64 %zmm21,%zmm5
+ vmovdqa64 %zmm22,%zmm6
+ vmovdqa64 %zmm23,%zmm7
+ vmovdqa64 %zmm24,%zmm8
+ vmovdqa64 %zmm25,%zmm9
+ vmovdqa64 %zmm26,%zmm10
+ vmovdqa64 %zmm27,%zmm11
+ vmovdqa64 %zmm28,%zmm12
+ vmovdqa64 %zmm29,%zmm13
+ vmovdqa64 %zmm30,%zmm14
+ vmovdqa64 %zmm31,%zmm15
+
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop16x:
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ decl %eax
+ jnz .Loop16x
+
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ vpunpckldq %zmm1,%zmm0,%zmm18
+ vpunpckldq %zmm3,%zmm2,%zmm19
+ vpunpckhdq %zmm1,%zmm0,%zmm0
+ vpunpckhdq %zmm3,%zmm2,%zmm2
+ vpunpcklqdq %zmm19,%zmm18,%zmm1
+ vpunpckhqdq %zmm19,%zmm18,%zmm18
+ vpunpcklqdq %zmm2,%zmm0,%zmm3
+ vpunpckhqdq %zmm2,%zmm0,%zmm0
+ vpaddd %zmm20,%zmm4,%zmm4
+ vpaddd %zmm21,%zmm5,%zmm5
+ vpaddd %zmm22,%zmm6,%zmm6
+ vpaddd %zmm23,%zmm7,%zmm7
+
+ vpunpckldq %zmm5,%zmm4,%zmm2
+ vpunpckldq %zmm7,%zmm6,%zmm19
+ vpunpckhdq %zmm5,%zmm4,%zmm4
+ vpunpckhdq %zmm7,%zmm6,%zmm6
+ vpunpcklqdq %zmm19,%zmm2,%zmm5
+ vpunpckhqdq %zmm19,%zmm2,%zmm2
+ vpunpcklqdq %zmm6,%zmm4,%zmm7
+ vpunpckhqdq %zmm6,%zmm4,%zmm4
+ vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19
+ vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
+ vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
+ vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
+ vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
+ vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
+ vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
+ vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
+ vpaddd %zmm24,%zmm8,%zmm8
+ vpaddd %zmm25,%zmm9,%zmm9
+ vpaddd %zmm26,%zmm10,%zmm10
+ vpaddd %zmm27,%zmm11,%zmm11
+
+ vpunpckldq %zmm9,%zmm8,%zmm6
+ vpunpckldq %zmm11,%zmm10,%zmm0
+ vpunpckhdq %zmm9,%zmm8,%zmm8
+ vpunpckhdq %zmm11,%zmm10,%zmm10
+ vpunpcklqdq %zmm0,%zmm6,%zmm9
+ vpunpckhqdq %zmm0,%zmm6,%zmm6
+ vpunpcklqdq %zmm10,%zmm8,%zmm11
+ vpunpckhqdq %zmm10,%zmm8,%zmm8
+ vpaddd %zmm28,%zmm12,%zmm12
+ vpaddd %zmm29,%zmm13,%zmm13
+ vpaddd %zmm30,%zmm14,%zmm14
+ vpaddd %zmm31,%zmm15,%zmm15
+
+ vpunpckldq %zmm13,%zmm12,%zmm10
+ vpunpckldq %zmm15,%zmm14,%zmm0
+ vpunpckhdq %zmm13,%zmm12,%zmm12
+ vpunpckhdq %zmm15,%zmm14,%zmm14
+ vpunpcklqdq %zmm0,%zmm10,%zmm13
+ vpunpckhqdq %zmm0,%zmm10,%zmm10
+ vpunpcklqdq %zmm14,%zmm12,%zmm15
+ vpunpckhqdq %zmm14,%zmm12,%zmm12
+ vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0
+ vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
+ vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
+ vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
+ vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
+ vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
+ vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
+ vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
+ vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16
+ vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
+ vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
+ vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
+ vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
+ vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
+ vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
+ vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
+ vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
+ vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
+ vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
+ vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
+ vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
+ vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
+ vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
+ vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
+ cmpq $1024,%rdx
+ jb .Ltail16x
+
+ vpxord 0(%rsi),%zmm16,%zmm16
+ vpxord 64(%rsi),%zmm17,%zmm17
+ vpxord 128(%rsi),%zmm14,%zmm14
+ vpxord 192(%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm16,0(%rdi)
+ vmovdqu32 %zmm17,64(%rdi)
+ vmovdqu32 %zmm14,128(%rdi)
+ vmovdqu32 %zmm8,192(%rdi)
+
+ vpxord 256(%rsi),%zmm19,%zmm19
+ vpxord 320(%rsi),%zmm1,%zmm1
+ vpxord 384(%rsi),%zmm18,%zmm18
+ vpxord 448(%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm19,256(%rdi)
+ vmovdqu32 %zmm1,320(%rdi)
+ vmovdqu32 %zmm18,384(%rdi)
+ vmovdqu32 %zmm3,448(%rdi)
+
+ vpxord 512(%rsi),%zmm0,%zmm0
+ vpxord 576(%rsi),%zmm9,%zmm9
+ vpxord 640(%rsi),%zmm6,%zmm6
+ vpxord 704(%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm0,512(%rdi)
+ vmovdqu32 %zmm9,576(%rdi)
+ vmovdqu32 %zmm6,640(%rdi)
+ vmovdqu32 %zmm11,704(%rdi)
+
+ vpxord 768(%rsi),%zmm13,%zmm13
+ vpxord 832(%rsi),%zmm10,%zmm10
+ vpxord 896(%rsi),%zmm15,%zmm15
+ vpxord 960(%rsi),%zmm12,%zmm12
+ leaq 1024(%rsi),%rsi
+ vmovdqu32 %zmm13,768(%rdi)
+ vmovdqu32 %zmm10,832(%rdi)
+ vmovdqu32 %zmm15,896(%rdi)
+ vmovdqu32 %zmm12,960(%rdi)
+ leaq 1024(%rdi),%rdi
+
+ subq $1024,%rdx
+ jnz .Loop_outer16x
+
+ jmp .Ldone16x
+
+.align 32
+.Ltail16x:
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm16,%zmm16
+ vmovdqu32 %zmm16,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm17,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm17,%zmm17
+ vmovdqu32 %zmm17,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm14,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm14,%zmm14
+ vmovdqu32 %zmm14,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm8,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm8,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm19,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm19,%zmm19
+ vmovdqu32 %zmm19,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm1,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm1,%zmm1
+ vmovdqu32 %zmm1,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm18,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm18,%zmm18
+ vmovdqu32 %zmm18,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm3,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $512,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm3,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm0,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $576,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm0,%zmm0
+ vmovdqu32 %zmm0,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm9,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $640,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm9,%zmm9
+ vmovdqu32 %zmm9,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm6,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $704,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm6,%zmm6
+ vmovdqu32 %zmm6,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm11,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $768,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm11,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm13,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $832,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm13,%zmm13
+ vmovdqu32 %zmm13,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm10,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $896,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm10,%zmm10
+ vmovdqu32 %zmm10,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm15,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $960,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm15,%zmm15
+ vmovdqu32 %zmm15,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm12,%zmm16
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_16x:
+ vmovdqa32 %zmm16,0(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail16x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail16x
+
+ vpxord %zmm16,%zmm16,%zmm16
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone16x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L16x_epilogue:
+ ret
+ENDPROC(chacha20_avx512)
+
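+/* AVX-512VL entry point: lengths of up to 128 bytes are handled here with the
+ * state replicated twice across ymm0-ymm3 (two blocks per outer-loop
+ * iteration); longer inputs jump to the eight-way path below.
+ */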
+.align 32
+ENTRY(chacha20_avx512vl)
+ cmpq $0,%rdx
+ je .Lavx512vl_epilogue
+
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_8xvl
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti128 .Lsigma(%rip),%ymm0
+ vbroadcasti128 (%rcx),%ymm1
+ vbroadcasti128 16(%rcx),%ymm2
+ vbroadcasti128 (%r8),%ymm3
+
+ vmovdqa32 %ymm0,%ymm16
+ vmovdqa32 %ymm1,%ymm17
+ vmovdqa32 %ymm2,%ymm18
+ vpaddd .Lzeroz(%rip),%ymm3,%ymm3
+ vmovdqa32 .Ltwoy(%rip),%ymm20
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 16
+.Loop_outer_avx512vl:
+ vmovdqa32 %ymm18,%ymm2
+ vpaddd %ymm20,%ymm19,%ymm3
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $57,%ymm1,%ymm1
+ vpshufd $147,%ymm3,%ymm3
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $147,%ymm1,%ymm1
+ vpshufd $57,%ymm3,%ymm3
+ decq %r8
+ jnz .Loop_avx512vl
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512vl
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512vl
+
+ vextracti128 $1,%ymm0,%xmm4
+ vextracti128 $1,%ymm1,%xmm5
+ vextracti128 $1,%ymm2,%xmm6
+ vextracti128 $1,%ymm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512vl
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ vmovdqa32 %ymm16,%ymm0
+ vmovdqa32 %ymm17,%ymm1
+ jnz .Loop_outer_avx512vl
+
+ jmp .Ldone_avx512vl
+
+.align 16
+.Ltail64_avx512vl:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512vl
+
+.align 16
+.Ltail_avx512vl:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512vl:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512vl
+
+ vmovdqa32 %ymm16,0(%rsp)
+ vmovdqa32 %ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.Lavx512vl_epilogue:
+ ret
+
+.align 32
+.Lchacha20_8xvl:
+ leaq 8(%rsp),%r10
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti128 (%r9),%ymm3
+ vbroadcasti128 (%rcx),%ymm7
+ vbroadcasti128 16(%rcx),%ymm11
+ vbroadcasti128 (%r8),%ymm15
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vpshufd $0xaa,%ymm3,%ymm2
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpshufd $0xaa,%ymm7,%ymm6
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vpshufd $0xaa,%ymm11,%ymm10
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vpshufd $0xaa,%ymm15,%ymm14
+ vpshufd $0xff,%ymm15,%ymm15
+ vpaddd .Lincy(%rip),%ymm12,%ymm12
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop_outer8xvl:
+
+
+ vpbroadcastd 8(%r9),%ymm2
+ vpbroadcastd 12(%r9),%ymm3
+ vpaddd .Leight(%rip),%ymm28,%ymm28
+ vmovdqa64 %ymm20,%ymm4
+ vmovdqa64 %ymm21,%ymm5
+ vmovdqa64 %ymm22,%ymm6
+ vmovdqa64 %ymm23,%ymm7
+ vmovdqa64 %ymm24,%ymm8
+ vmovdqa64 %ymm25,%ymm9
+ vmovdqa64 %ymm26,%ymm10
+ vmovdqa64 %ymm27,%ymm11
+ vmovdqa64 %ymm28,%ymm12
+ vmovdqa64 %ymm29,%ymm13
+ vmovdqa64 %ymm30,%ymm14
+ vmovdqa64 %ymm31,%ymm15
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop8xvl:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ decl %eax
+ jnz .Loop8xvl
+
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm18
+ vpunpckldq %ymm3,%ymm2,%ymm19
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm19,%ymm18,%ymm1
+ vpunpckhqdq %ymm19,%ymm18,%ymm18
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm2
+ vpunpckldq %ymm7,%ymm6,%ymm19
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm19,%ymm2,%ymm5
+ vpunpckhqdq %ymm19,%ymm2,%ymm2
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vshufi32x4 $0,%ymm5,%ymm1,%ymm19
+ vshufi32x4 $3,%ymm5,%ymm1,%ymm5
+ vshufi32x4 $0,%ymm2,%ymm18,%ymm1
+ vshufi32x4 $3,%ymm2,%ymm18,%ymm2
+ vshufi32x4 $0,%ymm7,%ymm3,%ymm18
+ vshufi32x4 $3,%ymm7,%ymm3,%ymm7
+ vshufi32x4 $0,%ymm4,%ymm0,%ymm3
+ vshufi32x4 $3,%ymm4,%ymm0,%ymm4
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm6
+ vpunpckldq %ymm11,%ymm10,%ymm0
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm0,%ymm6,%ymm9
+ vpunpckhqdq %ymm0,%ymm6,%ymm6
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ vpunpckldq %ymm13,%ymm12,%ymm10
+ vpunpckldq %ymm15,%ymm14,%ymm0
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm0,%ymm10,%ymm13
+ vpunpckhqdq %ymm0,%ymm10,%ymm10
+ vpunpcklqdq %ymm14,%ymm12,%ymm15
+ vpunpckhqdq %ymm14,%ymm12,%ymm12
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ cmpq $512,%rdx
+ jb .Ltail8xvl
+
+ movl $0x80,%eax
+ vpxord 0(%rsi),%ymm19,%ymm19
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vpxor 64(%rsi),%ymm5,%ymm5
+ vpxor 96(%rsi),%ymm13,%ymm13
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm19,0(%rdi)
+ vmovdqu %ymm0,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm13,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm2,%ymm2
+ vpxor 96(%rsi),%ymm10,%ymm10
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm1,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm10,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vpxor 64(%rsi),%ymm7,%ymm7
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm18,0(%rdi)
+ vmovdqu %ymm6,32(%rdi)
+ vmovdqu %ymm7,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vpxor 64(%rsi),%ymm4,%ymm4
+ vpxor 96(%rsi),%ymm12,%ymm12
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vmovdqu %ymm12,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpbroadcastd 0(%r9),%ymm0
+ vpbroadcastd 4(%r9),%ymm1
+
+ subq $512,%rdx
+ jnz .Loop_outer8xvl
+
+ jmp .Ldone8xvl
+
+.align 32
+.Ltail8xvl:
+ vmovdqa64 %ymm19,%ymm8
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm8,%ymm8
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vmovdqu %ymm8,0(%rdi,%rsi,1)
+ vmovdqu %ymm0,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm5,%ymm8
+ vmovdqa %ymm13,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm5,%ymm5
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm5,0(%rdi,%rsi,1)
+ vmovdqu %ymm13,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm1,%ymm8
+ vmovdqa %ymm9,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm1,0(%rdi,%rsi,1)
+ vmovdqu %ymm9,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm2,%ymm8
+ vmovdqa %ymm10,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm2,%ymm2
+ vpxor 32(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm2,0(%rdi,%rsi,1)
+ vmovdqu %ymm10,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa32 %ymm18,%ymm8
+ vmovdqa %ymm6,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_8xvl
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vmovdqu32 %ymm18,0(%rdi,%rsi,1)
+ vmovdqu %ymm6,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm7,%ymm8
+ vmovdqa %ymm15,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm7,%ymm7
+ vpxor 32(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm7,0(%rdi,%rsi,1)
+ vmovdqu %ymm15,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm3,%ymm8
+ vmovdqa %ymm11,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi,%rsi,1)
+ vmovdqu %ymm11,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm4,%ymm8
+ vmovdqa %ymm12,%ymm0
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm0,32(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail8xvl:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8xvl
+
+ vpxor %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+
+.Ldone8xvl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.L8xvl_epilogue:
+ ret
+ENDPROC(chacha20_avx512vl)
+
+#endif /* CONFIG_AS_AVX512 */
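The partial-block tails above (.Loop_tail16x, .Loop_tail_avx512vl, .Loop_tail8xvl) all follow the same pattern: spill the last keystream block to the 64-byte-aligned stack area, XOR the remaining len % 64 bytes one at a time, then overwrite the spill slot so no keystream is left behind. A minimal C sketch of that pattern follows; chacha20_xor_tail is an illustrative name only, not a symbol in this file.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Byte-wise tail handling, as in the .Loop_tail* paths above: XOR the final
 * (len % 64) bytes of input against a keystream block spilled to an aligned
 * stack slot, then wipe the slot.
 */
static void chacha20_xor_tail(uint8_t *dst, const uint8_t *src, size_t len,
                              uint8_t block[64])
{
        size_t i;

        for (i = 0; i < len; ++i)
                dst[i] = src[i] ^ block[i];
        memset(block, 0, 64);
}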
diff --git a/src/crypto/zinc/chacha20/chacha20.c b/src/crypto/zinc/chacha20/chacha20.c
new file mode 100644
index 0000000..ab5ef07
--- /dev/null
+++ b/src/crypto/zinc/chacha20/chacha20.c
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Implementation of the ChaCha20 stream cipher.
+ *
+ * Information: https://cr.yp.to/chacha.html
+ */
+
+#include <zinc/chacha20.h>
+
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+#ifndef HAVE_CHACHA20_ARCH_IMPLEMENTATION
+void __init chacha20_fpu_init(void)
+{
+}
+static inline bool chacha20_arch(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+ return false;
+}
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+ return false;
+}
+#endif
+
+#define EXPAND_32_BYTE_K 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U
+
+#define QUARTER_ROUND(x, a, b, c, d) ( \
+ x[a] += x[b], \
+ x[d] = rol32((x[d] ^ x[a]), 16), \
+ x[c] += x[d], \
+ x[b] = rol32((x[b] ^ x[c]), 12), \
+ x[a] += x[b], \
+ x[d] = rol32((x[d] ^ x[a]), 8), \
+ x[c] += x[d], \
+ x[b] = rol32((x[b] ^ x[c]), 7) \
+)
+
+#define C(i, j) (i * 4 + j)
+
+#define DOUBLE_ROUND(x) ( \
+ /* Column Round */ \
+ QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
+ QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
+ QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
+ QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
+ /* Diagonal Round */ \
+ QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
+ QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
+ QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
+ QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
+)
+
+#define TWENTY_ROUNDS(x) ( \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x) \
+)
+
+static void chacha20_block_generic(__le32 *stream, u32 *state)
+{
+ u32 x[CHACHA20_BLOCK_SIZE / sizeof(u32)];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(x); ++i)
+ x[i] = state[i];
+
+ TWENTY_ROUNDS(x);
+
+ for (i = 0; i < ARRAY_SIZE(x); ++i)
+ stream[i] = cpu_to_le32(x[i] + state[i]);
+
+ ++state[12];
+}
+
+static void chacha20_generic(u8 *out, const u8 *in, u32 len, const u32 key[8],
+ const u32 counter[4])
+{
+ __le32 buf[CHACHA20_BLOCK_SIZE / sizeof(__le32)];
+ u32 x[] = {
+ EXPAND_32_BYTE_K,
+ key[0], key[1], key[2], key[3],
+ key[4], key[5], key[6], key[7],
+ counter[0], counter[1], counter[2], counter[3]
+ };
+
+ if (out != in)
+ memmove(out, in, len);
+
+ while (len >= CHACHA20_BLOCK_SIZE) {
+ chacha20_block_generic(buf, x);
+ crypto_xor(out, (u8 *)buf, CHACHA20_BLOCK_SIZE);
+ len -= CHACHA20_BLOCK_SIZE;
+ out += CHACHA20_BLOCK_SIZE;
+ }
+ if (len) {
+ chacha20_block_generic(buf, x);
+ crypto_xor(out, (u8 *)buf, len);
+ }
+}
+
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+ simd_context_t simd_context)
+{
+ if (!chacha20_arch(dst, src, len, state->key, state->counter,
+ simd_context))
+ chacha20_generic(dst, src, len, state->key, state->counter);
+ state->counter[0] += (len + 63) / 64;
+}
+EXPORT_SYMBOL(chacha20);
+
+static void hchacha20_generic(u8 derived_key[CHACHA20_KEY_SIZE],
+ const u8 nonce[HCHACHA20_NONCE_SIZE],
+ const u8 key[HCHACHA20_KEY_SIZE])
+{
+ __le32 *out = (__force __le32 *)derived_key;
+ u32 x[] = { EXPAND_32_BYTE_K,
+ get_unaligned_le32(key + 0),
+ get_unaligned_le32(key + 4),
+ get_unaligned_le32(key + 8),
+ get_unaligned_le32(key + 12),
+ get_unaligned_le32(key + 16),
+ get_unaligned_le32(key + 20),
+ get_unaligned_le32(key + 24),
+ get_unaligned_le32(key + 28),
+ get_unaligned_le32(nonce + 0),
+ get_unaligned_le32(nonce + 4),
+ get_unaligned_le32(nonce + 8),
+ get_unaligned_le32(nonce + 12)
+ };
+
+ TWENTY_ROUNDS(x);
+
+ out[0] = cpu_to_le32(x[0]);
+ out[1] = cpu_to_le32(x[1]);
+ out[2] = cpu_to_le32(x[2]);
+ out[3] = cpu_to_le32(x[3]);
+ out[4] = cpu_to_le32(x[12]);
+ out[5] = cpu_to_le32(x[13]);
+ out[6] = cpu_to_le32(x[14]);
+ out[7] = cpu_to_le32(x[15]);
+}
+
+/* Derived key should be 32-bit aligned */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+ const u8 nonce[HCHACHA20_NONCE_SIZE],
+ const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context)
+{
+ if (!hchacha20_arch(derived_key, nonce, key, simd_context))
+ hchacha20_generic(derived_key, nonce, key);
+}
+/* Deliberately not EXPORT_SYMBOL'd, since there are few reasons why somebody
+ * should be using this directly rather than via xchacha20. Revisit only in
+ * the unlikely event that somebody has a good reason to export this.
+ */
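The C(i, j) macro above addresses the 4x4 ChaCha state in row-major order, so the first four quarter-rounds of DOUBLE_ROUND mix columns and the last four mix diagonals. A small user-space sketch, reusing the same macros purely to show which state words each half of a double round touches; this is a demonstration only, not kernel code, and the printed output is not a test vector.

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t v, unsigned int n)
{
        return (v << n) | (v >> (32 - n));
}

/* Same quarter-round and indexing as chacha20.c above. */
#define QUARTER_ROUND(x, a, b, c, d) ( \
        x[a] += x[b], x[d] = rol32(x[d] ^ x[a], 16), \
        x[c] += x[d], x[b] = rol32(x[b] ^ x[c], 12), \
        x[a] += x[b], x[d] = rol32(x[d] ^ x[a], 8),  \
        x[c] += x[d], x[b] = rol32(x[b] ^ x[c], 7))

#define C(i, j) (i * 4 + j)

int main(void)
{
        uint32_t x[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
                           9, 10, 11, 12, 13, 14, 15, 16 };
        int i;

        /* Column round: words (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15). */
        for (i = 0; i < 4; ++i)
                QUARTER_ROUND(x, C(0, i), C(1, i), C(2, i), C(3, i));
        /* Diagonal round: words (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14). */
        for (i = 0; i < 4; ++i)
                QUARTER_ROUND(x, C(0, i), C(1, (i + 1) % 4),
                              C(2, (i + 2) % 4), C(3, (i + 3) % 4));

        for (i = 0; i < 16; ++i)
                printf("%08x%c", x[i], (i % 4 == 3) ? '\n' : ' ');
        return 0;
}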
diff --git a/src/crypto/zinc/chacha20poly1305.c b/src/crypto/zinc/chacha20poly1305.c
new file mode 100644
index 0000000..3991482
--- /dev/null
+++ b/src/crypto/zinc/chacha20poly1305.c
@@ -0,0 +1,333 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the ChaCha20Poly1305 AEAD construction.
+ *
+ * Information: https://tools.ietf.org/html/rfc8439
+ */
+
+#include <zinc/chacha20poly1305.h>
+#include <zinc/chacha20.h>
+#include <zinc/poly1305.h>
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <crypto/scatterwalk.h>
+
+static const u8 pad0[16] = { 0 };
+
+static struct crypto_alg chacha20_alg = {
+ .cra_blocksize = 1,
+ .cra_alignmask = sizeof(u32) - 1
+};
+static struct crypto_blkcipher chacha20_cipher = {
+ .base = {
+ .__crt_alg = &chacha20_alg
+ }
+};
+static struct blkcipher_desc chacha20_desc = {
+ .tfm = &chacha20_cipher
+};
+
+static inline void
+__chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len, const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEYLEN],
+ simd_context_t simd_context)
+{
+ struct poly1305_ctx poly1305_state;
+ struct chacha20_ctx chacha20_state;
+ union {
+ u8 block0[POLY1305_KEY_SIZE];
+ __le64 lens[2];
+ } b = { { 0 } };
+
+ chacha20_init(&chacha20_state, key, nonce);
+ chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
+ simd_context);
+ poly1305_init(&poly1305_state, b.block0, simd_context);
+
+ poly1305_update(&poly1305_state, ad, ad_len, simd_context);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
+ simd_context);
+
+ chacha20(&chacha20_state, dst, src, src_len, simd_context);
+
+ poly1305_update(&poly1305_state, dst, src_len, simd_context);
+ poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf,
+ simd_context);
+
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(src_len);
+ poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
+ simd_context);
+
+ poly1305_finish(&poly1305_state, dst + src_len, simd_context);
+
+ memzero_explicit(&chacha20_state, sizeof(chacha20_state));
+ memzero_explicit(&b, sizeof(b));
+}
+
+void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEYLEN])
+{
+ simd_context_t simd_context;
+
+ simd_context = simd_get();
+ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, nonce, key,
+ simd_context);
+ simd_put(simd_context);
+}
+EXPORT_SYMBOL(chacha20poly1305_encrypt);
+
+bool chacha20poly1305_encrypt_sg(struct scatterlist *dst,
+ struct scatterlist *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEYLEN],
+ simd_context_t simd_context)
+{
+ struct poly1305_ctx poly1305_state;
+ struct chacha20_ctx chacha20_state;
+ int ret = 0;
+ struct blkcipher_walk walk;
+ union {
+ u8 block0[POLY1305_KEY_SIZE];
+ u8 mac[POLY1305_MAC_SIZE];
+ __le64 lens[2];
+ } b = { { 0 } };
+
+ chacha20_init(&chacha20_state, key, nonce);
+ chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
+ simd_context);
+ poly1305_init(&poly1305_state, b.block0, simd_context);
+
+ poly1305_update(&poly1305_state, ad, ad_len, simd_context);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
+ simd_context);
+
+ if (likely(src_len)) {
+ blkcipher_walk_init(&walk, dst, src, src_len);
+ ret = blkcipher_walk_virt_block(&chacha20_desc, &walk,
+ CHACHA20_BLOCK_SIZE);
+ while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+ size_t chunk_len =
+ rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE);
+
+ chacha20(&chacha20_state, walk.dst.virt.addr,
+ walk.src.virt.addr, chunk_len, simd_context);
+ poly1305_update(&poly1305_state, walk.dst.virt.addr,
+ chunk_len, simd_context);
+ ret = blkcipher_walk_done(&chacha20_desc, &walk,
+ walk.nbytes % CHACHA20_BLOCK_SIZE);
+ }
+ if (walk.nbytes) {
+ chacha20(&chacha20_state, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.nbytes, simd_context);
+ poly1305_update(&poly1305_state, walk.dst.virt.addr,
+ walk.nbytes, simd_context);
+ ret = blkcipher_walk_done(&chacha20_desc, &walk, 0);
+ }
+ }
+ if (unlikely(ret))
+ goto err;
+
+ poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf,
+ simd_context);
+
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(src_len);
+ poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
+ simd_context);
+
+ poly1305_finish(&poly1305_state, b.mac, simd_context);
+ scatterwalk_map_and_copy(b.mac, dst, src_len, sizeof(b.mac), 1);
+err:
+ memzero_explicit(&chacha20_state, sizeof(chacha20_state));
+ memzero_explicit(&b, sizeof(b));
+ return !ret;
+}
+EXPORT_SYMBOL(chacha20poly1305_encrypt_sg);
+
+static inline bool
+__chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len, const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEYLEN],
+ simd_context_t simd_context)
+{
+ struct poly1305_ctx poly1305_state;
+ struct chacha20_ctx chacha20_state;
+ int ret;
+ size_t dst_len;
+ union {
+ u8 block0[POLY1305_KEY_SIZE];
+ u8 mac[POLY1305_MAC_SIZE];
+ __le64 lens[2];
+ } b = { { 0 } };
+
+ if (unlikely(src_len < POLY1305_MAC_SIZE))
+ return false;
+
+ chacha20_init(&chacha20_state, key, nonce);
+ chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
+ simd_context);
+ poly1305_init(&poly1305_state, b.block0, simd_context);
+
+ poly1305_update(&poly1305_state, ad, ad_len, simd_context);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
+ simd_context);
+
+ dst_len = src_len - POLY1305_MAC_SIZE;
+ poly1305_update(&poly1305_state, src, dst_len, simd_context);
+ poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf,
+ simd_context);
+
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(dst_len);
+ poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
+ simd_context);
+
+ poly1305_finish(&poly1305_state, b.mac, simd_context);
+
+ ret = crypto_memneq(b.mac, src + dst_len, POLY1305_MAC_SIZE);
+ if (likely(!ret))
+ chacha20(&chacha20_state, dst, src, dst_len, simd_context);
+
+ memzero_explicit(&chacha20_state, sizeof(chacha20_state));
+ memzero_explicit(&b, sizeof(b));
+
+ return !ret;
+}
+
+bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEYLEN])
+{
+ simd_context_t simd_context;
+ bool ret;
+
+ simd_context = simd_get();
+ ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, nonce,
+ key, simd_context);
+ simd_put(simd_context);
+ return ret;
+}
+EXPORT_SYMBOL(chacha20poly1305_decrypt);
+
+bool chacha20poly1305_decrypt_sg(struct scatterlist *dst,
+ struct scatterlist *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEYLEN],
+ simd_context_t simd_context)
+{
+ struct poly1305_ctx poly1305_state;
+ struct chacha20_ctx chacha20_state;
+ struct blkcipher_walk walk;
+ int ret = 0;
+ size_t dst_len;
+ union {
+ u8 block0[POLY1305_KEY_SIZE];
+ struct {
+ u8 read_mac[POLY1305_MAC_SIZE];
+ u8 computed_mac[POLY1305_MAC_SIZE];
+ };
+ __le64 lens[2];
+ } b = { { 0 } };
+
+ if (unlikely(src_len < POLY1305_MAC_SIZE))
+ return false;
+
+ chacha20_init(&chacha20_state, key, nonce);
+ chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
+ simd_context);
+ poly1305_init(&poly1305_state, b.block0, simd_context);
+
+ poly1305_update(&poly1305_state, ad, ad_len, simd_context);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
+ simd_context);
+
+ dst_len = src_len - POLY1305_MAC_SIZE;
+ if (likely(dst_len)) {
+ blkcipher_walk_init(&walk, dst, src, dst_len);
+ ret = blkcipher_walk_virt_block(&chacha20_desc, &walk,
+ CHACHA20_BLOCK_SIZE);
+ while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+ size_t chunk_len =
+ rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE);
+
+ poly1305_update(&poly1305_state, walk.src.virt.addr,
+ chunk_len, simd_context);
+ chacha20(&chacha20_state, walk.dst.virt.addr,
+ walk.src.virt.addr, chunk_len, simd_context);
+ ret = blkcipher_walk_done(&chacha20_desc, &walk,
+ walk.nbytes % CHACHA20_BLOCK_SIZE);
+ }
+ if (walk.nbytes) {
+ poly1305_update(&poly1305_state, walk.src.virt.addr,
+ walk.nbytes, simd_context);
+ chacha20(&chacha20_state, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.nbytes, simd_context);
+ ret = blkcipher_walk_done(&chacha20_desc, &walk, 0);
+ }
+ }
+ if (unlikely(ret))
+ goto err;
+
+ poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf,
+ simd_context);
+
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(dst_len);
+ poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
+ simd_context);
+
+ poly1305_finish(&poly1305_state, b.computed_mac, simd_context);
+
+ scatterwalk_map_and_copy(b.read_mac, src, dst_len, POLY1305_MAC_SIZE, 0);
+ ret = crypto_memneq(b.read_mac, b.computed_mac, POLY1305_MAC_SIZE);
+err:
+ memzero_explicit(&chacha20_state, sizeof(chacha20_state));
+ memzero_explicit(&b, sizeof(b));
+ return !ret;
+}
+EXPORT_SYMBOL(chacha20poly1305_decrypt_sg);
+
+void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u8 nonce[XCHACHA20POLY1305_NONCELEN],
+ const u8 key[CHACHA20POLY1305_KEYLEN])
+{
+ simd_context_t simd_context = simd_get();
+ u8 derived_key[CHACHA20POLY1305_KEYLEN] __aligned(16);
+
+ hchacha20(derived_key, nonce, key, simd_context);
+ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
+ get_unaligned_le64(nonce + 16),
+ derived_key, simd_context);
+ memzero_explicit(derived_key, CHACHA20POLY1305_KEYLEN);
+ simd_put(simd_context);
+}
+EXPORT_SYMBOL(xchacha20poly1305_encrypt);
+
+bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u8 nonce[XCHACHA20POLY1305_NONCELEN],
+ const u8 key[CHACHA20POLY1305_KEYLEN])
+{
+ bool ret;
+ simd_context_t simd_context = simd_get();
+ u8 derived_key[CHACHA20POLY1305_KEYLEN] __aligned(16);
+
+ hchacha20(derived_key, nonce, key, simd_context);
+ ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
+ get_unaligned_le64(nonce + 16),
+ derived_key, simd_context);
+ memzero_explicit(derived_key, CHACHA20POLY1305_KEYLEN);
+ simd_put(simd_context);
+ return ret;
+}
+EXPORT_SYMBOL(xchacha20poly1305_decrypt);
+
+#include "selftest/chacha20poly1305.h"
diff --git a/src/crypto/zinc/curve25519/curve25519-arm-glue.h b/src/crypto/zinc/curve25519/curve25519-arm-glue.h
new file mode 100644
index 0000000..36e8002
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519-arm-glue.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/curve25519.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && __LINUX_ARM_ARCH__ == 7
+#define ARM_USE_NEON
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE],
+ const u8 basepoint[CURVE25519_POINT_SIZE]);
+#endif
+
+static bool curve25519_use_neon __ro_after_init;
+
+void __init curve25519_fpu_init(void)
+{
+ curve25519_use_neon = elf_hwcap & HWCAP_NEON;
+}
+
+static inline bool curve25519_arch(u8 mypublic[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE],
+ const u8 basepoint[CURVE25519_POINT_SIZE])
+{
+#ifdef ARM_USE_NEON
+ if (curve25519_use_neon && may_use_simd()) {
+ kernel_neon_begin();
+ curve25519_neon(mypublic, secret, basepoint);
+ kernel_neon_end();
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline bool curve25519_base_arch(u8 pub[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE])
+{
+ return false;
+}
+
+#define HAVE_CURVE25519_ARCH_IMPLEMENTATION
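curve25519_arch() declines by returning false whenever NEON cannot be used, and the caller in curve25519.c is expected to fall back to a portable implementation, mirroring the chacha20_arch() dispatch earlier in this commit. Below is a user-space model of that control flow with stub bodies and illustrative names; curve25519_generic_stub does no real curve arithmetic.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define CURVE25519_POINT_SIZE 32

static bool have_neon;  /* stands in for curve25519_use_neon */

/* Models curve25519_arch() above: decline (return false) when NEON is
 * unavailable, so the caller knows to take the portable path. In the kernel,
 * the accelerated call is bracketed by kernel_neon_begin()/kernel_neon_end().
 */
static bool curve25519_arch_model(uint8_t out[CURVE25519_POINT_SIZE],
                                  const uint8_t secret[CURVE25519_POINT_SIZE],
                                  const uint8_t point[CURVE25519_POINT_SIZE])
{
        if (!have_neon)
                return false;
        (void)out; (void)secret; (void)point;
        return true;
}

/* Stub standing in for the portable fallback (curve25519-fiat32.h or
 * curve25519-hacl64.h); it performs no real scalar multiplication.
 */
static void curve25519_generic_stub(uint8_t out[CURVE25519_POINT_SIZE],
                                    const uint8_t secret[CURVE25519_POINT_SIZE],
                                    const uint8_t point[CURVE25519_POINT_SIZE])
{
        (void)secret; (void)point;
        memset(out, 0, CURVE25519_POINT_SIZE);
}

static void curve25519_model(uint8_t out[CURVE25519_POINT_SIZE],
                             const uint8_t secret[CURVE25519_POINT_SIZE],
                             const uint8_t point[CURVE25519_POINT_SIZE])
{
        if (!curve25519_arch_model(out, secret, point))
                curve25519_generic_stub(out, secret, point);
}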
diff --git a/src/crypto/zinc/curve25519/curve25519-arm.S b/src/crypto/zinc/curve25519/curve25519-arm.S
new file mode 100644
index 0000000..0020ed2
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519-arm.S
@@ -0,0 +1,2095 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * has been built from SUPERCOP's curve25519/neon2/scalarmult.pq using qhasm,
+ * but has subsequently been manually reworked for use in kernel space.
+ */
+
+#if __LINUX_ARM_ARCH__ == 7
+#include <linux/linkage.h>
+
+.text
+.fpu neon
+.align 4
+
+ENTRY(curve25519_neon)
+ push {r4-r11, lr}
+ mov ip, sp
+ sub sp, sp, #704
+ and sp, sp, #0xfffffff0
+ add r3, sp, #0
+ movw r4, #0
+ movw r5, #254
+ vmov.i32 q0, #1
+ vshr.u64 q1, q0, #7
+ vshr.u64 q0, q0, #8
+ vmov.i32 d4, #19
+ vmov.i32 d5, #38
+ add r6, sp, #480
+ vst1.8 {d2-d3}, [r6, : 128]
+ add r6, sp, #496
+ vst1.8 {d0-d1}, [r6, : 128]
+ add r6, sp, #512
+ vst1.8 {d4-d5}, [r6, : 128]
+ add r6, r3, #0
+ vmov.i32 q2, #0
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+ add r6, r3, #0
+ movw r7, #960
+ sub r7, r7, #2
+ neg r7, r7
+ sub r7, r7, r7, LSL #7
+ str r7, [r6]
+ add r6, sp, #672
+ vld1.8 {d4-d5}, [r1]!
+ vld1.8 {d6-d7}, [r1]
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d6-d7}, [r6, : 128]
+ sub r1, r6, #16
+ ldrb r6, [r1]
+ and r6, r6, #248
+ strb r6, [r1]
+ ldrb r6, [r1, #31]
+ and r6, r6, #127
+ orr r6, r6, #64
+ strb r6, [r1, #31]
+ vmov.i64 q2, #0xffffffff
+ vshr.u64 q3, q2, #7
+ vshr.u64 q2, q2, #6
+ vld1.8 {d8}, [r2]
+ vld1.8 {d10}, [r2]
+ add r2, r2, #6
+ vld1.8 {d12}, [r2]
+ vld1.8 {d14}, [r2]
+ add r2, r2, #6
+ vld1.8 {d16}, [r2]
+ add r2, r2, #4
+ vld1.8 {d18}, [r2]
+ vld1.8 {d20}, [r2]
+ add r2, r2, #6
+ vld1.8 {d22}, [r2]
+ add r2, r2, #2
+ vld1.8 {d24}, [r2]
+ vld1.8 {d26}, [r2]
+ vshr.u64 q5, q5, #26
+ vshr.u64 q6, q6, #3
+ vshr.u64 q7, q7, #29
+ vshr.u64 q8, q8, #6
+ vshr.u64 q10, q10, #25
+ vshr.u64 q11, q11, #3
+ vshr.u64 q12, q12, #12
+ vshr.u64 q13, q13, #38
+ vand q4, q4, q2
+ vand q6, q6, q2
+ vand q8, q8, q2
+ vand q10, q10, q2
+ vand q2, q12, q2
+ vand q5, q5, q3
+ vand q7, q7, q3
+ vand q9, q9, q3
+ vand q11, q11, q3
+ vand q3, q13, q3
+ add r2, r3, #48
+ vadd.i64 q12, q4, q1
+ vadd.i64 q13, q10, q1
+ vshr.s64 q12, q12, #26
+ vshr.s64 q13, q13, #26
+ vadd.i64 q5, q5, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q14, q5, q0
+ vadd.i64 q11, q11, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q11, q0
+ vsub.i64 q4, q4, q12
+ vshr.s64 q12, q14, #25
+ vsub.i64 q10, q10, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q14, q6, q1
+ vadd.i64 q2, q2, q13
+ vsub.i64 q5, q5, q12
+ vshr.s64 q12, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q1
+ vadd.i64 q7, q7, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q15, q7, q0
+ vsub.i64 q11, q11, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q12
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q3, q0
+ vadd.i64 q8, q8, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q15, q8, q1
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q7, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q9, q9, q12
+ vtrn.32 d12, d14
+ vshl.i64 q12, q12, #26
+ vtrn.32 d13, d15
+ vadd.i64 q0, q9, q0
+ vadd.i64 q4, q4, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q6, q13, #4
+ vsub.i64 q7, q8, q12
+ vshr.s64 q0, q0, #25
+ vadd.i64 q4, q4, q6
+ vadd.i64 q6, q10, q0
+ vshl.i64 q0, q0, #25
+ vadd.i64 q8, q6, q1
+ vadd.i64 q4, q4, q13
+ vshl.i64 q10, q13, #25
+ vadd.i64 q1, q4, q1
+ vsub.i64 q0, q9, q0
+ vshr.s64 q8, q8, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d14, d0
+ vshr.s64 q1, q1, #26
+ vtrn.32 d15, d1
+ vadd.i64 q0, q11, q8
+ vst1.8 d14, [r2, : 64]
+ vshl.i64 q7, q8, #26
+ vadd.i64 q5, q5, q1
+ vtrn.32 d4, d6
+ vshl.i64 q1, q1, #26
+ vtrn.32 d5, d7
+ vsub.i64 q3, q6, q7
+ add r2, r2, #16
+ vsub.i64 q1, q4, q1
+ vst1.8 d4, [r2, : 64]
+ vtrn.32 d6, d0
+ vtrn.32 d7, d1
+ sub r2, r2, #8
+ vtrn.32 d2, d10
+ vtrn.32 d3, d11
+ vst1.8 d6, [r2, : 64]
+ sub r2, r2, #24
+ vst1.8 d2, [r2, : 64]
+ add r2, r3, #96
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #144
+ vmov.i32 q0, #0
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #240
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #48
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+.Lmainloop:
+ mov r2, r5, LSR #3
+ and r6, r5, #7
+ ldrb r2, [r1, r2]
+ mov r2, r2, LSR r6
+ and r2, r2, #1
+ str r5, [sp, #456]
+ eor r4, r4, r2
+ str r2, [sp, #460]
+ neg r2, r4
+ add r4, r3, #96
+ add r5, r3, #192
+ add r6, r3, #144
+ vld1.8 {d8-d9}, [r4, : 128]!
+ add r7, r3, #240
+ vld1.8 {d10-d11}, [r5, : 128]!
+ veor q6, q4, q5
+ vld1.8 {d14-d15}, [r6, : 128]!
+ vdup.i32 q8, r2
+ vld1.8 {d18-d19}, [r7, : 128]!
+ veor q10, q7, q9
+ vld1.8 {d22-d23}, [r4, : 128]!
+ vand q6, q6, q8
+ vld1.8 {d24-d25}, [r5, : 128]!
+ vand q10, q10, q8
+ vld1.8 {d26-d27}, [r6, : 128]!
+ veor q4, q4, q6
+ vld1.8 {d28-d29}, [r7, : 128]!
+ veor q5, q5, q6
+ vld1.8 {d0}, [r4, : 64]
+ veor q6, q7, q10
+ vld1.8 {d2}, [r5, : 64]
+ veor q7, q9, q10
+ vld1.8 {d4}, [r6, : 64]
+ veor q9, q11, q12
+ vld1.8 {d6}, [r7, : 64]
+ veor q10, q0, q1
+ sub r2, r4, #32
+ vand q9, q9, q8
+ sub r4, r5, #32
+ vand q10, q10, q8
+ sub r5, r6, #32
+ veor q11, q11, q9
+ sub r6, r7, #32
+ veor q0, q0, q10
+ veor q9, q12, q9
+ veor q1, q1, q10
+ veor q10, q13, q14
+ veor q12, q2, q3
+ vand q10, q10, q8
+ vand q8, q12, q8
+ veor q12, q13, q10
+ veor q2, q2, q8
+ veor q10, q14, q10
+ veor q3, q3, q8
+ vadd.i32 q8, q4, q6
+ vsub.i32 q4, q4, q6
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vadd.i32 q6, q11, q12
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q4, q11, q12
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vadd.i32 q6, q0, q2
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q0, q0, q2
+ vst1.8 d12, [r2, : 64]
+ vadd.i32 q2, q5, q7
+ vst1.8 d0, [r5, : 64]
+ vsub.i32 q0, q5, q7
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q9, q10
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q9, q10
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q1, q3
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q1, q3
+ vst1.8 d4, [r4, : 64]
+ vst1.8 d0, [r6, : 64]
+ add r2, sp, #512
+ add r4, r3, #96
+ add r5, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ add r2, sp, #496
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #288
+ vshl.i64 q12, q12, #25
+ add r4, r3, #336
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #240
+ add r4, r3, #96
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #144
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #192
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q6, q13, #1
+ add r2, sp, #544
+ vst1.8 {d20-d21}, [r2, : 128]
+ vshl.i32 q10, q14, #1
+ add r2, sp, #560
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ add r2, sp, #576
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ add r2, sp, #592
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ add r2, sp, #608
+ vst1.8 {d20-d21}, [r2, : 128]
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ add r2, sp, #624
+ vst1.8 {d10-d11}, [r2, : 128]
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ add r2, sp, #640
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ add r2, sp, #544
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ add r2, sp, #576
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #144
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, r3, #288
+ add r4, r3, #336
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vsub.i32 q1, q1, q2
+ add r5, r3, #240
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vsub.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #144
+ add r4, r3, #96
+ add r5, r3, #144
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q2, q0, q1
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vsub.i32 q4, q1, q3
+ vadd.i32 q1, q1, q3
+ vld1.8 {d6}, [r2, : 64]
+ vld1.8 {d10}, [r4, : 64]
+ vsub.i32 q6, q3, q5
+ vadd.i32 q3, q3, q5
+ vst1.8 {d4-d5}, [r5, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d12, [r5, : 64]
+ vst1.8 d6, [r6, : 64]
+ add r2, r3, #0
+ add r4, r3, #240
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #336
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #288
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q6, q13, #1
+ add r2, sp, #544
+ vst1.8 {d20-d21}, [r2, : 128]
+ vshl.i32 q10, q14, #1
+ add r2, sp, #560
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ add r2, sp, #576
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ add r2, sp, #592
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ add r2, sp, #608
+ vst1.8 {d20-d21}, [r2, : 128]
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ add r2, sp, #624
+ vst1.8 {d10-d11}, [r2, : 128]
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ add r2, sp, #640
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ add r2, sp, #544
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ add r2, sp, #576
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #288
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, sp, #512
+ add r4, r3, #144
+ add r5, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ add r2, sp, #496
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #144
+ vshl.i64 q12, q12, #25
+ add r4, r3, #192
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #336
+ add r4, r3, #288
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q1, q1, q2
+ add r5, r3, #288
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vadd.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #48
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #288
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #240
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q6, q13, #1
+ add r2, sp, #544
+ vst1.8 {d20-d21}, [r2, : 128]
+ vshl.i32 q10, q14, #1
+ add r2, sp, #560
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ add r2, sp, #576
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ add r2, sp, #592
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ add r2, sp, #608
+ vst1.8 {d20-d21}, [r2, : 128]
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ add r2, sp, #624
+ vst1.8 {d10-d11}, [r2, : 128]
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ add r2, sp, #640
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ add r2, sp, #544
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ add r2, sp, #576
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #240
+ vshl.i64 q7, q7, #25
+ add r4, r3, #144
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ ldr r2, [sp, #456]
+ ldr r4, [sp, #460]
+ subs r5, r2, #1
+ bge .Lmainloop
+ add r1, r3, #144
+ add r2, r3, #336
+ vld1.8 {d0-d1}, [r1, : 128]!
+ vld1.8 {d2-d3}, [r1, : 128]!
+ vld1.8 {d4}, [r1, : 64]
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 d4, [r2, : 64]
+ movw r1, #0
+.Linvertloop:
+ add r2, r3, #144
+ movw r4, #0
+ movw r5, #2
+ cmp r1, #1
+ moveq r5, #1
+ addeq r2, r3, #336
+ addeq r4, r3, #48
+ cmp r1, #2
+ moveq r5, #1
+ addeq r2, r3, #48
+ cmp r1, #3
+ moveq r5, #5
+ addeq r4, r3, #336
+ cmp r1, #4
+ moveq r5, #10
+ cmp r1, #5
+ moveq r5, #20
+ cmp r1, #6
+ moveq r5, #10
+ addeq r2, r3, #336
+ addeq r4, r3, #336
+ cmp r1, #7
+ moveq r5, #50
+ cmp r1, #8
+ moveq r5, #100
+ cmp r1, #9
+ moveq r5, #50
+ addeq r2, r3, #336
+ cmp r1, #10
+ moveq r5, #5
+ addeq r2, r3, #48
+ cmp r1, #11
+ moveq r5, #0
+ addeq r2, r3, #96
+ add r6, r3, #144
+ add r7, r3, #288
+ vld1.8 {d0-d1}, [r6, : 128]!
+ vld1.8 {d2-d3}, [r6, : 128]!
+ vld1.8 {d4}, [r6, : 64]
+ vst1.8 {d0-d1}, [r7, : 128]!
+ vst1.8 {d2-d3}, [r7, : 128]!
+ vst1.8 d4, [r7, : 64]
+ cmp r5, #0
+ beq .Lskipsquaringloop
+.Lsquaringloop:
+ add r6, r3, #288
+ add r7, r3, #288
+ add r8, r3, #288
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r7, : 128]!
+ vld1.8 {d6-d7}, [r7, : 128]!
+ vld1.8 {d9}, [r7, : 64]
+ vld1.8 {d10-d11}, [r6, : 128]!
+ add r7, sp, #384
+ vld1.8 {d12-d13}, [r6, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r6, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r7, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r7, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r7, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r7, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r7, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r6, r7, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r6, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r6, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r6, r6, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r6, : 64]
+ sub r6, r6, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r7, sp, #480
+ vld1.8 {d14-d15}, [r7, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r6, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r6, : 64]!
+ add r6, sp, #496
+ vld1.8 {d20-d21}, [r6, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r6, r8, #8
+ vst1.8 d0, [r6, : 64]
+ vsub.i64 d19, d19, d9
+ add r6, r6, #16
+ vst1.8 d16, [r6, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r6, r6, #8
+ vst1.8 d6, [r6, : 64]
+ add r6, r6, #16
+ vst1.8 d18, [r6, : 64]
+ vzip.i32 q2, q5
+ sub r6, r6, #32
+ vst1.8 d4, [r6, : 64]
+ subs r5, r5, #1
+ bhi .Lsquaringloop
+.Lskipsquaringloop:
+ mov r2, r2
+ add r5, r3, #288
+ add r6, r3, #144
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vld1.8 {d6-d7}, [r5, : 128]!
+ vld1.8 {d9}, [r5, : 64]
+ vld1.8 {d10-d11}, [r2, : 128]!
+ add r5, sp, #384
+ vld1.8 {d12-d13}, [r2, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r2, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r5, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r5, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r5, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r5, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r5, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r2, r5, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r2, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r2, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r2, r2, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r2, : 64]
+ sub r2, r2, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r5, sp, #480
+ vld1.8 {d14-d15}, [r5, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r2, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r2, : 64]!
+ add r2, sp, #496
+ vld1.8 {d20-d21}, [r2, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r2, r6, #8
+ vst1.8 d0, [r2, : 64]
+ vsub.i64 d19, d19, d9
+ add r2, r2, #16
+ vst1.8 d16, [r2, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r2, r2, #8
+ vst1.8 d6, [r2, : 64]
+ add r2, r2, #16
+ vst1.8 d18, [r2, : 64]
+ vzip.i32 q2, q5
+ sub r2, r2, #32
+ vst1.8 d4, [r2, : 64]
+ cmp r4, #0
+ beq .Lskippostcopy
+ add r2, r3, #144
+ mov r4, r4
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskippostcopy:
+ cmp r1, #1
+ bne .Lskipfinalcopy
+ add r2, r3, #288
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskipfinalcopy:
+ add r1, r1, #1
+ cmp r1, #12
+ blo .Linvertloop
+ add r1, r3, #144
+ ldr r2, [r1], #4
+ ldr r3, [r1], #4
+ ldr r4, [r1], #4
+ ldr r5, [r1], #4
+ ldr r6, [r1], #4
+ ldr r7, [r1], #4
+ ldr r8, [r1], #4
+ ldr r9, [r1], #4
+ ldr r10, [r1], #4
+ ldr r1, [r1]
+ add r11, r1, r1, LSL #4
+ add r11, r11, r1, LSL #1
+ add r11, r11, #16777216
+ mov r11, r11, ASR #25
+ add r11, r11, r2
+ mov r11, r11, ASR #26
+ add r11, r11, r3
+ mov r11, r11, ASR #25
+ add r11, r11, r4
+ mov r11, r11, ASR #26
+ add r11, r11, r5
+ mov r11, r11, ASR #25
+ add r11, r11, r6
+ mov r11, r11, ASR #26
+ add r11, r11, r7
+ mov r11, r11, ASR #25
+ add r11, r11, r8
+ mov r11, r11, ASR #26
+ add r11, r11, r9
+ mov r11, r11, ASR #25
+ add r11, r11, r10
+ mov r11, r11, ASR #26
+ add r11, r11, r1
+ mov r11, r11, ASR #25
+ add r2, r2, r11
+ add r2, r2, r11, LSL #1
+ add r2, r2, r11, LSL #4
+ mov r11, r2, ASR #26
+ add r3, r3, r11
+ sub r2, r2, r11, LSL #26
+ mov r11, r3, ASR #25
+ add r4, r4, r11
+ sub r3, r3, r11, LSL #25
+ mov r11, r4, ASR #26
+ add r5, r5, r11
+ sub r4, r4, r11, LSL #26
+ mov r11, r5, ASR #25
+ add r6, r6, r11
+ sub r5, r5, r11, LSL #25
+ mov r11, r6, ASR #26
+ add r7, r7, r11
+ sub r6, r6, r11, LSL #26
+ mov r11, r7, ASR #25
+ add r8, r8, r11
+ sub r7, r7, r11, LSL #25
+ mov r11, r8, ASR #26
+ add r9, r9, r11
+ sub r8, r8, r11, LSL #26
+ mov r11, r9, ASR #25
+ add r10, r10, r11
+ sub r9, r9, r11, LSL #25
+ mov r11, r10, ASR #26
+ add r1, r1, r11
+ sub r10, r10, r11, LSL #26
+ mov r11, r1, ASR #25
+ sub r1, r1, r11, LSL #25
+ add r2, r2, r3, LSL #26
+ mov r3, r3, LSR #6
+ add r3, r3, r4, LSL #19
+ mov r4, r4, LSR #13
+ add r4, r4, r5, LSL #13
+ mov r5, r5, LSR #19
+ add r5, r5, r6, LSL #6
+ add r6, r7, r8, LSL #25
+ mov r7, r8, LSR #7
+ add r7, r7, r9, LSL #19
+ mov r8, r9, LSR #13
+ add r8, r8, r10, LSL #12
+ mov r9, r10, LSR #20
+ add r1, r9, r1, LSL #6
+ str r2, [r0]
+ str r3, [r0, #4]
+ str r4, [r0, #8]
+ str r5, [r0, #12]
+ str r6, [r0, #16]
+ str r7, [r0, #20]
+ str r8, [r0, #24]
+ str r1, [r0, #28]
+ movw r0, #0
+ mov sp, ip
+ pop {r4-r11, pc}
+ENDPROC(curve25519_neon)
+
+#endif
diff --git a/src/crypto/zinc/curve25519/curve25519-fiat32.h b/src/crypto/zinc/curve25519/curve25519-fiat32.h
new file mode 100644
index 0000000..8fa327c
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519-fiat32.h
@@ -0,0 +1,862 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2016 The fiat-crypto Authors.
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is a machine-generated formally verified implementation of Curve25519
+ * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
+ * machine generated, it has been tweaked to be suitable for use in the kernel.
+ * It is optimized for 32-bit machines and machines that cannot work efficiently
+ * with 128-bit integer types.
+ */
+
+/* fe means field element. Here the field is \Z/(2^255-19). An element t,
+ * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
+ * t[3]+2^102 t[4]+...+2^230 t[9].
+ * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
+ * Multiplication and carrying produce fe from fe_loose.
+ */
+typedef struct fe { u32 v[10]; } fe;
+
+/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc
+ * Addition and subtraction produce fe_loose from (fe, fe).
+ */
+typedef struct fe_loose { u32 v[10]; } fe_loose;
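
As an aside (not part of the imported patch itself), the ten limbs alternate between 26 and 25 bits, so their bit offsets are 0, 26, 51, 77, 102, 128, 153, 179, 204 and 230, together covering the 255 bits of 2^255 - 19. A minimal standalone sketch that prints this layout, assuming nothing beyond the C standard library:

#include <stdio.h>

int main(void)
{
	static const int width[10] = { 26, 25, 26, 25, 26, 25, 26, 25, 26, 25 };
	int offset = 0, i;

	for (i = 0; i < 10; ++i) {
		/* t[i] contributes t[i] * 2^offset to the represented integer */
		printf("t[%d]: %d bits at 2^%d\n", i, width[i], offset);
		offset += width[i];
	}
	/* offset is now 255, matching the size of the field 2^255 - 19 */
	return 0;
}
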
+
+static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s)
+{
+ /* Ignores top bit of s. */
+ u32 a0 = get_unaligned_le32(s);
+ u32 a1 = get_unaligned_le32(s+4);
+ u32 a2 = get_unaligned_le32(s+8);
+ u32 a3 = get_unaligned_le32(s+12);
+ u32 a4 = get_unaligned_le32(s+16);
+ u32 a5 = get_unaligned_le32(s+20);
+ u32 a6 = get_unaligned_le32(s+24);
+ u32 a7 = get_unaligned_le32(s+28);
+ h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */
+ h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */
+ h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */
+ h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */
+ h[4] = (a3>> 6); /* (32- 6) = 26 */
+ h[5] = a4&((1<<25)-1); /* 25 */
+ h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */
+ h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */
+ h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */
+ h[9] = (a7>> 6)&((1<<25)-1); /* 25 */
+}
+
+static __always_inline void fe_frombytes(fe *h, const u8 *s)
+{
+ fe_frombytes_impl(h->v, s);
+}
+
+static __always_inline u8 /*bool*/
+addcarryx_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 25 bits of result and 1 bit of carry
+ * (26 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a + b + c;
+ *low = x & ((1 << 25) - 1);
+ return (x >> 25) & 1;
+}
+
+static __always_inline u8 /*bool*/
+addcarryx_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 26 bits of result and 1 bit of carry
+ * (27 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a + b + c;
+ *low = x & ((1 << 26) - 1);
+ return (x >> 26) & 1;
+}
+
+static __always_inline u8 /*bool*/
+subborrow_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 25 bits of result and 1 bit of borrow
+ * (26 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a - b - c;
+ *low = x & ((1 << 25) - 1);
+ return x >> 31;
+}
+
+static __always_inline u8 /*bool*/
+subborrow_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 26 bits of result and 1 bit of borrow
+ * (27 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a - b - c;
+ *low = x & ((1 << 26) - 1);
+ return x >> 31;
+}
+
+static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz)
+{
+ t = -!!t; /* all set if nonzero, 0 if 0 */
+ return (t&nz) | ((~t)&z);
+}
+
+static __always_inline void fe_freeze(u32 out[10], const u32 in1[10])
+{
+ { const u32 x17 = in1[9];
+ { const u32 x18 = in1[8];
+ { const u32 x16 = in1[7];
+ { const u32 x14 = in1[6];
+ { const u32 x12 = in1[5];
+ { const u32 x10 = in1[4];
+ { const u32 x8 = in1[3];
+ { const u32 x6 = in1[2];
+ { const u32 x4 = in1[1];
+ { const u32 x2 = in1[0];
+ { u32 x20; u8/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
+ { u32 x23; u8/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
+ { u32 x26; u8/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
+ { u32 x29; u8/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
+ { u32 x32; u8/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
+ { u32 x35; u8/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
+ { u32 x38; u8/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
+ { u32 x41; u8/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
+ { u32 x44; u8/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
+ { u32 x47; u8/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
+ { u32 x49 = cmovznz32(x48, 0x0, 0xffffffff);
+ { u32 x50 = (x49 & 0x3ffffed);
+ { u32 x52; u8/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
+ { u32 x54 = (x49 & 0x1ffffff);
+ { u32 x56; u8/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
+ { u32 x58 = (x49 & 0x3ffffff);
+ { u32 x60; u8/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
+ { u32 x62 = (x49 & 0x1ffffff);
+ { u32 x64; u8/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
+ { u32 x66 = (x49 & 0x3ffffff);
+ { u32 x68; u8/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
+ { u32 x70 = (x49 & 0x1ffffff);
+ { u32 x72; u8/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
+ { u32 x74 = (x49 & 0x3ffffff);
+ { u32 x76; u8/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
+ { u32 x78 = (x49 & 0x1ffffff);
+ { u32 x80; u8/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
+ { u32 x82 = (x49 & 0x3ffffff);
+ { u32 x84; u8/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
+ { u32 x86 = (x49 & 0x1ffffff);
+ { u32 x88; addcarryx_u25(x85, x47, x86, &x88);
+ out[0] = x52;
+ out[1] = x56;
+ out[2] = x60;
+ out[3] = x64;
+ out[4] = x68;
+ out[5] = x72;
+ out[6] = x76;
+ out[7] = x80;
+ out[8] = x84;
+ out[9] = x88;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_tobytes(u8 s[32], const fe *f)
+{
+ u32 h[10];
+ fe_freeze(h, f->v);
+ s[0] = h[0] >> 0;
+ s[1] = h[0] >> 8;
+ s[2] = h[0] >> 16;
+ s[3] = (h[0] >> 24) | (h[1] << 2);
+ s[4] = h[1] >> 6;
+ s[5] = h[1] >> 14;
+ s[6] = (h[1] >> 22) | (h[2] << 3);
+ s[7] = h[2] >> 5;
+ s[8] = h[2] >> 13;
+ s[9] = (h[2] >> 21) | (h[3] << 5);
+ s[10] = h[3] >> 3;
+ s[11] = h[3] >> 11;
+ s[12] = (h[3] >> 19) | (h[4] << 6);
+ s[13] = h[4] >> 2;
+ s[14] = h[4] >> 10;
+ s[15] = h[4] >> 18;
+ s[16] = h[5] >> 0;
+ s[17] = h[5] >> 8;
+ s[18] = h[5] >> 16;
+ s[19] = (h[5] >> 24) | (h[6] << 1);
+ s[20] = h[6] >> 7;
+ s[21] = h[6] >> 15;
+ s[22] = (h[6] >> 23) | (h[7] << 3);
+ s[23] = h[7] >> 5;
+ s[24] = h[7] >> 13;
+ s[25] = (h[7] >> 21) | (h[8] << 4);
+ s[26] = h[8] >> 4;
+ s[27] = h[8] >> 12;
+ s[28] = (h[8] >> 20) | (h[9] << 6);
+ s[29] = h[9] >> 2;
+ s[30] = h[9] >> 10;
+ s[31] = h[9] >> 18;
+}
+
+/* h = f */
+static __always_inline void fe_copy(fe *h, const fe *f)
+{
+ memmove(h, f, sizeof(u32) * 10);
+}
+
+static __always_inline void fe_copy_lt(fe_loose *h, const fe *f)
+{
+ memmove(h, f, sizeof(u32) * 10);
+}
+
+/* h = 0 */
+static __always_inline void fe_0(fe *h)
+{
+ memset(h, 0, sizeof(u32) * 10);
+}
+
+/* h = 1 */
+static __always_inline void fe_1(fe *h)
+{
+ memset(h, 0, sizeof(u32) * 10);
+ h->v[0] = 1;
+}
+
+static void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = in2[9];
+ { const u32 x39 = in2[8];
+ { const u32 x37 = in2[7];
+ { const u32 x35 = in2[6];
+ { const u32 x33 = in2[5];
+ { const u32 x31 = in2[4];
+ { const u32 x29 = in2[3];
+ { const u32 x27 = in2[2];
+ { const u32 x25 = in2[1];
+ { const u32 x23 = in2[0];
+ out[0] = (x5 + x23);
+ out[1] = (x7 + x25);
+ out[2] = (x9 + x27);
+ out[3] = (x11 + x29);
+ out[4] = (x13 + x31);
+ out[5] = (x15 + x33);
+ out[6] = (x17 + x35);
+ out[7] = (x19 + x37);
+ out[8] = (x21 + x39);
+ out[9] = (x20 + x38);
+ }}}}}}}}}}}}}}}}}}}}
+}
+
+/* h = f + g
+ * Can overlap h with f or g.
+ */
+static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g)
+{
+ fe_add_impl(h->v, f->v, g->v);
+}
+
+static void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = in2[9];
+ { const u32 x39 = in2[8];
+ { const u32 x37 = in2[7];
+ { const u32 x35 = in2[6];
+ { const u32 x33 = in2[5];
+ { const u32 x31 = in2[4];
+ { const u32 x29 = in2[3];
+ { const u32 x27 = in2[2];
+ { const u32 x25 = in2[1];
+ { const u32 x23 = in2[0];
+ out[0] = ((0x7ffffda + x5) - x23);
+ out[1] = ((0x3fffffe + x7) - x25);
+ out[2] = ((0x7fffffe + x9) - x27);
+ out[3] = ((0x3fffffe + x11) - x29);
+ out[4] = ((0x7fffffe + x13) - x31);
+ out[5] = ((0x3fffffe + x15) - x33);
+ out[6] = ((0x7fffffe + x17) - x35);
+ out[7] = ((0x3fffffe + x19) - x37);
+ out[8] = ((0x7fffffe + x21) - x39);
+ out[9] = ((0x3fffffe + x20) - x38);
+ }}}}}}}}}}}}}}}}}}}}
+}
+
+/* h = f - g
+ * Can overlap h with f or g.
+ */
+static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g)
+{
+ fe_sub_impl(h->v, f->v, g->v);
+}
+
+static void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = in2[9];
+ { const u32 x39 = in2[8];
+ { const u32 x37 = in2[7];
+ { const u32 x35 = in2[6];
+ { const u32 x33 = in2[5];
+ { const u32 x31 = in2[4];
+ { const u32 x29 = in2[3];
+ { const u32 x27 = in2[2];
+ { const u32 x25 = in2[1];
+ { const u32 x23 = in2[0];
+ { u64 x40 = ((u64)x23 * x5);
+ { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
+ { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
+ { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
+ { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
+ { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
+ { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
+ { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
+ { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
+ { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
+ { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
+ { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
+ { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
+ { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
+ { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
+ { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
+ { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
+ { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
+ { u64 x58 = ((u64)(0x2 * x38) * x20);
+ { u64 x59 = (x48 + (x58 << 0x4));
+ { u64 x60 = (x59 + (x58 << 0x1));
+ { u64 x61 = (x60 + x58);
+ { u64 x62 = (x47 + (x57 << 0x4));
+ { u64 x63 = (x62 + (x57 << 0x1));
+ { u64 x64 = (x63 + x57);
+ { u64 x65 = (x46 + (x56 << 0x4));
+ { u64 x66 = (x65 + (x56 << 0x1));
+ { u64 x67 = (x66 + x56);
+ { u64 x68 = (x45 + (x55 << 0x4));
+ { u64 x69 = (x68 + (x55 << 0x1));
+ { u64 x70 = (x69 + x55);
+ { u64 x71 = (x44 + (x54 << 0x4));
+ { u64 x72 = (x71 + (x54 << 0x1));
+ { u64 x73 = (x72 + x54);
+ { u64 x74 = (x43 + (x53 << 0x4));
+ { u64 x75 = (x74 + (x53 << 0x1));
+ { u64 x76 = (x75 + x53);
+ { u64 x77 = (x42 + (x52 << 0x4));
+ { u64 x78 = (x77 + (x52 << 0x1));
+ { u64 x79 = (x78 + x52);
+ { u64 x80 = (x41 + (x51 << 0x4));
+ { u64 x81 = (x80 + (x51 << 0x1));
+ { u64 x82 = (x81 + x51);
+ { u64 x83 = (x40 + (x50 << 0x4));
+ { u64 x84 = (x83 + (x50 << 0x1));
+ { u64 x85 = (x84 + x50);
+ { u64 x86 = (x85 >> 0x1a);
+ { u32 x87 = ((u32)x85 & 0x3ffffff);
+ { u64 x88 = (x86 + x82);
+ { u64 x89 = (x88 >> 0x19);
+ { u32 x90 = ((u32)x88 & 0x1ffffff);
+ { u64 x91 = (x89 + x79);
+ { u64 x92 = (x91 >> 0x1a);
+ { u32 x93 = ((u32)x91 & 0x3ffffff);
+ { u64 x94 = (x92 + x76);
+ { u64 x95 = (x94 >> 0x19);
+ { u32 x96 = ((u32)x94 & 0x1ffffff);
+ { u64 x97 = (x95 + x73);
+ { u64 x98 = (x97 >> 0x1a);
+ { u32 x99 = ((u32)x97 & 0x3ffffff);
+ { u64 x100 = (x98 + x70);
+ { u64 x101 = (x100 >> 0x19);
+ { u32 x102 = ((u32)x100 & 0x1ffffff);
+ { u64 x103 = (x101 + x67);
+ { u64 x104 = (x103 >> 0x1a);
+ { u32 x105 = ((u32)x103 & 0x3ffffff);
+ { u64 x106 = (x104 + x64);
+ { u64 x107 = (x106 >> 0x19);
+ { u32 x108 = ((u32)x106 & 0x1ffffff);
+ { u64 x109 = (x107 + x61);
+ { u64 x110 = (x109 >> 0x1a);
+ { u32 x111 = ((u32)x109 & 0x3ffffff);
+ { u64 x112 = (x110 + x49);
+ { u64 x113 = (x112 >> 0x19);
+ { u32 x114 = ((u32)x112 & 0x1ffffff);
+ { u64 x115 = (x87 + (0x13 * x113));
+ { u32 x116 = (u32) (x115 >> 0x1a);
+ { u32 x117 = ((u32)x115 & 0x3ffffff);
+ { u32 x118 = (x116 + x90);
+ { u32 x119 = (x118 >> 0x19);
+ { u32 x120 = (x118 & 0x1ffffff);
+ out[0] = x117;
+ out[1] = x120;
+ out[2] = (x119 + x93);
+ out[3] = x96;
+ out[4] = x99;
+ out[5] = x102;
+ out[6] = x105;
+ out[7] = x108;
+ out[8] = x111;
+ out[9] = x114;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g)
+{
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g)
+{
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static __always_inline void
+fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g)
+{
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static void fe_sqr_impl(u32 out[10], const u32 in1[10])
+{
+ { const u32 x17 = in1[9];
+ { const u32 x18 = in1[8];
+ { const u32 x16 = in1[7];
+ { const u32 x14 = in1[6];
+ { const u32 x12 = in1[5];
+ { const u32 x10 = in1[4];
+ { const u32 x8 = in1[3];
+ { const u32 x6 = in1[2];
+ { const u32 x4 = in1[1];
+ { const u32 x2 = in1[0];
+ { u64 x19 = ((u64)x2 * x2);
+ { u64 x20 = ((u64)(0x2 * x2) * x4);
+ { u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6)));
+ { u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8)));
+ { u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10));
+ { u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12)));
+ { u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12)));
+ { u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16)));
+ { u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12))))));
+ { u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17)));
+ { u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17)))));
+ { u64 x30 = (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17)));
+ { u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17))))));
+ { u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17)));
+ { u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17)));
+ { u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17)));
+ { u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17));
+ { u64 x36 = ((u64)(0x2 * x18) * x17);
+ { u64 x37 = ((u64)(0x2 * x17) * x17);
+ { u64 x38 = (x27 + (x37 << 0x4));
+ { u64 x39 = (x38 + (x37 << 0x1));
+ { u64 x40 = (x39 + x37);
+ { u64 x41 = (x26 + (x36 << 0x4));
+ { u64 x42 = (x41 + (x36 << 0x1));
+ { u64 x43 = (x42 + x36);
+ { u64 x44 = (x25 + (x35 << 0x4));
+ { u64 x45 = (x44 + (x35 << 0x1));
+ { u64 x46 = (x45 + x35);
+ { u64 x47 = (x24 + (x34 << 0x4));
+ { u64 x48 = (x47 + (x34 << 0x1));
+ { u64 x49 = (x48 + x34);
+ { u64 x50 = (x23 + (x33 << 0x4));
+ { u64 x51 = (x50 + (x33 << 0x1));
+ { u64 x52 = (x51 + x33);
+ { u64 x53 = (x22 + (x32 << 0x4));
+ { u64 x54 = (x53 + (x32 << 0x1));
+ { u64 x55 = (x54 + x32);
+ { u64 x56 = (x21 + (x31 << 0x4));
+ { u64 x57 = (x56 + (x31 << 0x1));
+ { u64 x58 = (x57 + x31);
+ { u64 x59 = (x20 + (x30 << 0x4));
+ { u64 x60 = (x59 + (x30 << 0x1));
+ { u64 x61 = (x60 + x30);
+ { u64 x62 = (x19 + (x29 << 0x4));
+ { u64 x63 = (x62 + (x29 << 0x1));
+ { u64 x64 = (x63 + x29);
+ { u64 x65 = (x64 >> 0x1a);
+ { u32 x66 = ((u32)x64 & 0x3ffffff);
+ { u64 x67 = (x65 + x61);
+ { u64 x68 = (x67 >> 0x19);
+ { u32 x69 = ((u32)x67 & 0x1ffffff);
+ { u64 x70 = (x68 + x58);
+ { u64 x71 = (x70 >> 0x1a);
+ { u32 x72 = ((u32)x70 & 0x3ffffff);
+ { u64 x73 = (x71 + x55);
+ { u64 x74 = (x73 >> 0x19);
+ { u32 x75 = ((u32)x73 & 0x1ffffff);
+ { u64 x76 = (x74 + x52);
+ { u64 x77 = (x76 >> 0x1a);
+ { u32 x78 = ((u32)x76 & 0x3ffffff);
+ { u64 x79 = (x77 + x49);
+ { u64 x80 = (x79 >> 0x19);
+ { u32 x81 = ((u32)x79 & 0x1ffffff);
+ { u64 x82 = (x80 + x46);
+ { u64 x83 = (x82 >> 0x1a);
+ { u32 x84 = ((u32)x82 & 0x3ffffff);
+ { u64 x85 = (x83 + x43);
+ { u64 x86 = (x85 >> 0x19);
+ { u32 x87 = ((u32)x85 & 0x1ffffff);
+ { u64 x88 = (x86 + x40);
+ { u64 x89 = (x88 >> 0x1a);
+ { u32 x90 = ((u32)x88 & 0x3ffffff);
+ { u64 x91 = (x89 + x28);
+ { u64 x92 = (x91 >> 0x19);
+ { u32 x93 = ((u32)x91 & 0x1ffffff);
+ { u64 x94 = (x66 + (0x13 * x92));
+ { u32 x95 = (u32) (x94 >> 0x1a);
+ { u32 x96 = ((u32)x94 & 0x3ffffff);
+ { u32 x97 = (x95 + x69);
+ { u32 x98 = (x97 >> 0x19);
+ { u32 x99 = (x97 & 0x1ffffff);
+ out[0] = x96;
+ out[1] = x99;
+ out[2] = (x98 + x72);
+ out[3] = x75;
+ out[4] = x78;
+ out[5] = x81;
+ out[6] = x84;
+ out[7] = x87;
+ out[8] = x90;
+ out[9] = x93;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_sq_tl(fe *h, const fe_loose *f)
+{
+ fe_sqr_impl(h->v, f->v);
+}
+
+static __always_inline void fe_sq_tt(fe *h, const fe *f)
+{
+ fe_sqr_impl(h->v, f->v);
+}
+
+static __always_inline void fe_loose_invert(fe *out, const fe_loose *z)
+{
+ fe t0;
+ fe t1;
+ fe t2;
+ fe t3;
+ int i;
+
+ fe_sq_tl(&t0, z);
+ fe_sq_tt(&t1, &t0);
+ for (i = 1; i < 2; ++i)
+ fe_sq_tt(&t1, &t1);
+ fe_mul_tlt(&t1, z, &t1);
+ fe_mul_ttt(&t0, &t0, &t1);
+ fe_sq_tt(&t2, &t0);
+ fe_mul_ttt(&t1, &t1, &t2);
+ fe_sq_tt(&t2, &t1);
+ for (i = 1; i < 5; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t1, &t2, &t1);
+ fe_sq_tt(&t2, &t1);
+ for (i = 1; i < 10; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t2, &t2, &t1);
+ fe_sq_tt(&t3, &t2);
+ for (i = 1; i < 20; ++i)
+ fe_sq_tt(&t3, &t3);
+ fe_mul_ttt(&t2, &t3, &t2);
+ fe_sq_tt(&t2, &t2);
+ for (i = 1; i < 10; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t1, &t2, &t1);
+ fe_sq_tt(&t2, &t1);
+ for (i = 1; i < 50; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t2, &t2, &t1);
+ fe_sq_tt(&t3, &t2);
+ for (i = 1; i < 100; ++i)
+ fe_sq_tt(&t3, &t3);
+ fe_mul_ttt(&t2, &t3, &t2);
+ fe_sq_tt(&t2, &t2);
+ for (i = 1; i < 50; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t1, &t2, &t1);
+ fe_sq_tt(&t1, &t1);
+ for (i = 1; i < 5; ++i)
+ fe_sq_tt(&t1, &t1);
+ fe_mul_ttt(out, &t1, &t0);
+}
+
+static __always_inline void fe_invert(fe *out, const fe *z)
+{
+ fe_loose l;
+ fe_copy_lt(&l, z);
+ fe_loose_invert(out, &l);
+}
+
+/* Replace (f,g) with (g,f) if b == 1;
+ * replace (f,g) with (f,g) if b == 0.
+ *
+ * Preconditions: b in {0,1}
+ */
+static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b)
+{
+ unsigned i;
+ b = 0-b;
+ for (i = 0; i < 10; i++) {
+ u32 x = f->v[i] ^ g->v[i];
+ x &= b;
+ f->v[i] ^= x;
+ g->v[i] ^= x;
+ }
+}
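
To make the masking concrete (an illustrative standalone sketch, not part of the patch): with b == 1 the mask 0 - b is all ones, so the XOR difference is applied to both words and they swap; with b == 0 the mask is zero and both words are left untouched, and in neither case is there a data-dependent branch.

#include <stdint.h>
#include <stdio.h>

/* Same trick as fe_cswap above, shown on a single pair of 32-bit words. */
static void cswap32(uint32_t *f, uint32_t *g, unsigned int b)
{
	uint32_t mask = 0 - (uint32_t)b;	/* all ones if b == 1, zero if b == 0 */
	uint32_t x = (*f ^ *g) & mask;

	*f ^= x;
	*g ^= x;
}

int main(void)
{
	uint32_t f = 5, g = 9;

	cswap32(&f, &g, 1);		/* swaps: f == 9, g == 5 */
	cswap32(&f, &g, 0);		/* leaves both values alone */
	printf("%u %u\n", f, g);	/* prints "9 5" */
	return 0;
}
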
+
+/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0. */
+static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = 0;
+ { const u32 x39 = 0;
+ { const u32 x37 = 0;
+ { const u32 x35 = 0;
+ { const u32 x33 = 0;
+ { const u32 x31 = 0;
+ { const u32 x29 = 0;
+ { const u32 x27 = 0;
+ { const u32 x25 = 0;
+ { const u32 x23 = 121666;
+ { u64 x40 = ((u64)x23 * x5);
+ { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
+ { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
+ { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
+ { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
+ { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
+ { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
+ { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
+ { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
+ { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
+ { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
+ { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
+ { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
+ { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
+ { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
+ { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
+ { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
+ { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
+ { u64 x58 = ((u64)(0x2 * x38) * x20);
+ { u64 x59 = (x48 + (x58 << 0x4));
+ { u64 x60 = (x59 + (x58 << 0x1));
+ { u64 x61 = (x60 + x58);
+ { u64 x62 = (x47 + (x57 << 0x4));
+ { u64 x63 = (x62 + (x57 << 0x1));
+ { u64 x64 = (x63 + x57);
+ { u64 x65 = (x46 + (x56 << 0x4));
+ { u64 x66 = (x65 + (x56 << 0x1));
+ { u64 x67 = (x66 + x56);
+ { u64 x68 = (x45 + (x55 << 0x4));
+ { u64 x69 = (x68 + (x55 << 0x1));
+ { u64 x70 = (x69 + x55);
+ { u64 x71 = (x44 + (x54 << 0x4));
+ { u64 x72 = (x71 + (x54 << 0x1));
+ { u64 x73 = (x72 + x54);
+ { u64 x74 = (x43 + (x53 << 0x4));
+ { u64 x75 = (x74 + (x53 << 0x1));
+ { u64 x76 = (x75 + x53);
+ { u64 x77 = (x42 + (x52 << 0x4));
+ { u64 x78 = (x77 + (x52 << 0x1));
+ { u64 x79 = (x78 + x52);
+ { u64 x80 = (x41 + (x51 << 0x4));
+ { u64 x81 = (x80 + (x51 << 0x1));
+ { u64 x82 = (x81 + x51);
+ { u64 x83 = (x40 + (x50 << 0x4));
+ { u64 x84 = (x83 + (x50 << 0x1));
+ { u64 x85 = (x84 + x50);
+ { u64 x86 = (x85 >> 0x1a);
+ { u32 x87 = ((u32)x85 & 0x3ffffff);
+ { u64 x88 = (x86 + x82);
+ { u64 x89 = (x88 >> 0x19);
+ { u32 x90 = ((u32)x88 & 0x1ffffff);
+ { u64 x91 = (x89 + x79);
+ { u64 x92 = (x91 >> 0x1a);
+ { u32 x93 = ((u32)x91 & 0x3ffffff);
+ { u64 x94 = (x92 + x76);
+ { u64 x95 = (x94 >> 0x19);
+ { u32 x96 = ((u32)x94 & 0x1ffffff);
+ { u64 x97 = (x95 + x73);
+ { u64 x98 = (x97 >> 0x1a);
+ { u32 x99 = ((u32)x97 & 0x3ffffff);
+ { u64 x100 = (x98 + x70);
+ { u64 x101 = (x100 >> 0x19);
+ { u32 x102 = ((u32)x100 & 0x1ffffff);
+ { u64 x103 = (x101 + x67);
+ { u64 x104 = (x103 >> 0x1a);
+ { u32 x105 = ((u32)x103 & 0x3ffffff);
+ { u64 x106 = (x104 + x64);
+ { u64 x107 = (x106 >> 0x19);
+ { u32 x108 = ((u32)x106 & 0x1ffffff);
+ { u64 x109 = (x107 + x61);
+ { u64 x110 = (x109 >> 0x1a);
+ { u32 x111 = ((u32)x109 & 0x3ffffff);
+ { u64 x112 = (x110 + x49);
+ { u64 x113 = (x112 >> 0x19);
+ { u32 x114 = ((u32)x112 & 0x1ffffff);
+ { u64 x115 = (x87 + (0x13 * x113));
+ { u32 x116 = (u32) (x115 >> 0x1a);
+ { u32 x117 = ((u32)x115 & 0x3ffffff);
+ { u32 x118 = (x116 + x90);
+ { u32 x119 = (x118 >> 0x19);
+ { u32 x120 = (x118 & 0x1ffffff);
+ out[0] = x117;
+ out[1] = x120;
+ out[2] = (x119 + x93);
+ out[3] = x96;
+ out[4] = x99;
+ out[5] = x102;
+ out[6] = x105;
+ out[7] = x108;
+ out[8] = x111;
+ out[9] = x114;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_mul121666(fe *h, const fe_loose *f)
+{
+ fe_mul_121666_impl(h->v, f->v);
+}
+
+static void curve25519_generic(u8 out[CURVE25519_POINT_SIZE],
+ const u8 scalar[CURVE25519_POINT_SIZE],
+ const u8 point[CURVE25519_POINT_SIZE])
+{
+ fe x1, x2, z2, x3, z3, tmp0, tmp1;
+ fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
+ unsigned swap = 0;
+ int pos;
+ u8 e[32];
+
+ memcpy(e, scalar, 32);
+ normalize_secret(e);
+
+ /* The following implementation was transcribed to Coq and proven to
+ * correspond to unary scalar multiplication in affine coordinates given
+ * that x1 != 0 is the x coordinate of some point on the curve. It was
+ * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
+ * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
+ * quantified over the underlying field, so it applies to Curve25519
+ * itself and the quadratic twist of Curve25519. It was not proven in
+ * Coq that prime-field arithmetic correctly simulates extension-field
+ * arithmetic on prime-field values. The decoding of the byte array
+ * representation of e was not considered.
+ *
+ * Specification of Montgomery curves in affine coordinates:
+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
+ *
+ * Proof that these form a group that is isomorphic to a Weierstrass
+ * curve:
+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
+ *
+ * Coq transcription and correctness proof of the loop
+ * (where scalarbits=255):
+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
+ * preconditions: 0 <= e < 2^255 (not necessarily e < order),
+ * fe_invert(0) = 0
+ */
+ fe_frombytes(&x1, point);
+ fe_1(&x2);
+ fe_0(&z2);
+ fe_copy(&x3, &x1);
+ fe_1(&z3);
+
+ for (pos = 254; pos >= 0; --pos) {
+ /* loop invariant as of right before the test, for the case
+ * where x1 != 0:
+ * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3
+ * is nonzero
+ * let r := e >> (pos+1) in the following equalities of
+ * projective points:
+ * to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
+ * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
+ * x1 is the nonzero x coordinate of the nonzero
+ * point (r*P-(r+1)*P)
+ */
+ unsigned b = 1 & (e[pos / 8] >> (pos & 7));
+ swap ^= b;
+ fe_cswap(&x2, &x3, swap);
+ fe_cswap(&z2, &z3, swap);
+ swap = b;
+ /* Coq transcription of ladderstep formula (called from
+ * transcribed loop):
+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
+ * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
+ * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
+ * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
+ */
+ fe_sub(&tmp0l, &x3, &z3);
+ fe_sub(&tmp1l, &x2, &z2);
+ fe_add(&x2l, &x2, &z2);
+ fe_add(&z2l, &x3, &z3);
+ fe_mul_tll(&z3, &tmp0l, &x2l);
+ fe_mul_tll(&z2, &z2l, &tmp1l);
+ fe_sq_tl(&tmp0, &tmp1l);
+ fe_sq_tl(&tmp1, &x2l);
+ fe_add(&x3l, &z3, &z2);
+ fe_sub(&z2l, &z3, &z2);
+ fe_mul_ttt(&x2, &tmp1, &tmp0);
+ fe_sub(&tmp1l, &tmp1, &tmp0);
+ fe_sq_tl(&z2, &z2l);
+ fe_mul121666(&z3, &tmp1l);
+ fe_sq_tl(&x3, &x3l);
+ fe_add(&tmp0l, &tmp0, &z3);
+ fe_mul_ttt(&z3, &x1, &z2);
+ fe_mul_tll(&z2, &tmp1l, &tmp0l);
+ }
+ /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
+ * else (x2, z2)
+ */
+ fe_cswap(&x2, &x3, swap);
+ fe_cswap(&z2, &z3, swap);
+
+ fe_invert(&z2, &z2);
+ fe_mul_ttt(&x2, &x2, &z2);
+ fe_tobytes(out, &x2);
+
+ memzero_explicit(&x1, sizeof(x1));
+ memzero_explicit(&x2, sizeof(x2));
+ memzero_explicit(&z2, sizeof(z2));
+ memzero_explicit(&x3, sizeof(x3));
+ memzero_explicit(&z3, sizeof(z3));
+ memzero_explicit(&tmp0, sizeof(tmp0));
+ memzero_explicit(&tmp1, sizeof(tmp1));
+ memzero_explicit(&x2l, sizeof(x2l));
+ memzero_explicit(&z2l, sizeof(z2l));
+ memzero_explicit(&x3l, sizeof(x3l));
+ memzero_explicit(&tmp0l, sizeof(tmp0l));
+ memzero_explicit(&tmp1l, sizeof(tmp1l));
+ memzero_explicit(&e, sizeof(e));
+}
diff --git a/src/crypto/zinc/curve25519/curve25519-hacl64.h b/src/crypto/zinc/curve25519/curve25519-hacl64.h
new file mode 100644
index 0000000..ebe88fc
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519-hacl64.h
@@ -0,0 +1,785 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is a machine-generated formally verified implementation of Curve25519
+ * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
+ * generated, it has been tweaked to be suitable for use in the kernel. It is
+ * optimized for 64-bit machines that can efficiently work with 128-bit
+ * integer types.
+ */
+
+typedef __uint128_t u128;
+
+static __always_inline u64 u64_eq_mask(u64 a, u64 b)
+{
+ u64 x = a ^ b;
+ u64 minus_x = ~x + (u64)1U;
+ u64 x_or_minus_x = x | minus_x;
+ u64 xnx = x_or_minus_x >> (u32)63U;
+ u64 c = xnx - (u64)1U;
+ return c;
+}
+
+static __always_inline u64 u64_gte_mask(u64 a, u64 b)
+{
+ u64 x = a;
+ u64 y = b;
+ u64 x_xor_y = x ^ y;
+ u64 x_sub_y = x - y;
+ u64 x_sub_y_xor_y = x_sub_y ^ y;
+ u64 q = x_xor_y | x_sub_y_xor_y;
+ u64 x_xor_q = x ^ q;
+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
+ u64 c = x_xor_q_ - (u64)1U;
+ return c;
+}
+
+static __always_inline void modulo_carry_top(u64 *b)
+{
+ u64 b4 = b[4];
+ u64 b0 = b[0];
+ u64 b4_ = b4 & 0x7ffffffffffffLLU;
+ u64 b0_ = b0 + 19 * (b4 >> 51);
+ b[4] = b4_;
+ b[0] = b0_;
+}
+
+static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
+{
+ {
+ u128 xi = input[0];
+ output[0] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[1];
+ output[1] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[2];
+ output[2] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[3];
+ output[3] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[4];
+ output[4] = ((u64)(xi));
+ }
+}
+
+static __always_inline void
+fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
+{
+ output[0] += (u128)input[0] * s;
+ output[1] += (u128)input[1] * s;
+ output[2] += (u128)input[2] * s;
+ output[3] += (u128)input[3] * s;
+ output[4] += (u128)input[4] * s;
+}
+
+static __always_inline void fproduct_carry_wide_(u128 *tmp)
+{
+ {
+ u32 ctr = 0;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+ {
+ u32 ctr = 1;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+
+ {
+ u32 ctr = 2;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+ {
+ u32 ctr = 3;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+}
+
+static __always_inline void fmul_shift_reduce(u64 *output)
+{
+ u64 tmp = output[4];
+ u64 b0;
+ {
+ u32 ctr = 5 - 0 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ {
+ u32 ctr = 5 - 1 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ {
+ u32 ctr = 5 - 2 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ {
+ u32 ctr = 5 - 3 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ output[0] = tmp;
+ b0 = output[0];
+ output[0] = 19 * b0;
+}
+
+static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
+ u64 *input21)
+{
+ u32 i;
+ u64 input2i;
+ {
+ u64 input2i = input21[0];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ {
+ u64 input2i = input21[1];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ {
+ u64 input2i = input21[2];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ {
+ u64 input2i = input21[3];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ i = 4;
+ input2i = input21[i];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+}
+
+static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
+{
+ u64 tmp[5];
+ memcpy(tmp, input, 5 * sizeof(*input));
+ {
+ u128 b4;
+ u128 b0;
+ u128 b4_;
+ u128 b0_;
+ u64 i0;
+ u64 i1;
+ u64 i0_;
+ u64 i1_;
+ u128 t[5] = { 0 };
+ fmul_mul_shift_reduce_(t, tmp, input21);
+ fproduct_carry_wide_(t);
+ b4 = t[4];
+ b0 = t[0];
+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+ t[4] = b4_;
+ t[0] = b0_;
+ fproduct_copy_from_wide_(output, t);
+ i0 = output[0];
+ i1 = output[1];
+ i0_ = i0 & 0x7ffffffffffffLLU;
+ i1_ = i1 + (i0 >> 51);
+ output[0] = i0_;
+ output[1] = i1_;
+ }
+}
+
+static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
+{
+ u64 r0 = output[0];
+ u64 r1 = output[1];
+ u64 r2 = output[2];
+ u64 r3 = output[3];
+ u64 r4 = output[4];
+ u64 d0 = r0 * 2;
+ u64 d1 = r1 * 2;
+ u64 d2 = r2 * 2 * 19;
+ u64 d419 = r4 * 19;
+ u64 d4 = d419 * 2;
+ u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
+ (((u128)(d2) * (r3))));
+ u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
+ (((u128)(r3 * 19) * (r3))));
+ u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
+ (((u128)(d4) * (r3))));
+ u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
+ (((u128)(r4) * (d419))));
+ u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
+ (((u128)(r2) * (r2))));
+ tmp[0] = s0;
+ tmp[1] = s1;
+ tmp[2] = s2;
+ tmp[3] = s3;
+ tmp[4] = s4;
+}
+
+static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
+{
+ u128 b4;
+ u128 b0;
+ u128 b4_;
+ u128 b0_;
+ u64 i0;
+ u64 i1;
+ u64 i0_;
+ u64 i1_;
+ fsquare_fsquare__(tmp, output);
+ fproduct_carry_wide_(tmp);
+ b4 = tmp[4];
+ b0 = tmp[0];
+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+ tmp[4] = b4_;
+ tmp[0] = b0_;
+ fproduct_copy_from_wide_(output, tmp);
+ i0 = output[0];
+ i1 = output[1];
+ i0_ = i0 & 0x7ffffffffffffLLU;
+ i1_ = i1 + (i0 >> 51);
+ output[0] = i0_;
+ output[1] = i1_;
+}
+
+static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
+ u32 count1)
+{
+ u32 i;
+ fsquare_fsquare_(tmp, output);
+ for (i = 1; i < count1; ++i)
+ fsquare_fsquare_(tmp, output);
+}
+
+static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
+ u32 count1)
+{
+ u128 t[5];
+ memcpy(output, input, 5 * sizeof(*input));
+ fsquare_fsquare_times_(output, t, count1);
+}
+
+static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
+ u32 count1)
+{
+ u128 t[5];
+ fsquare_fsquare_times_(output, t, count1);
+}
+
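+/* Field inversion via Fermat's little theorem: computes z^(p - 2) with
+ * p - 2 = 2^255 - 21, using a fixed sequence of squarings and
+ * multiplications so the operation count does not depend on the input.
+ */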
+static __always_inline void crecip_crecip(u64 *out, u64 *z)
+{
+ u64 buf[20] = { 0 };
+ u64 *a0 = buf;
+ u64 *t00 = buf + 5;
+ u64 *b0 = buf + 10;
+ u64 *t01;
+ u64 *b1;
+ u64 *c0;
+ u64 *a;
+ u64 *t0;
+ u64 *b;
+ u64 *c;
+ fsquare_fsquare_times(a0, z, 1);
+ fsquare_fsquare_times(t00, a0, 2);
+ fmul_fmul(b0, t00, z);
+ fmul_fmul(a0, b0, a0);
+ fsquare_fsquare_times(t00, a0, 1);
+ fmul_fmul(b0, t00, b0);
+ fsquare_fsquare_times(t00, b0, 5);
+ t01 = buf + 5;
+ b1 = buf + 10;
+ c0 = buf + 15;
+ fmul_fmul(b1, t01, b1);
+ fsquare_fsquare_times(t01, b1, 10);
+ fmul_fmul(c0, t01, b1);
+ fsquare_fsquare_times(t01, c0, 20);
+ fmul_fmul(t01, t01, c0);
+ fsquare_fsquare_times_inplace(t01, 10);
+ fmul_fmul(b1, t01, b1);
+ fsquare_fsquare_times(t01, b1, 50);
+ a = buf;
+ t0 = buf + 5;
+ b = buf + 10;
+ c = buf + 15;
+ fmul_fmul(c, t0, b);
+ fsquare_fsquare_times(t0, c, 100);
+ fmul_fmul(t0, t0, c);
+ fsquare_fsquare_times_inplace(t0, 50);
+ fmul_fmul(t0, t0, b);
+ fsquare_fsquare_times_inplace(t0, 5);
+ fmul_fmul(out, t0, a);
+}
+
+static __always_inline void fsum(u64 *a, u64 *b)
+{
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ a[4] += b[4];
+}
+
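+/* Computes a := b - a. The constants are 8*p spread over the limbs
+ * (8*(2^51 - 19) for limb 0, 8*(2^51 - 1) for the rest); adding them to b
+ * first keeps every per-limb difference non-negative without changing the
+ * value mod p.
+ */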
+static __always_inline void fdifference(u64 *a, u64 *b)
+{
+ u64 tmp[5] = { 0 };
+ u64 b0;
+ u64 b1;
+ u64 b2;
+ u64 b3;
+ u64 b4;
+ memcpy(tmp, b, 5 * sizeof(*b));
+ b0 = tmp[0];
+ b1 = tmp[1];
+ b2 = tmp[2];
+ b3 = tmp[3];
+ b4 = tmp[4];
+ tmp[0] = b0 + 0x3fffffffffff68LLU;
+ tmp[1] = b1 + 0x3ffffffffffff8LLU;
+ tmp[2] = b2 + 0x3ffffffffffff8LLU;
+ tmp[3] = b3 + 0x3ffffffffffff8LLU;
+ tmp[4] = b4 + 0x3ffffffffffff8LLU;
+ {
+ u64 xi = a[0];
+ u64 yi = tmp[0];
+ a[0] = yi - xi;
+ }
+ {
+ u64 xi = a[1];
+ u64 yi = tmp[1];
+ a[1] = yi - xi;
+ }
+ {
+ u64 xi = a[2];
+ u64 yi = tmp[2];
+ a[2] = yi - xi;
+ }
+ {
+ u64 xi = a[3];
+ u64 yi = tmp[3];
+ a[3] = yi - xi;
+ }
+ {
+ u64 xi = a[4];
+ u64 yi = tmp[4];
+ a[4] = yi - xi;
+ }
+}
+
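+/* Multiply a field element by a small 64-bit scalar: widen each limb,
+ * multiply, run one carry pass, then fold the limb-4 overflow back into
+ * limb 0 with a factor of 19.
+ */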
+static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
+{
+ u128 tmp[5];
+ u128 b4;
+ u128 b0;
+ u128 b4_;
+ u128 b0_;
+ {
+ u64 xi = b[0];
+ tmp[0] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[1];
+ tmp[1] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[2];
+ tmp[2] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[3];
+ tmp[3] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[4];
+ tmp[4] = ((u128)(xi) * (s));
+ }
+ fproduct_carry_wide_(tmp);
+ b4 = tmp[4];
+ b0 = tmp[0];
+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+ tmp[4] = b4_;
+ tmp[0] = b0_;
+ fproduct_copy_from_wide_(output, tmp);
+}
+
+static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
+{
+ fmul_fmul(output, a, b);
+}
+
+static __always_inline void crecip(u64 *output, u64 *input)
+{
+ crecip_crecip(output, input);
+}
+
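+/* Constant-time conditional swap: swap1 is an all-zeros or all-ones mask,
+ * so the XOR trick exchanges a[i] and b[i] (or leaves them untouched)
+ * without branching on secret data.
+ */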
+static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
+ u64 swap1, u32 ctr)
+{
+ u32 i = ctr - 1;
+ u64 ai = a[i];
+ u64 bi = b[i];
+ u64 x = swap1 & (ai ^ bi);
+ u64 ai1 = ai ^ x;
+ u64 bi1 = bi ^ x;
+ a[i] = ai1;
+ b[i] = bi1;
+}
+
+static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
+{
+ point_swap_conditional_step(a, b, swap1, 5);
+ point_swap_conditional_step(a, b, swap1, 4);
+ point_swap_conditional_step(a, b, swap1, 3);
+ point_swap_conditional_step(a, b, swap1, 2);
+ point_swap_conditional_step(a, b, swap1, 1);
+}
+
+static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
+{
+ u64 swap1 = 0 - iswap;
+ point_swap_conditional5(a, b, swap1);
+ point_swap_conditional5(a + 5, b + 5, swap1);
+}
+
+static __always_inline void point_copy(u64 *output, u64 *input)
+{
+ memcpy(output, input, 5 * sizeof(*input));
+ memcpy(output + 5, input + 5, 5 * sizeof(*input));
+}
+
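+/* Combined doubling and differential addition for the Montgomery ladder in
+ * projective x-only (X:Z) coordinates: pp receives the double of p, ppq
+ * receives p + pq, with qmqp holding the x-coordinate of their difference.
+ * The constant 121665 is (A - 2)/4 for the curve25519 parameter A = 486662.
+ */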
+static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
+ u64 *pq, u64 *qmqp)
+{
+ u64 *qx = qmqp;
+ u64 *x2 = pp;
+ u64 *z2 = pp + 5;
+ u64 *x3 = ppq;
+ u64 *z3 = ppq + 5;
+ u64 *x = p;
+ u64 *z = p + 5;
+ u64 *xprime = pq;
+ u64 *zprime = pq + 5;
+ u64 buf[40] = { 0 };
+ u64 *origx = buf;
+ u64 *origxprime0 = buf + 5;
+ u64 *xxprime0;
+ u64 *zzprime0;
+ u64 *origxprime;
+ xxprime0 = buf + 25;
+ zzprime0 = buf + 30;
+ memcpy(origx, x, 5 * sizeof(*x));
+ fsum(x, z);
+ fdifference(z, origx);
+ memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
+ fsum(xprime, zprime);
+ fdifference(zprime, origxprime0);
+ fmul(xxprime0, xprime, z);
+ fmul(zzprime0, x, zprime);
+ origxprime = buf + 5;
+ {
+ u64 *xx0;
+ u64 *zz0;
+ u64 *xxprime;
+ u64 *zzprime;
+ u64 *zzzprime;
+ xx0 = buf + 15;
+ zz0 = buf + 20;
+ xxprime = buf + 25;
+ zzprime = buf + 30;
+ zzzprime = buf + 35;
+ memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
+ fsum(xxprime, zzprime);
+ fdifference(zzprime, origxprime);
+ fsquare_fsquare_times(x3, xxprime, 1);
+ fsquare_fsquare_times(zzzprime, zzprime, 1);
+ fmul(z3, zzzprime, qx);
+ fsquare_fsquare_times(xx0, x, 1);
+ fsquare_fsquare_times(zz0, z, 1);
+ {
+ u64 *zzz;
+ u64 *xx;
+ u64 *zz;
+ u64 scalar;
+ zzz = buf + 10;
+ xx = buf + 15;
+ zz = buf + 20;
+ fmul(x2, xx, zz);
+ fdifference(zz, xx);
+ scalar = 121665;
+ fscalar(zzz, zz, scalar);
+ fsum(zzz, xx);
+ fmul(z2, zzz, zz);
+ }
+ }
+}
+
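+/* One ladder step driven by the top bit of byt: conditionally swap
+ * (nq, nqpq) on that bit, run the add-and-double step, then swap the
+ * results back on the same bit; callers shift byt left to expose the next
+ * scalar bit.
+ */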
+static __always_inline void
+ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
+ u64 *q, u8 byt)
+{
+ u64 bit0 = (u64)(byt >> 7);
+ u64 bit;
+ point_swap_conditional(nq, nqpq, bit0);
+ addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
+ bit = (u64)(byt >> 7);
+ point_swap_conditional(nq2, nqpq2, bit);
+}
+
+static __always_inline void
+ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
+ u64 *nqpq2, u64 *q, u8 byt)
+{
+ u8 byt1;
+ ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
+ byt1 = byt << 1;
+ ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
+}
+
+static __always_inline void
+ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
+ u64 *q, u8 byt, u32 i)
+{
+ while (i--) {
+ ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
+ nqpq2, q, byt);
+ byt <<= 2;
+ }
+}
+
+static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
+ u64 *nqpq, u64 *nq2,
+ u64 *nqpq2, u64 *q,
+ u32 i)
+{
+ while (i--) {
+ u8 byte = n1[i];
+ ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
+ byte, 4);
+ }
+}
+
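+/* Full Montgomery ladder: start from nq = (1, 0) and nqpq = q, consume the
+ * 32 scalar bytes most significant byte first (two bits per double-step,
+ * four double-steps per byte), and leave the resulting (X:Z) point in nq.
+ */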
+static __always_inline void ladder_cmult(u64 *result, u8 *n1, u64 *q)
+{
+ u64 point_buf[40] = { 0 };
+ u64 *nq = point_buf;
+ u64 *nqpq = point_buf + 10;
+ u64 *nq2 = point_buf + 20;
+ u64 *nqpq2 = point_buf + 30;
+ point_copy(nqpq, q);
+ nq[0] = 1;
+ ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
+ point_copy(result, nq);
+}
+
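+/* Unpack a 32-byte little-endian value into five 51-bit limbs: the
+ * overlapping unaligned loads at byte offsets 0, 6, 12, 19 and 24, combined
+ * with the shifts, select bits 0-50, 51-101, 102-152, 153-203 and 204-254.
+ */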
+static __always_inline void format_fexpand(u64 *output, const u8 *input)
+{
+ const u8 *x00 = input + 6;
+ const u8 *x01 = input + 12;
+ const u8 *x02 = input + 19;
+ const u8 *x0 = input + 24;
+ u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
+ i0 = get_unaligned_le64(input);
+ i1 = get_unaligned_le64(x00);
+ i2 = get_unaligned_le64(x01);
+ i3 = get_unaligned_le64(x02);
+ i4 = get_unaligned_le64(x0);
+ output0 = i0 & 0x7ffffffffffffLLU;
+ output1 = i1 >> 3 & 0x7ffffffffffffLLU;
+ output2 = i2 >> 6 & 0x7ffffffffffffLLU;
+ output3 = i3 >> 1 & 0x7ffffffffffffLLU;
+ output4 = i4 >> 12 & 0x7ffffffffffffLLU;
+ output[0] = output0;
+ output[1] = output1;
+ output[2] = output2;
+ output[3] = output3;
+ output[4] = output4;
+}
+
+static __always_inline void format_fcontract_first_carry_pass(u64 *input)
+{
+ u64 t0 = input[0];
+ u64 t1 = input[1];
+ u64 t2 = input[2];
+ u64 t3 = input[3];
+ u64 t4 = input[4];
+ u64 t1_ = t1 + (t0 >> 51);
+ u64 t0_ = t0 & 0x7ffffffffffffLLU;
+ u64 t2_ = t2 + (t1_ >> 51);
+ u64 t1__ = t1_ & 0x7ffffffffffffLLU;
+ u64 t3_ = t3 + (t2_ >> 51);
+ u64 t2__ = t2_ & 0x7ffffffffffffLLU;
+ u64 t4_ = t4 + (t3_ >> 51);
+ u64 t3__ = t3_ & 0x7ffffffffffffLLU;
+ input[0] = t0_;
+ input[1] = t1__;
+ input[2] = t2__;
+ input[3] = t3__;
+ input[4] = t4_;
+}
+
+static __always_inline void format_fcontract_first_carry_full(u64 *input)
+{
+ format_fcontract_first_carry_pass(input);
+ modulo_carry_top(input);
+}
+
+static __always_inline void format_fcontract_second_carry_pass(u64 *input)
+{
+ u64 t0 = input[0];
+ u64 t1 = input[1];
+ u64 t2 = input[2];
+ u64 t3 = input[3];
+ u64 t4 = input[4];
+ u64 t1_ = t1 + (t0 >> 51);
+ u64 t0_ = t0 & 0x7ffffffffffffLLU;
+ u64 t2_ = t2 + (t1_ >> 51);
+ u64 t1__ = t1_ & 0x7ffffffffffffLLU;
+ u64 t3_ = t3 + (t2_ >> 51);
+ u64 t2__ = t2_ & 0x7ffffffffffffLLU;
+ u64 t4_ = t4 + (t3_ >> 51);
+ u64 t3__ = t3_ & 0x7ffffffffffffLLU;
+ input[0] = t0_;
+ input[1] = t1__;
+ input[2] = t2__;
+ input[3] = t3__;
+ input[4] = t4_;
+}
+
+static __always_inline void format_fcontract_second_carry_full(u64 *input)
+{
+ u64 i0;
+ u64 i1;
+ u64 i0_;
+ u64 i1_;
+ format_fcontract_second_carry_pass(input);
+ modulo_carry_top(input);
+ i0 = input[0];
+ i1 = input[1];
+ i0_ = i0 & 0x7ffffffffffffLLU;
+ i1_ = i1 + (i0 >> 51);
+ input[0] = i0_;
+ input[1] = i1_;
+}
+
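+/* Final reduction: if the value is >= p = 2^255 - 19 (limb 0 >= 2^51 - 19
+ * and limbs 1-4 all equal to 2^51 - 1), subtract p. The comparison and the
+ * subtraction are done with masks, so no secret-dependent branch is taken.
+ */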
+static __always_inline void format_fcontract_trim(u64 *input)
+{
+ u64 a0 = input[0];
+ u64 a1 = input[1];
+ u64 a2 = input[2];
+ u64 a3 = input[3];
+ u64 a4 = input[4];
+ u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
+ u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
+ u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
+ u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
+ u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
+ u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
+ u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
+ u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
+ u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
+ u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
+ u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
+ input[0] = a0_;
+ input[1] = a1_;
+ input[2] = a2_;
+ input[3] = a3_;
+ input[4] = a4_;
+}
+
+static __always_inline void format_fcontract_store(u8 *output, u64 *input)
+{
+ u64 t0 = input[0];
+ u64 t1 = input[1];
+ u64 t2 = input[2];
+ u64 t3 = input[3];
+ u64 t4 = input[4];
+ u64 o0 = t1 << 51 | t0;
+ u64 o1 = t2 << 38 | t1 >> 13;
+ u64 o2 = t3 << 25 | t2 >> 26;
+ u64 o3 = t4 << 12 | t3 >> 39;
+ u8 *b0 = output;
+ u8 *b1 = output + 8;
+ u8 *b2 = output + 16;
+ u8 *b3 = output + 24;
+ put_unaligned_le64(o0, b0);
+ put_unaligned_le64(o1, b1);
+ put_unaligned_le64(o2, b2);
+ put_unaligned_le64(o3, b3);
+}
+
+static __always_inline void format_fcontract(u8 *output, u64 *input)
+{
+ format_fcontract_first_carry_full(input);
+ format_fcontract_second_carry_full(input);
+ format_fcontract_trim(input);
+ format_fcontract_store(output, input);
+}
+
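+/* Convert the projective (X:Z) result to the affine x-coordinate
+ * x = X * Z^-1 and serialize it into 32 little-endian bytes.
+ */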
+static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
+{
+ u64 *x = point;
+ u64 *z = point + 5;
+ u64 buf[10] __aligned(32) = { 0 };
+ u64 *zmone = buf;
+ u64 *sc = buf + 5;
+ crecip(zmone, z);
+ fmul(sc, x, zmone);
+ format_fcontract(scalar, sc);
+}
+
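+/* Portable scalar multiplication: expand the base point into 51-bit limbs,
+ * clamp the secret scalar (normalize_secret), run the Montgomery ladder,
+ * contract the result back to bytes, and wipe the intermediate buffers with
+ * memzero_explicit.
+ */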
+static void curve25519_generic(u8 mypublic[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE],
+ const u8 basepoint[CURVE25519_POINT_SIZE])
+{
+ u64 buf0[10] __aligned(32) = { 0 };
+ u64 *x0 = buf0;
+ u64 *z = buf0 + 5;
+ u64 *q;
+ format_fexpand(x0, basepoint);
+ z[0] = 1;
+ q = buf0;
+ {
+ u8 e[32] __aligned(32) = { 0 };
+ u8 *scalar;
+ memcpy(e, secret, 32);
+ normalize_secret(e);
+ scalar = e;
+ {
+ u64 buf[15] = { 0 };
+ u64 *nq = buf;
+ u64 *x = nq;
+ x[0] = 1;
+ ladder_cmult(nq, scalar, q);
+ format_scalar_of_point(mypublic, nq);
+ memzero_explicit(buf, sizeof(buf));
+ }
+ memzero_explicit(e, sizeof(e));
+ }
+ memzero_explicit(buf0, sizeof(buf0));
+}
diff --git a/src/crypto/zinc/curve25519/curve25519-x86_64-glue.h b/src/crypto/zinc/curve25519/curve25519-x86_64-glue.h
new file mode 100644
index 0000000..4499cd9
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519-x86_64-glue.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/curve25519.h>
+#include <crypto/algapi.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#include "curve25519-x86_64.h"
+
+static bool curve25519_use_bmi2 __ro_after_init;
+static bool curve25519_use_adx __ro_after_init;
+
+void __init curve25519_fpu_init(void)
+{
+ curve25519_use_bmi2 = boot_cpu_has(X86_FEATURE_BMI2);
+ curve25519_use_adx = boot_cpu_has(X86_FEATURE_BMI2) &&
+ boot_cpu_has(X86_FEATURE_ADX);
+}
+
+static inline bool curve25519_arch(u8 mypublic[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE],
+ const u8 basepoint[CURVE25519_POINT_SIZE])
+{
+ if (curve25519_use_adx) {
+ curve25519_adx(mypublic, secret, basepoint);
+ return true;
+ } else if (curve25519_use_bmi2) {
+ curve25519_bmi2(mypublic, secret, basepoint);
+ return true;
+ }
+ return false;
+}
+
+static inline bool curve25519_base_arch(u8 pub[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE])
+{
+ if (curve25519_use_adx) {
+ curve25519_adx_base(pub, secret);
+ return true;
+ } else if (curve25519_use_bmi2) {
+ curve25519_bmi2_base(pub, secret);
+ return true;
+ }
+ return false;
+}
+
+#define HAVE_CURVE25519_ARCH_IMPLEMENTATION
diff --git a/src/crypto/zinc/curve25519/curve25519-x86_64.h b/src/crypto/zinc/curve25519/curve25519-x86_64.h
new file mode 100644
index 0000000..15c2e43
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519-x86_64.h
@@ -0,0 +1,2333 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+enum { NUM_WORDS_ELTFP25519 = 4 };
+typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
+typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
+
+#define mul_eltfp25519_1w_adx(c, a, b) do { \
+ mul_256x256_integer_adx(m.buffer, a, b); \
+ red_eltfp25519_1w_adx(c, m.buffer); \
+} while (0)
+
+#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
+ mul_256x256_integer_bmi2(m.buffer, a, b); \
+ red_eltfp25519_1w_bmi2(c, m.buffer); \
+} while (0)
+
+#define sqr_eltfp25519_1w_adx(a) do { \
+ sqr_256x256_integer_adx(m.buffer, a); \
+ red_eltfp25519_1w_adx(a, m.buffer); \
+} while (0)
+
+#define sqr_eltfp25519_1w_bmi2(a) do { \
+ sqr_256x256_integer_bmi2(m.buffer, a); \
+ red_eltfp25519_1w_bmi2(a, m.buffer); \
+} while (0)
+
+#define mul_eltfp25519_2w_adx(c, a, b) do { \
+ mul2_256x256_integer_adx(m.buffer, a, b); \
+ red_eltfp25519_2w_adx(c, m.buffer); \
+} while (0)
+
+#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
+ mul2_256x256_integer_bmi2(m.buffer, a, b); \
+ red_eltfp25519_2w_bmi2(c, m.buffer); \
+} while (0)
+
+#define sqr_eltfp25519_2w_adx(a) do { \
+ sqr2_256x256_integer_adx(m.buffer, a); \
+ red_eltfp25519_2w_adx(a, m.buffer); \
+} while (0)
+
+#define sqr_eltfp25519_2w_bmi2(a) do { \
+ sqr2_256x256_integer_bmi2(m.buffer, a); \
+ red_eltfp25519_2w_bmi2(a, m.buffer); \
+} while (0)
+
+#define sqrn_eltfp25519_1w_adx(a, times) do { \
+ int ____counter = (times); \
+ while (____counter-- > 0) \
+ sqr_eltfp25519_1w_adx(a); \
+} while (0)
+
+#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
+ int ____counter = (times); \
+ while (____counter-- > 0) \
+ sqr_eltfp25519_1w_bmi2(a); \
+} while (0)
+
+#define copy_eltfp25519_1w(C, A) do { \
+ (C)[0] = (A)[0]; \
+ (C)[1] = (A)[1]; \
+ (C)[2] = (A)[2]; \
+ (C)[3] = (A)[3]; \
+} while (0)
+
+#define setzero_eltfp25519_1w(C) do { \
+ (C)[0] = 0; \
+ (C)[1] = 0; \
+ (C)[2] = 0; \
+ (C)[3] = 0; \
+} while (0)
+
+__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
+ /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
+ 0xffffffffffffffffUL, 0x5fffffffffffffffUL,
+ /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
+ 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
+ /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
+ 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
+ /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
+ 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
+ /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
+ 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
+ /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
+ 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
+ /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
+ 0xc1c20d06231f7614UL, 0x2938218da274f972UL,
+ /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
+ 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
+ /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
+ 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
+ /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
+ 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
+ /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
+ 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
+ /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
+ 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
+ /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
+ 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
+ /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
+ 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
+ /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
+ 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
+ /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
+ 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
+ /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
+ 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
+ /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
+ 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
+ /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
+ 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
+ /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
+ 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
+ /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
+ 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
+ /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
+ 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
+ /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
+ 0x23758739f630a257UL, 0x295a407a01a78580UL,
+ /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
+ 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
+ /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
+ 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
+ /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
+ 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
+ /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
+ 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
+ /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
+ 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
+ /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
+ 0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
+ /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
+ 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
+ /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
+ 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
+ /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
+ 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
+ /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
+ 0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
+ /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
+ 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
+ /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
+ 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
+ /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
+ 0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
+ /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
+ 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
+ /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
+ 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
+ /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
+ 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
+ /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
+ 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
+ /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
+ 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
+ /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
+ 0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
+ /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
+ 0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
+ /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
+ 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
+ /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
+ 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
+ /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
+ 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
+ /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
+ 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
+ /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
+ 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
+ /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
+ 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
+ /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
+ 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
+ /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
+ 0xc189218075e91436UL, 0x6d9284169b3b8484UL,
+ /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
+ 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
+ /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
+ 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
+ /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
+ 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
+ /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
+ 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
+ /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
+ 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
+ /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
+ 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
+ /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
+ 0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
+ /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
+ 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
+ /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
+ 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
+ /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
+ 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
+ /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
+ 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
+ /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
+ 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
+ /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
+ 0x25232973322dbef4UL, 0x445dc4758c17f770UL,
+ /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
+ 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
+ /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
+ 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
+ /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
+ 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
+ /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
+ 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
+ /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
+ 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
+ /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
+ 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
+ /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
+ 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
+ /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
+ 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
+ /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
+ 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
+ /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
+ 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
+ /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
+ 0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
+ /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
+ 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
+ /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
+ 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
+ /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
+ 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
+ /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
+ 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
+ /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
+ 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
+ /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
+ 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
+ /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
+ 0x894d1d855ae52359UL, 0x68e122157b743d69UL,
+ /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
+ 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
+ /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
+ 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
+ /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
+ 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
+ /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
+ 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
+ /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
+ 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
+ /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
+ 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
+ /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
+ 0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
+ /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
+ 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
+ /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
+ 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
+ /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
+ 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
+ /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
+ 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
+ /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
+ 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
+ /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
+ 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
+ /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
+ 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
+ /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
+ 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
+ /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
+ 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
+ /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
+ 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
+ /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
+ 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
+ /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
+ 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
+ /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
+ 0x4a497962066e6043UL, 0x705b3aab41355b44UL,
+ /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
+ 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
+ /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
+ 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
+ /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
+ 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
+ /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
+ 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
+ /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
+ 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
+ /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
+ 0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
+ /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
+ 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
+ /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
+ 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
+ /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
+ 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
+ /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
+ 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
+ /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
+ 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
+ /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
+ 0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
+ /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
+ 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
+ /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
+ 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
+ /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
+ 0x508e862f121692fcUL, 0x3a81907fa093c291UL,
+ /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
+ 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
+ /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
+ 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
+ /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
+ 0xe488de11d761e352UL, 0x0e878a01a085545cUL,
+ /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
+ 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
+ /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
+ 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
+ /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
+ 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
+ /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
+ 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
+ /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
+ 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
+ /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
+ 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
+ /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
+ 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
+ /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
+ 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
+ /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
+ 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
+ /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
+ 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
+ /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
+ 0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
+ /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
+ 0x266fd5809208f294UL, 0x5c847085619a26b9UL,
+ /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
+ 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
+ /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
+ 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
+ /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
+ 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
+ /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
+ 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
+ /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
+ 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
+ /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
+ 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
+ /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
+ 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
+ /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
+ 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
+ /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
+ 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
+ /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
+ 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
+ /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
+ 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
+ /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
+ 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
+ /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
+ 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
+ /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
+ 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
+ /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
+ 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
+ /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
+ 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
+ /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
+ 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
+ /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
+ 0x52d17436309d4253UL, 0x356f97e13efae576UL,
+ /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
+ 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
+ /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
+ 0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
+ /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
+ 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
+ /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
+ 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
+ /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
+ 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
+ /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
+ 0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
+ /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
+ 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
+ /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
+ 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
+ /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
+ 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
+ /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
+ 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
+ /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
+ 0x497d723f802e88e1UL, 0x30684dea602f408dUL,
+ /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
+ 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
+ /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
+ 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
+ /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
+ 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
+ /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
+ 0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
+ /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
+ 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
+ /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
+ 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
+ /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
+ 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
+ /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
+ 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
+ /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
+ 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
+ /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
+ 0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
+ /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
+ 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
+ /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
+ 0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
+ /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
+ 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
+ /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
+ 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
+ /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
+ 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
+ /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
+ 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
+ /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
+ 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
+ /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
+ 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
+ /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
+ 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
+ /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
+ 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
+ /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
+ 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
+ /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
+ 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
+ /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
+ 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
+ /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
+ 0x81004b71e33cc191UL, 0x44e6be345122803cUL,
+ /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
+ 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
+ /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
+ 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
+ /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
+ 0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
+ /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
+ 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
+ /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
+ 0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
+ /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
+ 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
+ /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
+ 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
+ /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
+ 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
+ /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
+ 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
+ /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
+ 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
+ /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
+ 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
+ /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
+ 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
+ /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
+ 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
+ /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
+ 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
+ /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
+ 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
+ /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
+ 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
+ /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
+ 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
+ /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
+ 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
+ /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
+ 0x33979624f0e917beUL, 0x2c018dc527356b30UL,
+ /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
+ 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
+ /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
+ 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
+ /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
+ 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
+ /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
+ 0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
+ /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
+ 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
+ /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
+ 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
+ /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
+ 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
+ /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
+ 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
+ /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
+ 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
+ /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
+ 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
+ /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
+ 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
+ /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
+ 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
+ /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
+ 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
+ /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
+ 0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
+ /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
+ 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
+ /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
+ 0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
+ /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
+ 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
+ /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
+ 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
+ /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
+ 0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
+ /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
+ 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
+ /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
+ 0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
+ /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
+ 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
+ /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
+ 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
+ /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
+ 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
+ /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
+ 0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
+ /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
+ 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
+ /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
+ 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
+ /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
+ 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
+ /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
+ 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
+ /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
+ 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
+ /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
+ 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
+ /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
+ 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
+ /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
+ 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
+ /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
+ 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
+ /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
+ 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
+ /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
+ 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
+ /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
+ 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
+ /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
+ 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
+ /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
+ 0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
+ /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
+ 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
+ /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
+ 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
+ /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
+ 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
+ /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
+ 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
+ /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
+ 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
+ /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
+ 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
+ /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
+ 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
+ /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
+ 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
+ /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
+ 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
+};
+
+/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
+ * a is two 256-bit integers: a0[0:3] and a1[4:7]
+ * b is two 256-bit integers: b0[0:3] and b1[4:7]
+ */
+static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
+ const u64 *const b)
+{
+ asm volatile(
+ "xorl %%r14d, %%r14d ;"
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "adox %%r10, %%r15 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adox %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adox %%r10, %%rbx ;"
+ /******************************************/
+ "adox %%r14, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "adox %%r15, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rax ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rbx ;"
+ "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rcx ;"
+ /******************************************/
+ "adox %%r14, %%r15 ;"
+ "adcx %%r14, %%r15 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "adox %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rbx ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rcx ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%r15 ;"
+ /******************************************/
+ "adox %%r14, %%rax ;"
+ "adcx %%r14, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "adox %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%r15 ;"
+ "movq %%r15, 40(%0) ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ /******************************************/
+ "adox %%r14, %%rbx ;"
+ "adcx %%r14, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+
+ "movq 32(%1), %%rdx; " /* C[0] */
+ "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, 64(%0);"
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+ "adox %%r10, %%r15 ;"
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
+ "adox %%r8, %%rax ;"
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+ "adox %%r10, %%rbx ;"
+ /******************************************/
+ "adox %%r14, %%rcx ;"
+
+ "movq 40(%1), %%rdx; " /* C[1] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
+ "adox %%r15, %%r8 ;"
+ "movq %%r8, 72(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rax ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rbx ;"
+ "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rcx ;"
+ /******************************************/
+ "adox %%r14, %%r15 ;"
+ "adcx %%r14, %%r15 ;"
+
+ "movq 48(%1), %%rdx; " /* C[2] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
+ "adox %%rax, %%r8 ;"
+ "movq %%r8, 80(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rbx ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rcx ;"
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%r15 ;"
+ /******************************************/
+ "adox %%r14, %%rax ;"
+ "adcx %%r14, %%rax ;"
+
+ "movq 56(%1), %%rdx; " /* C[3] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
+ "adox %%rbx, %%r8 ;"
+ "movq %%r8, 88(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rcx ;"
+ "movq %%rcx, 96(%0) ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%r15 ;"
+ "movq %%r15, 104(%0) ;"
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rax ;"
+ "movq %%rax, 112(%0) ;"
+ /******************************************/
+ "adox %%r14, %%rbx ;"
+ "adcx %%r14, %%rbx ;"
+ "movq %%rbx, 120(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11", "%r13", "%r14", "%r15");
+}
+
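+/* Same two 256x256-bit products as above, for CPUs with MULX but without
+ * ADX: carries are propagated with plain add/adc chains instead of the
+ * interleaved adcx/adox chains.
+ */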
+static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
+ const u64 *const b)
+{
+ asm volatile(
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "addq %%r10, %%r15 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "addq %%r15, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r15 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r15 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r15 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "adcq %%r11, %%r15 ;"
+ "movq %%r15, 40(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+
+ "movq 32(%1), %%rdx; " /* C[0] */
+ "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
+ "movq %%r8, 64(%0) ;"
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+ "addq %%r10, %%r15 ;"
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 40(%1), %%rdx; " /* C[1] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
+ "addq %%r15, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r15 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r15 ;"
+
+ "movq 48(%1), %%rdx; " /* C[2] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 80(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r15 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 56(%1), %%rdx; " /* C[3] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 88(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 96(%0) ;"
+ "adcq %%r11, %%r15 ;"
+ "movq %%r15, 104(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 112(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 120(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11", "%r13", "%r15");
+}
+
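+/* Computes two 512-bit squares, c[0:7] = a[0:3]^2 and c[8:15] = a[4:7]^2:
+ * the off-diagonal products are accumulated once, doubled with the adcx
+ * chain, and the diagonal squares a[i]*a[i] are then added in.
+ */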
+static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 24(%1), %%rdx ;" /* A[3] */
+ "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+ "adcx %%rax, %%rbx ;"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%rbx ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%rbx, %%rbx ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq (%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%rbx ;"
+ "movq %%rbx, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+
+
+ "movq 32(%1), %%rdx ;" /* B[0] */
+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 56(%1), %%rdx ;" /* B[3] */
+ "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
+ "adcx %%rax, %%rbx ;"
+ "movq 40(%1), %%rdx ;" /* B[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%rbx ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%rbx, %%rbx ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq 32(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
+ /*******************/
+ "movq %%rax, 64(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "movq 40(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 80(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 88(%0) ;"
+ "movq 48(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 96(%0) ;"
+ "adcq %%rcx, %%rbx ;"
+ "movq %%rbx, 104(%0) ;"
+ "movq 56(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 112(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 120(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11", "%r13", "%r14", "%r15");
+}
+
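+/* BMI2-only squaring: the off-diagonal products are doubled with a
+ * shld/shl chain rather than the adcx doubling used above, then the
+ * diagonal squares are added in.
+ */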
+static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+ "movq 16(%1), %%rdx ;" /* A[2] */
+ "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r15 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r15 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r15, %%r13 ;"
+ "shldq $1, %%r11, %%r15 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r15 ;"
+ "movq %%r15, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+
+ "movq 40(%1), %%rdx ;" /* B[1] */
+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
+
+ "movq 48(%1), %%rdx ;" /* B[2] */
+ "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r15 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq 32(%1), %%rdx ;" /* B[0] */
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r15 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r15, %%r13 ;"
+ "shldq $1, %%r11, %%r15 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
+ /*******************/
+ "movq %%rax, 64(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "movq 40(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 80(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 88(%0) ;"
+ "movq 48(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 96(%0) ;"
+ "adcq %%rcx, %%r15 ;"
+ "movq %%r15, 104(%0) ;"
+ "movq 56(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 112(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 120(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+ "%r11", "%r13", "%r14", "%r15");
+}
+
+static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+		"movl $38, %%edx; "	/* 2*c = 38 = 2^256 mod 2^255-19 */
+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox (%1), %%r8 ;"
+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 8(%1), %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 16(%1), %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 24(%1), %%r11 ;"
+ /***************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
+ "adcx %%rcx, %%r8 ;"
+ "adcx %%rbx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+
+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox 64(%1), %%r8 ;"
+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 72(%1), %%r9 ;"
+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 80(%1), %%r10 ;"
+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 88(%1), %%r11 ;"
+ /****************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
+ "adcx %%rcx, %%r8 ;"
+ "adcx %%rbx, %%r9 ;"
+ "movq %%r9, 40(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 48(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 56(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 32(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11");
+}
+
+static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+		"movl $38, %%edx ; "	/* 2*c = 38 = 2^256 mod 2^255-19 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /***************************************/
+ "adcq $0, %%rcx ;"
+ "addq (%1), %%r8 ;"
+ "adcq 8(%1), %%r9 ;"
+ "adcq 16(%1), %%r10 ;"
+ "adcq 24(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+
+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /****************************************/
+ "adcq $0, %%rcx ;"
+ "addq 64(%1), %%r8 ;"
+ "adcq 72(%1), %%r9 ;"
+ "adcq 80(%1), %%r10 ;"
+ "adcq 88(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 40(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 48(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 56(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 32(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+ "%r11");
+}
+
+static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
+ const u64 *const b)
+{
+ asm volatile(
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "movq %%r10, 8(%0) ;"
+ "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
+ "adox %%r11, %%r15 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 8(%0), %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r15, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
+ "adox %%r11, %%r15 ;"
+ "adcx %%r14, %%r15 ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 16(%0), %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r15, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
+ "adox %%r11, %%r15 ;"
+ "adcx %%r14, %%r15 ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 24(%0), %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r15, %%r10 ;"
+ "movq %%r10, 32(%0) ;"
+ "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
+ "adox %%r11, %%r15 ;"
+ "adcx %%r14, %%r15 ;"
+ "movq %%r15, 40(%0) ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq %%r14, 48(%0) ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+ "movq %%rax, 56(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15");
+}
+
+static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
+ const u64 *const b)
+{
+ asm volatile(
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "addq %%r10, %%r15 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "addq %%r15, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r15 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r15 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r15 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "adcq %%r11, %%r15 ;"
+ "movq %%r15, 40(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11", "%r13", "%r15");
+}
+
+static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 24(%1), %%rdx ;" /* A[3] */
+ "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+ "adcx %%rax, %%rbx ;"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%rbx ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%rbx, %%rbx ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq (%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%rbx ;"
+ "movq %%rbx, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11", "%r13", "%r14", "%r15");
+}
+
+static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+ "movq 16(%1), %%rdx ;" /* A[2] */
+ "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r15 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r15 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r15, %%r13 ;"
+ "shldq $1, %%r11, %%r15 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r15 ;"
+ "movq %%r15, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+ "%r11", "%r13", "%r14", "%r15");
+}
+
+static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+		"movl $38, %%edx ;"	/* 2*c = 38 = 2^256 mod 2^255-19 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox (%1), %%r8 ;"
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 8(%1), %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 16(%1), %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 24(%1), %%r11 ;"
+ /***************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
+ "adcx %%rcx, %%r8 ;"
+ "adcx %%rbx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
+ "%r10", "%r11");
+}
+
+static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
+{
+ asm volatile(
+		"movl $38, %%edx ;"	/* 2*c = 38 = 2^256 mod 2^255-19 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /***************************************/
+ "adcq $0, %%rcx ;"
+ "addq (%1), %%r8 ;"
+ "adcq 8(%1), %%r9 ;"
+ "adcq 16(%1), %%r10 ;"
+ "adcq 24(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+ "%r11");
+}
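+
+/* Illustrative sketch (added commentary, not from the imported sources): the
+ * reductions above use the identity 2^256 = 2*(2^255 - 19) + 38, i.e.
+ * 2^256 == 38 (mod 2^255 - 19), folding the upper four limbs into the lower
+ * four after multiplying them by 38. A rough portable equivalent of the
+ * single-width variant, assuming a u64 type and unsigned __int128 support
+ * (assumptions of this sketch only), could look like the following; it
+ * documents the idea and is not a replacement for the assembly.
+ */
+static void red_eltfp25519_1w_portable(u64 *const c, const u64 *const a)
+{
+	unsigned __int128 acc = 0;
+	u64 top;
+	int i;
+
+	/* c = a[0..3] + 38*a[4..7], carrying across the four result limbs */
+	for (i = 0; i < 4; ++i) {
+		acc += (unsigned __int128)a[i] + (unsigned __int128)38 * a[i + 4];
+		c[i] = (u64)acc;
+		acc >>= 64;
+	}
+	top = (u64)acc;
+
+	/* Fold any remaining overflow of 2^256 back in, again as *38 */
+	while (top) {
+		acc = (unsigned __int128)c[0] + (unsigned __int128)38 * top;
+		c[0] = (u64)acc;
+		for (i = 1; i < 4; ++i) {
+			acc = (unsigned __int128)c[i] + (u64)(acc >> 64);
+			c[i] = (u64)acc;
+		}
+		top = (u64)(acc >> 64);
+	}
+	/* Like the assembly, the result may still be >= 2^255-19; the final
+	 * canonical reduction is done separately by fred_eltfp25519_1w(). */
+}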
+
+static __always_inline void
+add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
+{
+ asm volatile(
+ "mov $38, %%eax ;"
+ "xorl %%ecx, %%ecx ;"
+ "movq (%2), %%r8 ;"
+ "adcx (%1), %%r8 ;"
+ "movq 8(%2), %%r9 ;"
+ "adcx 8(%1), %%r9 ;"
+ "movq 16(%2), %%r10 ;"
+ "adcx 16(%1), %%r10 ;"
+ "movq 24(%2), %%r11 ;"
+ "adcx 24(%1), %%r11 ;"
+ "cmovc %%eax, %%ecx ;"
+ "xorl %%eax, %%eax ;"
+ "adcx %%rcx, %%r8 ;"
+ "adcx %%rax, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rax, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rax, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $38, %%ecx ;"
+ "cmovc %%ecx, %%eax ;"
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+}
+
+static __always_inline void
+add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
+{
+ asm volatile(
+ "mov $38, %%eax ;"
+ "movq (%2), %%r8 ;"
+ "addq (%1), %%r8 ;"
+ "movq 8(%2), %%r9 ;"
+ "adcq 8(%1), %%r9 ;"
+ "movq 16(%2), %%r10 ;"
+ "adcq 16(%1), %%r10 ;"
+ "movq 24(%2), %%r11 ;"
+ "adcq 24(%1), %%r11 ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+}
+
+static __always_inline void
+sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
+{
+ asm volatile(
+ "mov $38, %%eax ;"
+ "movq (%1), %%r8 ;"
+ "subq (%2), %%r8 ;"
+ "movq 8(%1), %%r9 ;"
+ "sbbq 8(%2), %%r9 ;"
+ "movq 16(%1), %%r10 ;"
+ "sbbq 16(%2), %%r10 ;"
+ "movq 24(%1), %%r11 ;"
+ "sbbq 24(%2), %%r11 ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "subq %%rcx, %%r8 ;"
+ "sbbq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "sbbq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "sbbq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "subq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(b)
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+}
+
+/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
+static __always_inline void
+mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
+{
+ const u64 a24 = 121666;
+ asm volatile(
+ "movq %2, %%rdx ;"
+ "mulx (%1), %%r8, %%r10 ;"
+ "mulx 8(%1), %%r9, %%r11 ;"
+ "addq %%r10, %%r9 ;"
+ "mulx 16(%1), %%r10, %%rax ;"
+ "adcq %%r11, %%r10 ;"
+ "mulx 24(%1), %%r11, %%rcx ;"
+ "adcq %%rax, %%r11 ;"
+ /**************************/
+ "adcq $0, %%rcx ;"
+		"movl $38, %%edx ;"	/* 2*c = 38 = 2^256 mod 2^255-19 */
+ "imul %%rdx, %%rcx ;"
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+ :
+ : "r"(c), "r"(a), "r"(a24)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
+ "%r11");
+}
+
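+/* Field inversion via Fermat's little theorem: the fixed chain of squarings
+ * and multiplications below computes c = a^(p-2) mod p for p = 2^255 - 19
+ * (the same chain is used by the _bmi2 variant further down).
+ */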
+static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+{
+ struct {
+ eltfp25519_1w_buffer buffer;
+ eltfp25519_1w x0, x1, x2;
+ } __aligned(32) m;
+ u64 *T[4];
+
+ T[0] = m.x0;
+ T[1] = c; /* x^(-1) */
+ T[2] = m.x1;
+ T[3] = m.x2;
+
+ copy_eltfp25519_1w(T[1], a);
+ sqrn_eltfp25519_1w_adx(T[1], 1);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_adx(T[2], 2);
+ mul_eltfp25519_1w_adx(T[0], a, T[2]);
+ mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_adx(T[2], 1);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 5);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 10);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+ copy_eltfp25519_1w(T[3], T[2]);
+ sqrn_eltfp25519_1w_adx(T[3], 20);
+ mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
+ sqrn_eltfp25519_1w_adx(T[3], 10);
+ mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
+ copy_eltfp25519_1w(T[0], T[3]);
+ sqrn_eltfp25519_1w_adx(T[0], 50);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 100);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 50);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
+ sqrn_eltfp25519_1w_adx(T[2], 5);
+ mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
+
+ memzero_explicit(&m, sizeof(m));
+}
+
+static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
+{
+ struct {
+ eltfp25519_1w_buffer buffer;
+ eltfp25519_1w x0, x1, x2;
+ } __aligned(32) m;
+	u64 *T[4];
+
+ T[0] = m.x0;
+ T[1] = c; /* x^(-1) */
+ T[2] = m.x1;
+ T[3] = m.x2;
+
+ copy_eltfp25519_1w(T[1], a);
+ sqrn_eltfp25519_1w_bmi2(T[1], 1);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 2);
+ mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 1);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 10);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+ copy_eltfp25519_1w(T[3], T[2]);
+ sqrn_eltfp25519_1w_bmi2(T[3], 20);
+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
+ sqrn_eltfp25519_1w_bmi2(T[3], 10);
+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
+ copy_eltfp25519_1w(T[0], T[3]);
+ sqrn_eltfp25519_1w_bmi2(T[0], 50);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 100);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 50);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
+
+ memzero_explicit(&m, sizeof(m));
+}
+
+/* Given c, a 256-bit number, fred_eltfp25519_1w reduces c in place
+ * so that 0 <= c < 2^255-19.
+ */
+static __always_inline void fred_eltfp25519_1w(u64 *const c)
+{
+ u64 tmp0 = 38, tmp1 = 19;
+ asm volatile(
+ "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
+ "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */
+
+ /* Add either 19 or 38 to c */
+ "addq %4, %0 ;"
+ "adcq $0, %1 ;"
+ "adcq $0, %2 ;"
+ "adcq $0, %3 ;"
+
+ /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
+ "movl $0, %k4 ;"
+ "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
+ "btrq $63, %3 ;" /* Clear bit 255 */
+
+ /* Subtract 19 if necessary */
+ "subq %4, %0 ;"
+ "sbbq $0, %1 ;"
+ "sbbq $0, %2 ;"
+ "sbbq $0, %3 ;"
+
+ : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
+ "+r"(tmp1)
+ :
+ : "memory", "cc");
+}
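+
+/* Illustrative sketch (added commentary, not from the imported sources): a
+ * portable rendering of the same final reduction, assuming a u64 type (an
+ * assumption of this sketch only). Bit 255 is folded using 2^255 == 19
+ * (mod p), 19 is added speculatively, and the top bit of the sum then
+ * decides whether the speculative 19 is taken back; the assembly above does
+ * the same branchlessly with cmov.
+ */
+static void fred_eltfp25519_1w_portable(u64 *const c)
+{
+	u64 bit, delta, t;
+	int i;
+
+	/* Fold bit 255 (2^255 == 19 mod p) and speculatively add 19 */
+	bit = c[3] >> 63;
+	c[3] &= ~(1ULL << 63);
+	delta = 19 + 19 * bit;	/* 38 if bit 255 was set, else 19 */
+	for (i = 0; i < 4; ++i) {
+		t = c[i] + delta;
+		delta = (t < c[i]);	/* carry into the next limb */
+		c[i] = t;
+	}
+
+	/* If bit 255 of the sum is clear, the value was already < p: take the
+	 * speculative 19 back. Either way clear bit 255, which together with
+	 * the kept +19 subtracts p exactly once when c was >= p. */
+	bit = c[3] >> 63;
+	c[3] &= ~(1ULL << 63);
+	delta = 19 * (1 - bit);
+	for (i = 0; i < 4; ++i) {
+		t = c[i] - delta;
+		delta = (t > c[i]);	/* borrow from the next limb */
+		c[i] = t;
+	}
+}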
+
+static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
+{
+ u64 temp;
+ asm volatile(
+ "test %9, %9 ;"
+ "movq %0, %8 ;"
+ "cmovnzq %4, %0 ;"
+ "cmovnzq %8, %4 ;"
+ "movq %1, %8 ;"
+ "cmovnzq %5, %1 ;"
+ "cmovnzq %8, %5 ;"
+ "movq %2, %8 ;"
+ "cmovnzq %6, %2 ;"
+ "cmovnzq %8, %6 ;"
+ "movq %3, %8 ;"
+ "cmovnzq %7, %3 ;"
+ "cmovnzq %8, %7 ;"
+ : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
+ "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
+ "=r"(temp)
+ : "r"(bit)
+ : "cc"
+ );
+}
+
+static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
+{
+ asm volatile(
+ "test %4, %4 ;"
+ "cmovnzq %5, %0 ;"
+ "cmovnzq %6, %1 ;"
+ "cmovnzq %7, %2 ;"
+ "cmovnzq %8, %3 ;"
+ : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
+ : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
+ : "cc"
+ );
+}
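+
+/* Added commentary, not from the imported sources: cswap() and cselect()
+ * above do their conditional swap/select with test + cmovnz so that neither
+ * the instruction flow nor the memory access pattern depends on the secret
+ * bit. A portable constant-time equivalent of cswap(), assuming a u64 type
+ * (an assumption of this sketch only), is the usual mask-and-xor form:
+ */
+static void cswap_portable(u64 bit, u64 *const px, u64 *const py)
+{
+	u64 mask = 0 - (u64)bit;	/* all ones if bit == 1, else zero */
+	u64 t;
+	int i;
+
+	for (i = 0; i < 4; ++i) {
+		t = mask & (px[i] ^ py[i]);
+		px[i] ^= t;
+		py[i] ^= t;
+	}
+}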
+
+static __always_inline void clamp_secret(u8 secret[CURVE25519_POINT_SIZE])
+{
+ secret[0] &= 248;
+ secret[31] &= 127;
+ secret[31] |= 64;
+}
+
+static void curve25519_adx(u8 shared[CURVE25519_POINT_SIZE],
+ const u8 private_key[CURVE25519_POINT_SIZE],
+ const u8 session_key[CURVE25519_POINT_SIZE])
+{
+ struct {
+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ u64 workspace[6 * NUM_WORDS_ELTFP25519];
+ u8 session[CURVE25519_POINT_SIZE];
+ u8 private[CURVE25519_POINT_SIZE];
+ } __aligned(32) m;
+
+ int i = 0, j = 0;
+ u64 prev = 0;
+ u64 *const X1 = (u64 *)m.session;
+ u64 *const key = (u64 *)m.private;
+ u64 *const Px = m.coordinates + 0;
+ u64 *const Pz = m.coordinates + 4;
+ u64 *const Qx = m.coordinates + 8;
+ u64 *const Qz = m.coordinates + 12;
+ u64 *const X2 = Qx;
+ u64 *const Z2 = Qz;
+ u64 *const X3 = Px;
+ u64 *const Z3 = Pz;
+ u64 *const X2Z2 = Qx;
+ u64 *const X3Z3 = Px;
+
+ u64 *const A = m.workspace + 0;
+ u64 *const B = m.workspace + 4;
+ u64 *const D = m.workspace + 8;
+ u64 *const C = m.workspace + 12;
+ u64 *const DA = m.workspace + 16;
+ u64 *const CB = m.workspace + 20;
+ u64 *const AB = A;
+ u64 *const DC = D;
+ u64 *const DACB = DA;
+
+ memcpy(m.private, private_key, sizeof(m.private));
+ memcpy(m.session, session_key, sizeof(m.session));
+
+ clamp_secret(m.private);
+
+ /* As in the draft:
+ * When receiving such an array, implementations of curve25519
+ * MUST mask the most-significant bit in the final byte. This
+ * is done to preserve compatibility with point formats which
+ * reserve the sign bit for use in other protocols and to
+ * increase resistance to implementation fingerprinting
+ */
+ m.session[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
+
+ copy_eltfp25519_1w(Px, X1);
+ setzero_eltfp25519_1w(Pz);
+ setzero_eltfp25519_1w(Qx);
+ setzero_eltfp25519_1w(Qz);
+
+ Pz[0] = 1;
+ Qx[0] = 1;
+
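+	/* Each iteration below performs one step of the Montgomery x-only
+	 * ladder. In the notation of the inline comments this is a sketch of
+	 * the usual step formulas with a24 = 121666:
+	 *   A = X2+Z2, B = X2-Z2, C = X3+Z3, D = X3-Z3,
+	 *   DA = D*A, CB = C*B,
+	 *   X3' = (DA+CB)^2, Z3' = X1*(DA-CB)^2,
+	 *   AA = A^2, BB = B^2, E = AA-BB,
+	 *   X2' = AA*BB, Z2' = E*(BB + a24*E),
+	 * where the cselect() calls, keyed on the current scalar bit, play the
+	 * role of the usual ladder swap. (Added commentary, not part of the
+	 * imported sources.)
+	 */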
+ /* main-loop */
+ prev = 0;
+ j = 62;
+ for (i = 3; i >= 0; --i) {
+ while (j >= 0) {
+ u64 bit = (key[i] >> j) & 0x1;
+ u64 swap = bit ^ prev;
+ prev = bit;
+
+ add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
+ add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
+ mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
+
+ cselect(swap, A, C);
+ cselect(swap, B, D);
+
+ sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
+ add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
+ sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+
+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
+
+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
+ add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
+ mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+ mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
+ --j;
+ }
+ j = 63;
+ }
+
+ inv_eltfp25519_1w_adx(A, Qz);
+ mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
+ fred_eltfp25519_1w((u64 *)shared);
+
+ memzero_explicit(&m, sizeof(m));
+}
+
+static void curve25519_adx_base(u8 session_key[CURVE25519_POINT_SIZE],
+ const u8 private_key[CURVE25519_POINT_SIZE])
+{
+ struct {
+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ u64 workspace[4 * NUM_WORDS_ELTFP25519];
+ u8 private[CURVE25519_POINT_SIZE];
+ } __aligned(32) m;
+
+ const int ite[4] = { 64, 64, 64, 63 };
+ const int q = 3;
+ u64 swap = 1;
+
+ int i = 0, j = 0, k = 0;
+ u64 *const key = (u64 *)m.private;
+ u64 *const Ur1 = m.coordinates + 0;
+ u64 *const Zr1 = m.coordinates + 4;
+ u64 *const Ur2 = m.coordinates + 8;
+ u64 *const Zr2 = m.coordinates + 12;
+
+ u64 *const UZr1 = m.coordinates + 0;
+ u64 *const ZUr2 = m.coordinates + 8;
+
+ u64 *const A = m.workspace + 0;
+ u64 *const B = m.workspace + 4;
+ u64 *const C = m.workspace + 8;
+ u64 *const D = m.workspace + 12;
+
+ u64 *const AB = m.workspace + 0;
+ u64 *const CD = m.workspace + 8;
+
+ const u64 *const P = table_ladder_8k;
+
+ memcpy(m.private, private_key, sizeof(m.private));
+
+ clamp_secret(m.private);
+
+ setzero_eltfp25519_1w(Ur1);
+ setzero_eltfp25519_1w(Zr1);
+ setzero_eltfp25519_1w(Zr2);
+ Ur1[0] = 1;
+ Zr1[0] = 1;
+ Zr2[0] = 1;
+
+ /* G-S */
+ Ur2[3] = 0x1eaecdeee27cab34UL;
+ Ur2[2] = 0xadc7a0b9235d48e2UL;
+ Ur2[1] = 0xbbf095ae14b2edf8UL;
+ Ur2[0] = 0x7e94e1fec82faabdUL;
+
+ /* main-loop */
+ j = q;
+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+ while (j < ite[i]) {
+ u64 bit = (key[i] >> j) & 0x1;
+ k = (64 * i + j - q);
+ swap = swap ^ bit;
+ cswap(swap, Ur1, Ur2);
+ cswap(swap, Zr1, Zr2);
+ swap = bit;
+ /* Addition */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+			mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0*B */
+ sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+ add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+ sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
+ mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+ ++j;
+ }
+ j = 0;
+ }
+
+ /* Doubling */
+ for (i = 0; i < q; ++i) {
+ add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */
+ copy_eltfp25519_1w(C, B); /* C = B */
+ sub_eltfp25519_1w(B, A, B); /* B = A-B */
+ mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
+ add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
+ mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
+ }
+
+ /* Convert to affine coordinates */
+ inv_eltfp25519_1w_adx(A, Zr1);
+ mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
+ fred_eltfp25519_1w((u64 *)session_key);
+
+ memzero_explicit(&m, sizeof(m));
+}
+
+static void curve25519_bmi2(u8 shared[CURVE25519_POINT_SIZE],
+ const u8 private_key[CURVE25519_POINT_SIZE],
+ const u8 session_key[CURVE25519_POINT_SIZE])
+{
+ struct {
+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ u64 workspace[6 * NUM_WORDS_ELTFP25519];
+ u8 session[CURVE25519_POINT_SIZE];
+ u8 private[CURVE25519_POINT_SIZE];
+ } __aligned(32) m;
+
+ int i = 0, j = 0;
+ u64 prev = 0;
+ u64 *const X1 = (u64 *)m.session;
+ u64 *const key = (u64 *)m.private;
+ u64 *const Px = m.coordinates + 0;
+ u64 *const Pz = m.coordinates + 4;
+ u64 *const Qx = m.coordinates + 8;
+ u64 *const Qz = m.coordinates + 12;
+ u64 *const X2 = Qx;
+ u64 *const Z2 = Qz;
+ u64 *const X3 = Px;
+ u64 *const Z3 = Pz;
+ u64 *const X2Z2 = Qx;
+ u64 *const X3Z3 = Px;
+
+ u64 *const A = m.workspace + 0;
+ u64 *const B = m.workspace + 4;
+ u64 *const D = m.workspace + 8;
+ u64 *const C = m.workspace + 12;
+ u64 *const DA = m.workspace + 16;
+ u64 *const CB = m.workspace + 20;
+ u64 *const AB = A;
+ u64 *const DC = D;
+ u64 *const DACB = DA;
+
+ memcpy(m.private, private_key, sizeof(m.private));
+ memcpy(m.session, session_key, sizeof(m.session));
+
+ clamp_secret(m.private);
+
+ /* As in the draft:
+ * When receiving such an array, implementations of curve25519
+ * MUST mask the most-significant bit in the final byte. This
+ * is done to preserve compatibility with point formats which
+ * reserve the sign bit for use in other protocols and to
+ * increase resistance to implementation fingerprinting
+ */
+ m.session[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
+
+ copy_eltfp25519_1w(Px, X1);
+ setzero_eltfp25519_1w(Pz);
+ setzero_eltfp25519_1w(Qx);
+ setzero_eltfp25519_1w(Qz);
+
+ Pz[0] = 1;
+ Qx[0] = 1;
+
+ /* main-loop */
+ prev = 0;
+ j = 62;
+ for (i = 3; i >= 0; --i) {
+ while (j >= 0) {
+ u64 bit = (key[i] >> j) & 0x1;
+ u64 swap = bit ^ prev;
+ prev = bit;
+
+ add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
+ add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
+ mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
+
+ cselect(swap, A, C);
+ cselect(swap, B, D);
+
+ sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
+ add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
+ sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+
+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
+
+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
+ add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
+ mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+ mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
+ --j;
+ }
+ j = 63;
+ }
+
+ inv_eltfp25519_1w_bmi2(A, Qz);
+ mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
+ fred_eltfp25519_1w((u64 *)shared);
+
+ memzero_explicit(&m, sizeof(m));
+}
+
+static void curve25519_bmi2_base(u8 session_key[CURVE25519_POINT_SIZE],
+ const u8 private_key[CURVE25519_POINT_SIZE])
+{
+ struct {
+ u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ u64 workspace[4 * NUM_WORDS_ELTFP25519];
+ u8 private[CURVE25519_POINT_SIZE];
+ } __aligned(32) m;
+
+ const int ite[4] = { 64, 64, 64, 63 };
+ const int q = 3;
+ u64 swap = 1;
+
+ int i = 0, j = 0, k = 0;
+ u64 *const key = (u64 *)m.private;
+ u64 *const Ur1 = m.coordinates + 0;
+ u64 *const Zr1 = m.coordinates + 4;
+ u64 *const Ur2 = m.coordinates + 8;
+ u64 *const Zr2 = m.coordinates + 12;
+
+ u64 *const UZr1 = m.coordinates + 0;
+ u64 *const ZUr2 = m.coordinates + 8;
+
+ u64 *const A = m.workspace + 0;
+ u64 *const B = m.workspace + 4;
+ u64 *const C = m.workspace + 8;
+ u64 *const D = m.workspace + 12;
+
+ u64 *const AB = m.workspace + 0;
+ u64 *const CD = m.workspace + 8;
+
+ const u64 *const P = table_ladder_8k;
+
+ memcpy(m.private, private_key, sizeof(m.private));
+
+ clamp_secret(m.private);
+
+ setzero_eltfp25519_1w(Ur1);
+ setzero_eltfp25519_1w(Zr1);
+ setzero_eltfp25519_1w(Zr2);
+ Ur1[0] = 1;
+ Zr1[0] = 1;
+ Zr2[0] = 1;
+
+ /* G-S */
+ Ur2[3] = 0x1eaecdeee27cab34UL;
+ Ur2[2] = 0xadc7a0b9235d48e2UL;
+ Ur2[1] = 0xbbf095ae14b2edf8UL;
+ Ur2[0] = 0x7e94e1fec82faabdUL;
+
+ /* main-loop */
+ j = q;
+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
+ while (j < ite[i]) {
+ u64 bit = (key[i] >> j) & 0x1;
+ k = (64 * i + j - q);
+ swap = swap ^ bit;
+ cswap(swap, Ur1, Ur2);
+ cswap(swap, Zr1, Zr2);
+ swap = bit;
+ /* Addition */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+			mul_eltfp25519_1w_bmi2(C, &P[4 * k], B); /* C = M0*B */
+ sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+ add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+ sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
+ mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+ ++j;
+ }
+ j = 0;
+ }
+
+ /* Doubling */
+ for (i = 0; i < q; ++i) {
+ add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
+ sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
+ sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */
+ copy_eltfp25519_1w(C, B); /* C = B */
+ sub_eltfp25519_1w(B, A, B); /* B = A-B */
+ mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
+ add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
+ mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
+ }
+
+ /* Convert to affine coordinates */
+ inv_eltfp25519_1w_bmi2(A, Zr1);
+ mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
+ fred_eltfp25519_1w((u64 *)session_key);
+
+ memzero_explicit(&m, sizeof(m));
+}
diff --git a/src/crypto/zinc/curve25519/curve25519.c b/src/crypto/zinc/curve25519/curve25519.c
new file mode 100644
index 0000000..183eeb5
--- /dev/null
+++ b/src/crypto/zinc/curve25519/curve25519.c
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the Curve25519 ECDH algorithm, using either
+ * a 32-bit backend or a 64-bit backend with 128-bit integers, depending on
+ * what the target compiler supports.
+ *
+ * Information: https://cr.yp.to/ecdh.html
+ */
+
+#include <zinc/curve25519.h>
+
+#include <asm/unaligned.h>
+#include <linux/version.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <crypto/algapi.h>
+
+#ifndef HAVE_CURVE25519_ARCH_IMPLEMENTATION
+void __init curve25519_fpu_init(void)
+{
+}
+static inline bool curve25519_arch(u8 mypublic[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE],
+ const u8 basepoint[CURVE25519_POINT_SIZE])
+{
+ return false;
+}
+static inline bool curve25519_base_arch(u8 pub[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE])
+{
+ return false;
+}
+#endif
+
+static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
+{
+ secret[0] &= 248;
+ secret[31] &= 127;
+ secret[31] |= 64;
+}
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+#include "curve25519-hacl64.h"
+#else
+#include "curve25519-fiat32.h"
+#endif
+
+static const u8 null_point[CURVE25519_POINT_SIZE] = { 0 };
+
+bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE],
+ const u8 basepoint[CURVE25519_POINT_SIZE])
+{
+ if (!curve25519_arch(mypublic, secret, basepoint))
+ curve25519_generic(mypublic, secret, basepoint);
+ return crypto_memneq(mypublic, null_point, CURVE25519_POINT_SIZE);
+}
+EXPORT_SYMBOL(curve25519);
+
+bool curve25519_generate_public(u8 pub[CURVE25519_POINT_SIZE],
+ const u8 secret[CURVE25519_POINT_SIZE])
+{
+ static const u8 basepoint[CURVE25519_POINT_SIZE] __aligned(32) = { 9 };
+
+ if (unlikely(!crypto_memneq(secret, null_point, CURVE25519_POINT_SIZE)))
+ return false;
+
+ if (curve25519_base_arch(pub, secret))
+ return crypto_memneq(pub, null_point, CURVE25519_POINT_SIZE);
+ return curve25519(pub, secret, basepoint);
+}
+EXPORT_SYMBOL(curve25519_generate_public);
+
+void curve25519_generate_secret(u8 secret[CURVE25519_POINT_SIZE])
+{
+ get_random_bytes_wait(secret, CURVE25519_POINT_SIZE);
+ normalize_secret(secret);
+}
+EXPORT_SYMBOL(curve25519_generate_secret);
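+
+/* Illustrative usage sketch (added commentary, not part of this file): a
+ * hypothetical caller doing a full ECDH exchange with the three helpers
+ * above. The peer_public parameter and the example function itself are
+ * assumptions made purely for illustration.
+ */
+static bool __maybe_unused curve25519_ecdh_example(
+	u8 shared[CURVE25519_POINT_SIZE],
+	const u8 peer_public[CURVE25519_POINT_SIZE])
+{
+	u8 secret[CURVE25519_POINT_SIZE];
+	u8 pub[CURVE25519_POINT_SIZE];
+	bool ret;
+
+	curve25519_generate_secret(secret);	/* random, clamped private key */
+	if (!curve25519_generate_public(pub, secret)) {
+		memzero_explicit(secret, sizeof(secret));
+		return false;	/* rejects an all-zero secret or result */
+	}
+	/* false means the result was the all-zero point and must be rejected */
+	ret = curve25519(shared, secret, peer_public);
+	memzero_explicit(secret, sizeof(secret));
+	return ret;
+}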
+
+#include "../selftest/curve25519.h"
diff --git a/src/crypto/zinc/poly1305/poly1305-arm-glue.h b/src/crypto/zinc/poly1305/poly1305-arm-glue.h
new file mode 100644
index 0000000..53f8fec
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-arm-glue.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && \
+ (defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
+#endif
+
+static bool poly1305_use_neon __ro_after_init;
+
+void __init poly1305_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ poly1305_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ poly1305_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ poly1305_init_arm(ctx, key);
+ return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+ if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) {
+ poly1305_blocks_neon(ctx, inp, len, padbit);
+ return true;
+ }
+#endif
+ poly1305_blocks_arm(ctx, inp, len, padbit);
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+ if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) {
+ poly1305_emit_neon(ctx, mac, nonce);
+ return true;
+ }
+#endif
+ poly1305_emit_arm(ctx, mac, nonce);
+ return true;
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
diff --git a/src/crypto/zinc/poly1305/poly1305-arm.S b/src/crypto/zinc/poly1305/poly1305-arm.S
new file mode 100644
index 0000000..277a11a
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-arm.S
@@ -0,0 +1,1117 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from OpenSSL.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.align 5
+ENTRY(poly1305_init_arm)
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp r1,#0
+ str r3,[r0,#0] @ zero hash value
+ str r3,[r0,#4]
+ str r3,[r0,#8]
+ str r3,[r0,#12]
+ str r3,[r0,#16]
+ str r3,[r0,#36] @ is_base2_26
+ add r0,r0,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+ ldrb r4,[r1,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[r1,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[r1,#2]
+ ldrb r7,[r1,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[r1,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[r1,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[r1,#6]
+ and r4,r4,r10
+
+ ldrb r8,[r1,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[r1,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[r1,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[r1,#10]
+ and r5,r5,r3
+
+ ldrb r9,[r1,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[r1,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[r1,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[r1,#14]
+ and r6,r6,r3
+
+ ldrb r10,[r1,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[r0,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[r0,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[r0,#8]
+ and r7,r7,r3
+ str r7,[r0,#12]
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __LINUX_ARM_ARCH__ >= 5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_init_arm)
+
+.align 5
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
+ stmdb sp!,{r3-r11,lr}
+
+ ands r2,r2,#-16
+ beq .Lno_data
+
+ cmp r3,#0
+ add r2,r2,r1 @ end pointer
+ sub sp,sp,#32
+
+ ldmia r0,{r4-r12} @ load context
+
+ str r0,[sp,#12] @ offload stuff
+ mov lr,r1
+ str r2,[sp,#16]
+ str r10,[sp,#20]
+ str r11,[sp,#24]
+ str r12,[sp,#28]
+ b .Loop
+
+.Loop:
+#if __LINUX_ARM_ARCH__ < 7
+ ldrb r0,[lr],#16 @ load input
+#ifdef __thumb2__
+ it hi
+#endif
+ addhi r8,r8,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds r4,r4,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs r5,r5,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs r6,r6,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add r10,r10,r10,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+#ifdef __thumb2__
+ it hi
+#endif
+ addhi r8,r8,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ adds r4,r4,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs r5,r5,r1
+ add r10,r10,r10,lsr#2
+ adcs r6,r6,r2
+#endif
+ add r11,r11,r11,lsr#2
+ adcs r7,r7,r3
+ add r12,r12,r12,lsr#2
+
+ umull r2,r3,r5,r9
+ adc r8,r8,#0
+ umull r0,r1,r4,r9
+ umlal r2,r3,r8,r10
+ umlal r0,r1,r7,r10
+ ldr r10,[sp,#20] @ reload r10
+ umlal r2,r3,r6,r12
+ umlal r0,r1,r5,r12
+ umlal r2,r3,r7,r11
+ umlal r0,r1,r6,r11
+ umlal r2,r3,r4,r10
+ str r0,[sp,#0] @ future r4
+ mul r0,r11,r8
+ ldr r11,[sp,#24] @ reload r11
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future r6
+ str r2,[sp,#4] @ future r5
+
+ mul r2,r12,r8
+ eor r3,r3,r3
+ umlal r0,r1,r7,r12
+ ldr r12,[sp,#28] @ reload r12
+ umlal r2,r3,r7,r9
+ umlal r0,r1,r6,r9
+ umlal r2,r3,r6,r10
+ umlal r0,r1,r5,r10
+ umlal r2,r3,r5,r11
+ umlal r0,r1,r4,r11
+ umlal r2,r3,r4,r12
+ ldr r4,[sp,#0]
+ mul r8,r9,r8
+ ldr r5,[sp,#4]
+
+ adds r6,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds r7,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add r8,r8,r3 @ h4+=d3>>32
+
+ and r1,r8,#-4
+ and r8,r8,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds r4,r4,r1
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adcs r7,r7,#0
+ adc r8,r8,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr r0,[sp,#12]
+ add sp,sp,#32
+ stmia r0,{r4-r8} @ store the result
+
+.Lno_data:
+#if __LINUX_ARM_ARCH__ >= 5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_blocks_arm)
+
+.align 5
+ENTRY(poly1305_emit_arm)
+ stmdb sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+ ldmia r0,{r3-r7}
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r7,r7,#0
+ tst r7,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r3,r8
+ ldr r8,[r2,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r4,r9
+ ldr r9,[r2,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r5,r10
+ ldr r10,[r2,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+#if __LINUX_ARM_ARCH__ >= 7
+#ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+#endif
+ str r3,[r1,#0]
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+#else
+ strb r3,[r1,#0]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#4]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#8]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#12]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#1]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#5]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#9]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#13]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#2]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#6]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#10]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#14]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#3]
+ strb r4,[r1,#7]
+ strb r5,[r1,#11]
+ strb r6,[r1,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __LINUX_ARM_ARCH__ >= 5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_emit_arm)
+
+
+#if __LINUX_ARM_ARCH__ >= 7
+.fpu neon
+
+.align 5
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
+ ldr r4,[r0,#20] @ load key base 2^32
+ ldr r5,[r0,#24]
+ ldr r6,[r0,#28]
+ ldr r7,[r0,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 d0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 d1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 d2,r2
+ vdup.32 d3,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 d4,r3
+ vdup.32 d5,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 d6,r4
+ vdup.32 d7,r6
+ vdup.32 d8,r5
+
+ mov r5,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
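+	@
+	@ The 5*rN terms come from the prime 2^130 - 5: 2^130 == 5 mod p.
+	@ A product h[i]*r[j] carries weight 2^(26*(i+j)); once i+j >= 5 that
+	@ weight wraps past 2^130 and picks up a factor of 5. For instance
+	@ h4*r1 sits exactly at 2^130 == 5, hence the 5*r1 term in d0 above.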
+
+ vmull.u32 q5,d0,d0[1]
+ vmull.u32 q6,d1,d0[1]
+ vmull.u32 q7,d3,d0[1]
+ vmull.u32 q8,d5,d0[1]
+ vmull.u32 q9,d7,d0[1]
+
+ vmlal.u32 q5,d7,d2[1]
+ vmlal.u32 q6,d0,d1[1]
+ vmlal.u32 q7,d1,d1[1]
+ vmlal.u32 q8,d3,d1[1]
+ vmlal.u32 q9,d5,d1[1]
+
+ vmlal.u32 q5,d5,d4[1]
+ vmlal.u32 q6,d7,d4[1]
+ vmlal.u32 q8,d1,d3[1]
+ vmlal.u32 q7,d0,d3[1]
+ vmlal.u32 q9,d3,d3[1]
+
+ vmlal.u32 q5,d3,d6[1]
+ vmlal.u32 q8,d0,d5[1]
+ vmlal.u32 q6,d5,d6[1]
+ vmlal.u32 q7,d7,d6[1]
+ vmlal.u32 q9,d1,d5[1]
+
+ vmlal.u32 q8,d7,d8[1]
+ vmlal.u32 q5,d1,d8[1]
+ vmlal.u32 q6,d3,d8[1]
+ vmlal.u32 q7,d5,d8[1]
+ vmlal.u32 q9,d0,d7[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+	@ The result of multiplying an n-bit number by an m-bit number is
+	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
+	@ an m-bit number multiplied by 2^n is still n+m bits wide.
+	@
+	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
+	@ is n+2 bits wide, and so is the sum of four. The sum of 2^m
+	@ (n-m)-bit numbers and one n-bit number is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+	@ 8 * (2^52) or 2^55. However, the value is then multiplied
+	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+	@ which is less than 32 * (2^52) or 2^57. And when processing
+	@ data we are looking at three times as many addends...
+ @
+	@ In the key setup procedure, the pre-reduced H0 is limited by
+	@ 5*4+1 52-bit addends, and 5*H4 by 5*5 of them, or 57 bits. But
+	@ when hashing the input, H0 is limited by (5*4+1)*3 addends, or
+	@ 58 bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this
+	@ relevant? The vmlal.u32 instruction accepts 2x32-bit input and
+	@ writes a 2x64-bit result. This means that the result of the
+	@ reduction has to be compressed upon loop wrap-around. This can
+	@ be done in the process of reduction to minimize the number of
+	@ instructions [as well as the number of 128-bit instructions,
+	@ which benefits low-end processors], but one has to watch for H2
+	@ (which is narrower than H0) and 5*H4 not being wider than 58
+	@ bits, so that the result of the right shift by 26 bits fits in
+	@ 32 bits. This is also useful on x86, because it allows the use
+	@ of paddd in place of paddq, which benefits Atom, where paddq is
+	@ ridiculously slow.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vbic.i32 d18,#0xfc000000
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vadd.i32 d10,d10,d30 @ h4 -> h0
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vbic.i32 d14,#0xfc000000
+
+ vshr.u32 d30,d10,#26
+ vbic.i32 d10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+
+ subs r5,r5,#1
+ beq .Lsquare_break_neon
+
+ add r6,r0,#(48+0*9*4)
+ add r7,r0,#(48+1*9*4)
+
+ vtrn.32 d0,d10 @ r^2:r^1
+ vtrn.32 d3,d14
+ vtrn.32 d5,d16
+ vtrn.32 d1,d12
+ vtrn.32 d7,d18
+
+ vshl.u32 d4,d3,#2 @ *5
+ vshl.u32 d6,d5,#2
+ vshl.u32 d2,d1,#2
+ vshl.u32 d8,d7,#2
+ vadd.i32 d4,d4,d3
+ vadd.i32 d2,d2,d1
+ vadd.i32 d6,d6,d5
+ vadd.i32 d8,d8,d7
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6,:32]
+ vst1.32 {d8[1]},[r7,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add r6,r0,#(48+2*4*9)
+ add r7,r0,#(48+3*4*9)
+
+ vmov d0,d10 @ r^4:r^3
+ vshl.u32 d2,d12,#2 @ *5
+ vmov d1,d12
+ vshl.u32 d4,d14,#2
+ vmov d3,d14
+ vshl.u32 d6,d16,#2
+ vmov d5,d16
+ vshl.u32 d8,d18,#2
+ vmov d7,d18
+ vadd.i32 d2,d2,d12
+ vadd.i32 d4,d4,d14
+ vadd.i32 d6,d6,d16
+ vadd.i32 d8,d8,d18
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6]
+ vst1.32 {d8[1]},[r7]
+
+ bx lr @ bx lr
+ENDPROC(poly1305_init_neon)
+
+.align 5
+ENTRY(poly1305_blocks_neon)
+ ldr ip,[r0,#36] @ is_base2_26
+ ands r2,r2,#-16
+ beq .Lno_data_neon
+
+ cmp r2,#64
+ bhs .Lenter_neon
+ tst ip,ip @ is_base2_26?
+ beq .Lpoly1305_blocks_arm
+
+.Lenter_neon:
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[r0,#0] @ load hash value base 2^32
+ ldr r5,[r0,#4]
+ ldr r6,[r0,#8]
+ ldr r7,[r0,#12]
+ ldr ip,[r0,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor d10,d10,d10
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor d12,d12,d12
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor d14,d14,d14
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor d16,d16,d16
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor d18,d18,d18
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[r0,#36] @ is_base2_26
+
+ vmov.32 d10[0],r2
+ vmov.32 d12[0],r3
+ vmov.32 d14[0],r4
+ vmov.32 d16[0],r5
+ vmov.32 d18[0],r6
+ adr r5,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor d10,d10,d10
+ veor d12,d12,d12
+ veor d14,d14,d14
+ veor d16,d16,d16
+ veor d18,d18,d18
+ vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ adr r5,.Lzeros
+ vld1.32 {d18[0]},[r0]
+ sub r0,r0,#16 @ rewind
+
+.Lbase2_32_neon:
+ add r4,r1,#32
+ mov r3,r3,lsl#24
+ tst r2,#31
+ beq .Leven
+
+ vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
+ vmov.32 d28[0],r3
+ sub r2,r2,#16
+ add r4,r1,#32
+
+#ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+#endif
+ vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
+ vshl.u32 d26,d26,#18
+
+ vsri.u32 d26,d24,#14
+ vshl.u32 d24,d24,#12
+ vadd.i32 d29,d28,d18 @ add hash value and move to #hi
+
+ vbic.i32 d26,#0xfc000000
+ vsri.u32 d24,d22,#20
+ vshl.u32 d22,d22,#6
+
+ vbic.i32 d24,#0xfc000000
+ vsri.u32 d22,d20,#26
+ vadd.i32 d27,d26,d16
+
+ vbic.i32 d20,#0xfc000000
+ vbic.i32 d22,#0xfc000000
+ vadd.i32 d25,d24,d14
+
+ vadd.i32 d21,d20,d10
+ vadd.i32 d23,d22,d12
+
+ mov r7,r5
+ add r6,r0,#48
+
+ cmp r2,r2
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs r2,r2,#64
+ it lo
+ movlo r4,r5
+
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+ itt hi
+ addhi r7,r0,#(48+1*9*4)
+ addhi r6,r0,#(48+3*9*4)
+
+#ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+#endif
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vshl.u32 q13,q13,#18
+
+ vsri.u32 q13,q12,#14
+ vshl.u32 q12,q12,#12
+
+ vbic.i32 q13,#0xfc000000
+ vsri.u32 q12,q11,#20
+ vshl.u32 q11,q11,#6
+
+ vbic.i32 q12,#0xfc000000
+ vsri.u32 q11,q10,#26
+
+ vbic.i32 q10,#0xfc000000
+ vbic.i32 q11,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ ___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ ___________________/ ____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
+ vmull.u32 q7,d25,d0[1]
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0[1]
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0[1]
+ vmlal.u32 q7,d23,d1[1]
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0[1]
+
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0[1]
+ subs r2,r2,#64
+ vmlal.u32 q5,d29,d2[1]
+ it lo
+ movlo r4,r5
+ vmlal.u32 q8,d25,d1[1]
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q6,d21,d1[1]
+ vmlal.u32 q9,d27,d1[1]
+
+ vmlal.u32 q5,d27,d4[1]
+ vmlal.u32 q8,d23,d3[1]
+ vmlal.u32 q9,d25,d3[1]
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d21,d3[1]
+
+ vmlal.u32 q8,d21,d5[1]
+ vmlal.u32 q5,d25,d6[1]
+ vmlal.u32 q9,d23,d5[1]
+ vmlal.u32 q6,d27,d6[1]
+ vmlal.u32 q7,d29,d6[1]
+
+ vmlal.u32 q8,d29,d8[1]
+ vmlal.u32 q5,d23,d8[1]
+ vmlal.u32 q9,d21,d7[1]
+ vmlal.u32 q6,d25,d8[1]
+ vmlal.u32 q7,d27,d8[1]
+
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 q8,d26,d0[0]
+ vmlal.u32 q5,d20,d0[0]
+ vmlal.u32 q9,d28,d0[0]
+ vmlal.u32 q6,d22,d0[0]
+ vmlal.u32 q7,d24,d0[0]
+ vld1.32 d8[0],[r6,:32]
+
+ vmlal.u32 q8,d24,d1[0]
+ vmlal.u32 q5,d28,d2[0]
+ vmlal.u32 q9,d26,d1[0]
+ vmlal.u32 q6,d20,d1[0]
+ vmlal.u32 q7,d22,d1[0]
+
+ vmlal.u32 q8,d22,d3[0]
+ vmlal.u32 q5,d26,d4[0]
+ vmlal.u32 q9,d24,d3[0]
+ vmlal.u32 q6,d28,d4[0]
+ vmlal.u32 q7,d20,d3[0]
+
+ vmlal.u32 q8,d20,d5[0]
+ vmlal.u32 q5,d24,d6[0]
+ vmlal.u32 q9,d22,d5[0]
+ vmlal.u32 q6,d26,d6[0]
+ vmlal.u32 q8,d28,d8[0]
+
+ vmlal.u32 q7,d28,d6[0]
+ vmlal.u32 q5,d22,d8[0]
+ vmlal.u32 q9,d20,d7[0]
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vmlal.u32 q6,d24,d8[0]
+ vmlal.u32 q7,d26,d8[0]
+
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+#ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+ vrev32.8 q13,q13
+#endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vshl.u32 q13,q13,#18
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vsri.u32 q13,q12,#14
+ vbic.i32 d18,#0xfc000000
+ vshl.u32 q12,q12,#12
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vbic.i32 q13,#0xfc000000
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
+ vsri.u32 q12,q11,#20
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vshl.u32 q11,q11,#6
+ vbic.i32 d14,#0xfc000000
+ vbic.i32 q12,#0xfc000000
+
+ vshrn.u64 d30,q5,#26 @ re-narrow
+ vmovn.i64 d10,q5
+ vsri.u32 q11,q10,#26
+ vbic.i32 q10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vbic.i32 d10,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+ vbic.i32 q11,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add r7,r0,#(48+0*9*4)
+ add r6,r0,#(48+1*9*4)
+ adds r2,r2,#32
+ it ne
+ movne r2,#0
+ bne .Long_tail
+
+ vadd.i32 d25,d24,d14 @ add hash value and move to #hi
+ vadd.i32 d21,d20,d10
+ vadd.i32 d27,d26,d16
+ vadd.i32 d23,d22,d12
+ vadd.i32 d29,d28,d18
+
+.Long_tail:
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
+
+ vadd.i32 d24,d24,d14 @ can be redundant
+ vmull.u32 q7,d25,d0
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0
+
+ vmlal.u32 q5,d29,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d25,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d21,d1
+ vmlal.u32 q9,d27,d1
+ vmlal.u32 q7,d23,d1
+
+ vmlal.u32 q8,d23,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d27,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d25,d3
+ vmlal.u32 q6,d29,d4
+ vmlal.u32 q7,d21,d3
+
+ vmlal.u32 q8,d21,d5
+ it ne
+ addne r7,r0,#(48+2*9*4)
+ vmlal.u32 q5,d25,d6
+ it ne
+ addne r6,r0,#(48+3*9*4)
+ vmlal.u32 q9,d23,d5
+ vmlal.u32 q6,d27,d6
+ vmlal.u32 q7,d29,d6
+
+ vmlal.u32 q8,d29,d8
+ vorn q0,q0,q0 @ all-ones, can be redundant
+ vmlal.u32 q5,d23,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d21,d7
+ vmlal.u32 q6,d25,d8
+ vmlal.u32 q7,d27,d8
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+
+ vmlal.u32 q7,d24,d0
+ vmlal.u32 q5,d20,d0
+ vmlal.u32 q8,d26,d0
+ vmlal.u32 q6,d22,d0
+ vmlal.u32 q9,d28,d0
+
+ vmlal.u32 q5,d28,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d24,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d20,d1
+ vmlal.u32 q9,d26,d1
+ vmlal.u32 q7,d22,d1
+
+ vmlal.u32 q8,d22,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d26,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d24,d3
+ vmlal.u32 q6,d28,d4
+ vmlal.u32 q7,d20,d3
+
+ vmlal.u32 q8,d20,d5
+ vmlal.u32 q5,d24,d6
+ vmlal.u32 q9,d22,d5
+ vmlal.u32 q6,d26,d6
+ vmlal.u32 q7,d28,d6
+
+ vmlal.u32 q8,d28,d8
+ vorn q0,q0,q0 @ all-ones
+ vmlal.u32 q5,d22,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d20,d7
+ vmlal.u32 q6,d24,d8
+ vmlal.u32 q7,d26,d8
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 d16,d16,d17
+ vadd.i64 d10,d10,d11
+ vadd.i64 d18,d18,d19
+ vadd.i64 d12,d12,d13
+ vadd.i64 d14,d14,d15
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 q15,q8,#26
+ vand.i64 q8,q8,q0
+ vshr.u64 q4,q5,#26
+ vand.i64 q5,q5,q0
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+
+ vshr.u64 q15,q9,#26
+ vand.i64 q9,q9,q0
+ vshr.u64 q4,q6,#26
+ vand.i64 q6,q6,q0
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+
+ vadd.i64 q5,q5,q15
+ vshl.u64 q15,q15,#2
+ vshr.u64 q4,q7,#26
+ vand.i64 q7,q7,q0
+ vadd.i64 q5,q5,q15 @ h4 -> h0
+ vadd.i64 q8,q8,q4 @ h2 -> h3
+
+ vshr.u64 q15,q5,#26
+ vand.i64 q5,q5,q0
+ vshr.u64 q4,q8,#26
+ vand.i64 q8,q8,q0
+ vadd.i64 q6,q6,q15 @ h0 -> h1
+ vadd.i64 q9,q9,q4 @ h3 -> h4
+
+ cmp r2,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ vst1.32 {d18[0]},[r0]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+.Lno_data_neon:
+ bx lr @ bx lr
+ENDPROC(poly1305_blocks_neon)
+
+.align 5
+ENTRY(poly1305_emit_neon)
+ ldr ip,[r0,#36] @ is_base2_26
+
+ stmdb sp!,{r4-r11}
+
+ tst ip,ip
+ beq .Lpoly1305_emit_enter
+
+ ldmia r0,{r3-r7}
+ eor r8,r8,r8
+
+ adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
+ mov r4,r4,lsr#6
+ adcs r4,r4,r5,lsl#20
+ mov r5,r5,lsr#12
+ adcs r5,r5,r6,lsl#14
+ mov r6,r6,lsr#18
+ adcs r6,r6,r7,lsl#8
+ adc r7,r8,r7,lsr#24 @ can be partially reduced ...
+
+ and r8,r7,#-4 @ ... so reduce
+ and r7,r6,#3
+ add r8,r8,r8,lsr#2 @ *= 5
+ adds r3,r3,r8
+ adcs r4,r4,#0
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adc r7,r7,#0
+
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r7,r7,#0
+ tst r7,#4 @ did it carry/borrow?
+
+ it ne
+ movne r3,r8
+ ldr r8,[r2,#0]
+ it ne
+ movne r4,r9
+ ldr r9,[r2,#4]
+ it ne
+ movne r5,r10
+ ldr r10,[r2,#8]
+ it ne
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8 @ accumulate nonce
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+#ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+#endif
+ str r3,[r1,#0] @ store the result
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+
+ ldmia sp!,{r4-r11}
+ bx lr @ bx lr
+ENDPROC(poly1305_emit_neon)
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#endif
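An illustrative note, not part of this diff: the NEON loop above keeps the Poly1305 accumulator in five 26-bit limbs and performs the lazy reduction its comments describe, with the carry out of h4 folded back into h0 times 5 because 2^130 is congruent to 5 mod p. A rough portable-C sketch of one such carry pass follows; the uint64_t[5] limb layout and function name are assumptions for illustration only.

    #include <stdint.h>

    /* One lazy-reduction pass over five 26-bit limbs of the Poly1305
     * accumulator, working mod p = 2^130 - 5. The carry out of the top
     * limb re-enters the bottom limb multiplied by 5, since 2^130 = 5 (mod p).
     * The limbs remain only partially reduced, which is enough between
     * block iterations. */
    static void poly1305_lazy_carry(uint64_t h[5])
    {
        uint64_t c;

        c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;     /* h3 -> h4 */
        c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;     /* h0 -> h1 */
        c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5; /* h4 -> h0 */
        c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;     /* h1 -> h2 */
        c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;     /* h2 -> h3 */
        c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;     /* h0 -> h1 */
        c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;     /* h3 -> h4 */
    }

The assembly interleaves these carries with the base 2^32 -> 2^26 conversion of the next input blocks, but the arithmetic per limb is the same.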
diff --git a/src/crypto/zinc/poly1305/poly1305-arm64.S b/src/crypto/zinc/poly1305/poly1305-arm64.S
new file mode 100644
index 0000000..706e664
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-arm64.S
@@ -0,0 +1,822 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from OpenSSL.
+ */
+
+#include <linux/linkage.h>
+.text
+
+.align 5
+ENTRY(poly1305_init_arm)
+ cmp x1,xzr
+ stp xzr,xzr,[x0] // zero hash value
+ stp xzr,xzr,[x0,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+ ldp x7,x8,[x1] // load key
+ mov x9,#0xfffffffc0fffffff
+ movk x9,#0x0fff,lsl#48
+#ifdef __ARMEB__
+ rev x7,x7 // flip bytes
+ rev x8,x8
+#endif
+ and x7,x7,x9 // &=0ffffffc0fffffff
+ and x9,x9,#-4
+ and x8,x8,x9 // &=0ffffffc0ffffffc
+ stp x7,x8,[x0,#32] // save key value
+
+.Lno_key:
+ ret
+ENDPROC(poly1305_init_arm)
+
+.align 5
+ENTRY(poly1305_blocks_arm)
+ ands x2,x2,#-16
+ b.eq .Lno_data
+
+ ldp x4,x5,[x0] // load hash value
+ ldp x7,x8,[x0,#32] // load key value
+ ldr x6,[x0,#16]
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ b .Loop
+
+.align 5
+.Loop:
+ ldp x10,x11,[x1],#16 // load input
+ sub x2,x2,#16
+#ifdef __ARMEB__
+ rev x10,x10
+ rev x11,x11
+#endif
+ adds x4,x4,x10 // accumulate input
+ adcs x5,x5,x11
+
+ mul x12,x4,x7 // h0*r0
+ adc x6,x6,x3
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ cbnz x2,.Loop
+
+ stp x4,x5,[x0] // store hash value
+ str x6,[x0,#16]
+
+.Lno_data:
+ ret
+ENDPROC(poly1305_blocks_arm)
+
+.align 5
+ENTRY(poly1305_emit_arm)
+ ldp x4,x5,[x0] // load hash base 2^64
+ ldr x6,[x0,#16]
+ ldp x10,x11,[x2] // load nonce
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __ARMEB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __ARMEB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+ENDPROC(poly1305_emit_arm)
+
+.align 5
+__poly1305_mult:
+ mul x12,x4,x7 // h0*r0
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ ret
+
+__poly1305_splat:
+ and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,x4,#26,#26
+ extr x14,x5,x4,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,x5,#14,#26
+ extr x16,x6,x5,#40
+
+ str w12,[x0,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[x0,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[x0,#16*2] // s1
+ str w14,[x0,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[x0,#16*4] // s2
+ str w15,[x0,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[x0,#16*6] // s3
+ str w16,[x0,#16*7] // r4
+ str w15,[x0,#16*8] // s4
+
+ ret
+
+.align 5
+ENTRY(poly1305_blocks_neon)
+ ldr x17,[x0,#24]
+ cmp x2,#128
+ b.hs .Lblocks_neon
+ cbz x17,poly1305_blocks_arm
+
+.Lblocks_neon:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ ands x2,x2,#-16
+ b.eq .Lno_data_neon
+
+ cbz x17,.Lbase2_64_neon
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ tst x2,#31
+ b.eq .Leven_neon
+
+ ldp x7,x8,[x0,#32] // load key value
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x14,x6,xzr // can be partially reduced...
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and x10,x14,#-4 // ... so reduce
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x4,x10
+ adcs x5,x5,xzr
+ adc x6,x6,xzr
+
+#ifdef __ARMEB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl __poly1305_mult
+ ldr x30,[sp,#8]
+
+ cbz x3,.Lstore_base2_64_neon
+
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ cbnz x2,.Leven_neon
+
+ stp w10,w11,[x0] // store hash value base 2^26
+ stp w12,w13,[x0,#8]
+ str w14,[x0,#16]
+ b .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+ stp x4,x5,[x0] // store hash value base 2^64
+ stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
+ b .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp x7,x8,[x0,#32] // load key value
+
+ ldp x4,x5,[x0] // load hash value base 2^64
+ ldr x6,[x0,#16]
+
+ tst x2,#31
+ b.eq .Linit_neon
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __ARMEB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl __poly1305_mult
+
+.Linit_neon:
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+ ////////////////////////////////// initialize r^n table
+ mov x4,x7 // r^1
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov x5,x8
+ mov x6,xzr
+ add x0,x0,#48+12
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^2
+ sub x0,x0,#4
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^3
+ sub x0,x0,#4
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^4
+ sub x0,x0,#4
+ bl __poly1305_splat
+ ldr x30,[sp,#8]
+
+ add x16,x1,#32
+ adr x17,.Lzeros
+ subs x2,x2,#64
+ csel x16,x17,x16,lo
+
+ mov x4,#1
+ str x4,[x0,#-24] // set is_base2_26
+ sub x0,x0,#48 // restore original x0
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ add x16,x1,#32
+ adr x17,.Lzeros
+ subs x2,x2,#64
+ csel x16,x17,x16,lo
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+.Ldo_neon:
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ ldp x9,x13,[x16],#48
+
+ lsl x3,x3,#24
+ add x15,x0,#48
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d14,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d15,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov d16,x8
+ fmov d17,x10
+ fmov d18,x12
+
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ ldp x9,x13,[x1],#48
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
+ ld1 {v8.4s},[x15]
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d9,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d10,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi v31.2d,#-1
+ fmov d11,x8
+ fmov d12,x10
+ fmov d13,x12
+ ushr v31.2d,v31.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // ___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // ___________________/ ____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs x2,x2,#64
+ umull v23.2d,v14.2s,v7.s[2]
+ csel x16,x17,x16,lo
+ umull v22.2d,v14.2s,v5.s[2]
+ umull v21.2d,v14.2s,v3.s[2]
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ umull v20.2d,v14.2s,v1.s[2]
+ ldp x9,x13,[x16],#48
+ umull v19.2d,v14.2s,v0.s[2]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal v23.2d,v15.2s,v5.s[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v22.2d,v15.2s,v3.s[2]
+ and x5,x9,#0x03ffffff
+ umlal v21.2d,v15.2s,v1.s[2]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v15.2s,v0.s[2]
+ ubfx x7,x9,#26,#26
+ umlal v19.2d,v15.2s,v8.s[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal v23.2d,v16.2s,v3.s[2]
+ extr x8,x12,x8,#52
+ umlal v22.2d,v16.2s,v1.s[2]
+ extr x9,x13,x9,#52
+ umlal v21.2d,v16.2s,v0.s[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v20.2d,v16.2s,v8.s[2]
+ fmov d14,x4
+ umlal v19.2d,v16.2s,v6.s[2]
+ and x8,x8,#0x03ffffff
+
+ umlal v23.2d,v17.2s,v1.s[2]
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v17.2s,v0.s[2]
+ ubfx x10,x12,#14,#26
+ umlal v21.2d,v17.2s,v8.s[2]
+ ubfx x11,x13,#14,#26
+ umlal v20.2d,v17.2s,v6.s[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v19.2d,v17.2s,v4.s[2]
+ fmov d15,x6
+
+ add v11.2s,v11.2s,v26.2s
+ add x12,x3,x12,lsr#40
+ umlal v23.2d,v18.2s,v0.s[2]
+ add x13,x3,x13,lsr#40
+ umlal v22.2d,v18.2s,v8.s[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v21.2d,v18.2s,v6.s[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v18.2s,v4.s[2]
+ fmov d16,x8
+ umlal v19.2d,v18.2s,v2.s[2]
+ fmov d17,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ fmov d18,x12
+ umlal v22.2d,v11.2s,v1.s[0]
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ umlal v19.2d,v11.2s,v6.s[0]
+ ldp x9,x13,[x1],#48
+ umlal v23.2d,v11.2s,v3.s[0]
+ umlal v20.2d,v11.2s,v8.s[0]
+ umlal v21.2d,v11.2s,v0.s[0]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.s[0]
+ umlal v23.2d,v9.2s,v7.s[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v21.2d,v9.2s,v3.s[0]
+ and x5,x9,#0x03ffffff
+ umlal v19.2d,v9.2s,v0.s[0]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v9.2s,v1.s[0]
+ ubfx x7,x9,#26,#26
+
+ add v12.2s,v12.2s,v27.2s
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal v22.2d,v10.2s,v3.s[0]
+ extr x8,x12,x8,#52
+ umlal v23.2d,v10.2s,v5.s[0]
+ extr x9,x13,x9,#52
+ umlal v19.2d,v10.2s,v8.s[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v21.2d,v10.2s,v1.s[0]
+ fmov d9,x4
+ umlal v20.2d,v10.2s,v0.s[0]
+ and x8,x8,#0x03ffffff
+
+ add v13.2s,v13.2s,v28.2s
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v12.2s,v0.s[0]
+ ubfx x10,x12,#14,#26
+ umlal v19.2d,v12.2s,v4.s[0]
+ ubfx x11,x13,#14,#26
+ umlal v23.2d,v12.2s,v1.s[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v20.2d,v12.2s,v6.s[0]
+ fmov d10,x6
+ umlal v21.2d,v12.2s,v8.s[0]
+ add x12,x3,x12,lsr#40
+
+ umlal v22.2d,v13.2s,v8.s[0]
+ add x13,x3,x13,lsr#40
+ umlal v19.2d,v13.2s,v2.s[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v23.2d,v13.2s,v0.s[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v13.2s,v4.s[0]
+ fmov d11,x8
+ umlal v21.2d,v13.2s,v6.s[0]
+ fmov d12,x10
+ fmov d13,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr v29.2d,v22.2d,#26
+ xtn v27.2s,v22.2d
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ xtn v28.2s,v23.2d
+ ushr v30.2d,v20.2d,#26
+ xtn v25.2s,v20.2d
+ bic v28.2s,#0xfc,lsl#24
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ shrn v30.2s,v21.2d,#26
+ xtn v26.2s,v21.2d
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ bic v25.2s,#0xfc,lsl#24
+ add v27.2s,v27.2s,v30.2s // h2 -> h3
+ bic v26.2s,#0xfc,lsl#24
+
+ shrn v29.2s,v19.2d,#26
+ xtn v24.2s,v19.2d
+ ushr v30.2s,v27.2s,#26
+ bic v27.2s,#0xfc,lsl#24
+ bic v24.2s,#0xfc,lsl#24
+ add v25.2s,v25.2s,v29.2s // h0 -> h1
+ add v28.2s,v28.2s,v30.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup v16.2d,v16.d[0]
+ add v11.2s,v11.2s,v26.2s
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds x2,x2,#32
+ b.ne .Long_tail
+
+ dup v16.2d,v11.d[0]
+ add v14.2s,v9.2s,v24.2s
+ add v17.2s,v12.2s,v27.2s
+ add v15.2s,v10.2s,v25.2s
+ add v18.2s,v13.2s,v28.2s
+
+.Long_tail:
+ dup v14.2d,v14.d[0]
+ umull2 v19.2d,v16.4s,v6.4s
+ umull2 v22.2d,v16.4s,v1.4s
+ umull2 v23.2d,v16.4s,v3.4s
+ umull2 v21.2d,v16.4s,v0.4s
+ umull2 v20.2d,v16.4s,v8.4s
+
+ dup v15.2d,v15.d[0]
+ umlal2 v19.2d,v14.4s,v0.4s
+ umlal2 v21.2d,v14.4s,v3.4s
+ umlal2 v22.2d,v14.4s,v5.4s
+ umlal2 v23.2d,v14.4s,v7.4s
+ umlal2 v20.2d,v14.4s,v1.4s
+
+ dup v17.2d,v17.d[0]
+ umlal2 v19.2d,v15.4s,v8.4s
+ umlal2 v22.2d,v15.4s,v3.4s
+ umlal2 v21.2d,v15.4s,v1.4s
+ umlal2 v23.2d,v15.4s,v5.4s
+ umlal2 v20.2d,v15.4s,v0.4s
+
+ dup v18.2d,v18.d[0]
+ umlal2 v22.2d,v17.4s,v0.4s
+ umlal2 v23.2d,v17.4s,v1.4s
+ umlal2 v19.2d,v17.4s,v4.4s
+ umlal2 v20.2d,v17.4s,v6.4s
+ umlal2 v21.2d,v17.4s,v8.4s
+
+ umlal2 v22.2d,v18.4s,v8.4s
+ umlal2 v19.2d,v18.4s,v2.4s
+ umlal2 v23.2d,v18.4s,v0.4s
+ umlal2 v20.2d,v18.4s,v4.4s
+ umlal2 v21.2d,v18.4s,v6.4s
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ umlal v22.2d,v11.2s,v1.2s
+ umlal v19.2d,v11.2s,v6.2s
+ umlal v23.2d,v11.2s,v3.2s
+ umlal v20.2d,v11.2s,v8.2s
+ umlal v21.2d,v11.2s,v0.2s
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.2s
+ umlal v19.2d,v9.2s,v0.2s
+ umlal v23.2d,v9.2s,v7.2s
+ umlal v20.2d,v9.2s,v1.2s
+ umlal v21.2d,v9.2s,v3.2s
+
+ add v12.2s,v12.2s,v27.2s
+ umlal v22.2d,v10.2s,v3.2s
+ umlal v19.2d,v10.2s,v8.2s
+ umlal v23.2d,v10.2s,v5.2s
+ umlal v20.2d,v10.2s,v0.2s
+ umlal v21.2d,v10.2s,v1.2s
+
+ add v13.2s,v13.2s,v28.2s
+ umlal v22.2d,v12.2s,v0.2s
+ umlal v19.2d,v12.2s,v4.2s
+ umlal v23.2d,v12.2s,v1.2s
+ umlal v20.2d,v12.2s,v6.2s
+ umlal v21.2d,v12.2s,v8.2s
+
+ umlal v22.2d,v13.2s,v8.2s
+ umlal v19.2d,v13.2s,v2.2s
+ umlal v23.2d,v13.2s,v0.2s
+ umlal v20.2d,v13.2s,v4.2s
+ umlal v21.2d,v13.2s,v6.2s
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp v22.2d,v22.2d,v22.2d
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp v19.2d,v19.2d,v19.2d
+ ldp d10,d11,[sp,#32]
+ addp v23.2d,v23.2d,v23.2d
+ ldp d12,d13,[sp,#48]
+ addp v20.2d,v20.2d,v20.2d
+ ldp d14,d15,[sp,#64]
+ addp v21.2d,v21.2d,v21.2d
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr v29.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ and v23.16b,v23.16b,v31.16b
+ ushr v30.2d,v20.2d,#26
+ and v20.16b,v20.16b,v31.16b
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ ushr v30.2d,v21.2d,#26
+ and v21.16b,v21.16b,v31.16b
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ add v22.2d,v22.2d,v30.2d // h2 -> h3
+
+ ushr v29.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ ushr v30.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ add v20.2d,v20.2d,v29.2d // h0 -> h1
+ add v23.2d,v23.2d,v30.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
+ st1 {v23.s}[0],[x0]
+
+.Lno_data_neon:
+ ldr x29,[sp],#80
+ ret
+ENDPROC(poly1305_blocks_neon)
+
+.align 5
+ENTRY(poly1305_emit_neon)
+ ldr x17,[x0,#24]
+ cbz x17,poly1305_emit_arm
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x6,x6,xzr // can be partially reduced...
+
+ ldp x10,x11,[x2] // load nonce
+
+ and x12,x6,#-4 // ... so reduce
+ add x12,x12,x6,lsr#2
+ and x6,x6,#3
+ adds x4,x4,x12
+ adcs x5,x5,xzr
+ adc x6,x6,xzr
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __ARMEB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __ARMEB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+ENDPROC(poly1305_emit_neon)
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
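A side note, not from the commit itself: the Horner-scheme comments in the NEON loops above (both this AArch64 version and the ARMv7 one earlier) rest on splitting the Poly1305 polynomial into two interleaved chains. For four blocks m1..m4,

    m1*r^4 + m2*r^3 + m3*r^2 + m4*r
      = (m1*r^2 + m3)*r^2 + (m2*r^2 + m4)*r      (mod 2^130 - 5)

so the blocks split into two chains that can be accumulated independently (the loop above does this four blocks at a time, stepping the running sums by r^4 and folding in new blocks weighted r^2 and 1), the tail multiplies the two partial sums by r^2 and r respectively (the "r^2:r^1" path in .Lskip_loop/.Long_tail), and the horizontal add in .Lshort_tail combines them into a single accumulator.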
diff --git a/src/crypto/zinc/poly1305/poly1305-mips-glue.h b/src/crypto/zinc/poly1305/poly1305-mips-glue.h
new file mode 100644
index 0000000..e29f859
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-mips-glue.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+
+asmlinkage void poly1305_init_mips(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_mips(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_mips(void *ctx, u8 mac[16], const u32 nonce[4]);
+void __init poly1305_fpu_init(void)
+{
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ poly1305_init_mips(ctx, key);
+ return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+ poly1305_blocks_mips(ctx, inp, len, padbit);
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+ poly1305_emit_mips(ctx, mac, nonce);
+ return true;
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
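For orientation only (this sketch is not part of the commit): the three *_arch() hooks above are the whole MIPS glue surface. A generic Poly1305 driver is expected to try them first and fall back to portable C when they return false; here they always return true, since the MIPS routines need no SIMD state. A hypothetical caller could look like the sketch below, where the *_generic() names are illustrative placeholders rather than zinc API.

    /* Assumed portable fallbacks, declared for illustration only. */
    void poly1305_init_generic(void *ctx, const u8 key[POLY1305_KEY_SIZE]);
    void poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len, u32 padbit);
    void poly1305_emit_generic(void *ctx, u8 mac[POLY1305_MAC_SIZE], const u32 nonce[4]);

    /* One-shot MAC over a whole number of 16-byte blocks (padbit = 1). */
    static void poly1305_oneshot_example(void *ctx, const u8 *msg, size_t len,
                                         const u8 key[POLY1305_KEY_SIZE],
                                         const u32 nonce[4], u8 mac[POLY1305_MAC_SIZE],
                                         simd_context_t simd_context)
    {
        if (!poly1305_init_arch(ctx, key, simd_context))
            poly1305_init_generic(ctx, key);
        if (!poly1305_blocks_arch(ctx, msg, len, 1, simd_context))
            poly1305_blocks_generic(ctx, msg, len, 1);
        if (!poly1305_emit_arch(ctx, mac, nonce, simd_context))
            poly1305_emit_generic(ctx, mac, nonce);
    }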
diff --git a/src/crypto/zinc/poly1305/poly1305-mips.S b/src/crypto/zinc/poly1305/poly1305-mips.S
new file mode 100644
index 0000000..32d8558
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-mips.S
@@ -0,0 +1,417 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com> All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#else
+#define MSB 3
+#define LSB 0
+#endif
+
+#define POLY1305_BLOCK_SIZE 16
+.text
+#define H0 $t0
+#define H1 $t1
+#define H2 $t2
+#define H3 $t3
+#define H4 $t4
+
+#define R0 $t5
+#define R1 $t6
+#define R2 $t7
+#define R3 $t8
+
+#define O0 $s0
+#define O1 $s4
+#define O2 $v1
+#define O3 $t9
+#define O4 $s5
+
+#define S1 $s1
+#define S2 $s2
+#define S3 $s3
+
+#define SC $at
+#define CA $v0
+
+/* Input arguments */
+#define poly $a0
+#define src $a1
+#define srclen $a2
+#define hibit $a3
+
+/* Location in the opaque buffer
+ * R[0..3], CA, H[0..4]
+ */
+#define PTR_POLY1305_R(n) ( 0 + (n*4)) ## ($a0)
+#define PTR_POLY1305_CA (16 ) ## ($a0)
+#define PTR_POLY1305_H(n) (20 + (n*4)) ## ($a0)
+
+#define POLY1305_BLOCK_SIZE 16
+#define POLY1305_STACK_SIZE 8 * 4
+
+.set reorder
+.set noat
+.align 4
+.globl poly1305_blocks_mips
+.ent poly1305_blocks_mips
+poly1305_blocks_mips:
+ .frame $sp,POLY1305_STACK_SIZE,$31
+ /* srclen &= 0xFFFFFFF0 */
+ ins srclen, $zero, 0, 4
+
+ .set noreorder
+ /* check srclen >= 16 bytes */
+ beqz srclen, .Lpoly1305_blocks_mips_end
+ addiu $sp, -(POLY1305_STACK_SIZE)
+ .set reorder
+
+ /* Calculate last round based on src address pointer.
+ * last round src ptr (srclen) = src + (srclen & 0xFFFFFFF0)
+ */
+ addu srclen, src
+
+ lw R0, PTR_POLY1305_R(0)
+ lw R1, PTR_POLY1305_R(1)
+ lw R2, PTR_POLY1305_R(2)
+ lw R3, PTR_POLY1305_R(3)
+
+ /* store the used save registers. */
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+
+ /* load Hx and Carry */
+ lw CA, PTR_POLY1305_CA
+ lw H0, PTR_POLY1305_H(0)
+ lw H1, PTR_POLY1305_H(1)
+ lw H2, PTR_POLY1305_H(2)
+ lw H3, PTR_POLY1305_H(3)
+ lw H4, PTR_POLY1305_H(4)
+
+ /* Sx = Rx + (Rx >> 2) */
+ srl S1, R1, 2
+ srl S2, R2, 2
+ srl S3, R3, 2
+ addu S1, R1
+ addu S2, R2
+ addu S3, R3
+
+ addiu SC, $zero, 1
+
+.Lpoly1305_loop:
+ lwl O0, 0+MSB(src)
+ lwl O1, 4+MSB(src)
+ lwl O2, 8+MSB(src)
+ lwl O3,12+MSB(src)
+ lwr O0, 0+LSB(src)
+ lwr O1, 4+LSB(src)
+ lwr O2, 8+LSB(src)
+ lwr O3,12+LSB(src)
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ wsbh O0
+ wsbh O1
+ wsbh O2
+ wsbh O3
+ rotr O0, 16
+ rotr O1, 16
+ rotr O2, 16
+ rotr O3, 16
+#endif
+
+ /* h0 = (u32)(d0 = (u64)h0 + inp[0] + c 'Carry_previous cycle'); */
+ addu H0, CA
+ sltu CA, H0, CA
+ addu O0, H0
+ sltu H0, O0, H0
+ addu CA, H0
+
+ /* h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + inp[4]); */
+ addu H1, CA
+ sltu CA, H1, CA
+ addu O1, H1
+ sltu H1, O1, H1
+ addu CA, H1
+
+ /* h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + inp[8]); */
+ addu H2, CA
+ sltu CA, H2, CA
+ addu O2, H2
+ sltu H2, O2, H2
+ addu CA, H2
+
+ /* h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + inp[12]); */
+ addu H3, CA
+ sltu CA, H3, CA
+ addu O3, H3
+ sltu H3, O3, H3
+ addu CA, H3
+
+ /* h4 += (u32)(d3 >> 32) + padbit; */
+ addu H4, hibit
+ addu O4, H4, CA
+
+ /* D0 */
+ multu O0, R0
+ maddu O1, S3
+ maddu O2, S2
+ maddu O3, S1
+ mfhi CA
+ mflo H0
+
+ /* D1 */
+ multu O0, R1
+ maddu O1, R0
+ maddu O2, S3
+ maddu O3, S2
+ maddu O4, S1
+ maddu CA, SC
+ mfhi CA
+ mflo H1
+
+ /* D2 */
+ multu O0, R2
+ maddu O1, R1
+ maddu O2, R0
+ maddu O3, S3
+ maddu O4, S2
+ maddu CA, SC
+ mfhi CA
+ mflo H2
+
+ /* D4 */
+ mul H4, O4, R0
+
+ /* D3 */
+ multu O0, R3
+ maddu O1, R2
+ maddu O2, R1
+ maddu O3, R0
+ maddu O4, S3
+ maddu CA, SC
+ mfhi CA
+ mflo H3
+
+ addiu src, POLY1305_BLOCK_SIZE
+
+ /* h4 += (u32)(d3 >> 32); */
+ addu O4, H4, CA
+ /* h4 &= 3 */
+ andi H4, O4, 3
+ /* c = (h4 >> 2) + (h4 & ~3U); */
+ srl CA, O4, 2
+ ins O4, $zero, 0, 2
+
+ /* able to do a 16 byte block. */
+ .set noreorder
+ bne src, srclen, .Lpoly1305_loop
+ /* Delay slot is always executed. */
+ addu CA, O4
+ .set reorder
+
+ /* restore the used save registers. */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+
+ /* store Hx and Carry */
+ sw CA, PTR_POLY1305_CA
+ sw H0, PTR_POLY1305_H(0)
+ sw H1, PTR_POLY1305_H(1)
+ sw H2, PTR_POLY1305_H(2)
+ sw H3, PTR_POLY1305_H(3)
+ sw H4, PTR_POLY1305_H(4)
+
+.Lpoly1305_blocks_mips_end:
+ /* Jump Back */
+ .set noreorder
+ jr $ra
+ addiu $sp, POLY1305_STACK_SIZE
+ .set reorder
+.end poly1305_blocks_mips
+.set at
+.set reorder
+
+/* Input arguments CTX=$a0, MAC=$a1, NONCE=$a2 */
+#define MAC $a1
+#define NONCE $a2
+
+#define G0 $t5
+#define G1 $t6
+#define G2 $t7
+#define G3 $t8
+#define G4 $t9
+
+.set reorder
+.set noat
+.align 4
+.globl poly1305_emit_mips
+.ent poly1305_emit_mips
+poly1305_emit_mips:
+ /* load Hx and Carry */
+ lw CA, PTR_POLY1305_CA
+ lw H0, PTR_POLY1305_H(0)
+ lw H1, PTR_POLY1305_H(1)
+ lw H2, PTR_POLY1305_H(2)
+ lw H3, PTR_POLY1305_H(3)
+ lw H4, PTR_POLY1305_H(4)
+
+ /* Add left over carry */
+ addu H0, CA
+ sltu CA, H0, CA
+ addu H1, CA
+ sltu CA, H1, CA
+ addu H2, CA
+ sltu CA, H2, CA
+ addu H3, CA
+ sltu CA, H3, CA
+ addu H4, CA
+
+ /* compare to modulus by computing h + -p */
+ addiu G0, H0, 5
+ sltu CA, G0, H0
+ addu G1, H1, CA
+ sltu CA, G1, H1
+ addu G2, H2, CA
+ sltu CA, G2, H2
+ addu G3, H3, CA
+ sltu CA, G3, H3
+ addu G4, H4, CA
+
+ srl SC, G4, 2
+
+ /* if there was carry into 131st bit, h3:h0 = g3:g0 */
+ movn H0, G0, SC
+ movn H1, G1, SC
+ movn H2, G2, SC
+ movn H3, G3, SC
+
+ lwl G0, 0+MSB(NONCE)
+ lwl G1, 4+MSB(NONCE)
+ lwl G2, 8+MSB(NONCE)
+ lwl G3,12+MSB(NONCE)
+ lwr G0, 0+LSB(NONCE)
+ lwr G1, 4+LSB(NONCE)
+ lwr G2, 8+LSB(NONCE)
+ lwr G3,12+LSB(NONCE)
+
+ /* mac = (h + nonce) % (2^128) */
+ addu H0, G0
+ sltu CA, H0, G0
+
+ /* H1 */
+ addu H1, CA
+ sltu CA, H1, CA
+ addu H1, G1
+ sltu G1, H1, G1
+ addu CA, G1
+
+ /* H2 */
+ addu H2, CA
+ sltu CA, H2, CA
+ addu H2, G2
+ sltu G2, H2, G2
+ addu CA, G2
+
+ /* H3 */
+ addu H3, CA
+ addu H3, G3
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ wsbh H0
+ wsbh H1
+ wsbh H2
+ wsbh H3
+ rotr H0, 16
+ rotr H1, 16
+ rotr H2, 16
+ rotr H3, 16
+#endif
+
+ /* store MAC */
+ swl H0, 0+MSB(MAC)
+ swl H1, 4+MSB(MAC)
+ swl H2, 8+MSB(MAC)
+ swl H3,12+MSB(MAC)
+ swr H0, 0+LSB(MAC)
+ swr H1, 4+LSB(MAC)
+ swr H2, 8+LSB(MAC)
+ .set noreorder
+ jr $ra
+ swr H3,12+LSB(MAC)
+ .set reorder
+.end poly1305_emit_mips
+
+#define PR0 $t0
+#define PR1 $t1
+#define PR2 $t2
+#define PR3 $t3
+#define PT0 $t4
+
+/* Input arguments CTX=$a0, KEY=$a1 */
+
+.align 4
+.globl poly1305_init_mips
+.ent poly1305_init_mips
+poly1305_init_mips:
+ lwl PR0, 0+MSB($a1)
+ lwl PR1, 4+MSB($a1)
+ lwl PR2, 8+MSB($a1)
+ lwl PR3,12+MSB($a1)
+ lwr PR0, 0+LSB($a1)
+ lwr PR1, 4+LSB($a1)
+ lwr PR2, 8+LSB($a1)
+ lwr PR3,12+LSB($a1)
+
+ /* store Hx and Carry */
+ sw $zero, PTR_POLY1305_CA
+ sw $zero, PTR_POLY1305_H(0)
+ sw $zero, PTR_POLY1305_H(1)
+ sw $zero, PTR_POLY1305_H(2)
+ sw $zero, PTR_POLY1305_H(3)
+ sw $zero, PTR_POLY1305_H(4)
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ wsbh PR0
+ wsbh PR1
+ wsbh PR2
+ wsbh PR3
+ rotr PR0, 16
+ rotr PR1, 16
+ rotr PR2, 16
+ rotr PR3, 16
+#endif
+
+ lui PT0, 0x0FFF
+ ori PT0, 0xFFFC
+
+ /* AND 0x0fffffff; */
+ ext PR0, PR0, 0, (32-4)
+
+ /* AND 0x0ffffffc; */
+ and PR1, PT0
+ and PR2, PT0
+ and PR3, PT0
+
+ /* store Rx */
+ sw PR0, PTR_POLY1305_R(0)
+ sw PR1, PTR_POLY1305_R(1)
+ sw PR2, PTR_POLY1305_R(2)
+
+ .set noreorder
+ /* Jump Back */
+ jr $ra
+ sw PR3, PTR_POLY1305_R(3)
+ .set reorder
+.end poly1305_init_mips
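A brief note, not part of the file: the "Sx = Rx + (Rx >> 2)" precomputation above (and the analogous "s1 = r1 + (r1 >> 2)" in the 64-bit implementations) works because key clamping clears the low two bits of r1..r3 and because 2^130 is congruent to 5 mod 2^130 - 5. Any product term that lands a whole 128 bits above the accumulator can therefore be folded back down:

    r_j * 2^128 = (r_j / 4) * 2^130
                ≡ (r_j / 4) * 5              (mod 2^130 - 5)
                = r_j + (r_j >> 2) = s_j

so, for example, the h1*r3 term at weight 2^128 in the D0 column above is computed instead as h1*s3 at weight 2^0, which is exactly the "maddu O1, S3" line.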
diff --git a/src/crypto/zinc/poly1305/poly1305-mips64.S b/src/crypto/zinc/poly1305/poly1305-mips64.S
new file mode 100644
index 0000000..1a45fbe
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-mips64.S
@@ -0,0 +1,357 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#if !defined(CONFIG_64BIT)
+#error "This is only for 64-bit kernels."
+#endif
+
+#ifdef __MIPSEB__
+#define MSB 0
+#define LSB 7
+#else
+#define MSB 7
+#define LSB 0
+#endif
+
+#if defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6)
+#define dmultu(rs,rt)
+#define mflo(rd,rs,rt) dmulu rd,rs,rt
+#define mfhi(rd,rs,rt) dmuhu rd,rs,rt
+#else
+#define dmultu(rs,rt) dmultu rs,rt
+#define multu(rs,rt) multu rs,rt
+#define mflo(rd,rs,rt) mflo rd
+#define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+.text
+.set noat
+.set noreorder
+
+/* While most of the assembly in the kernel prefers ENTRY() and ENDPROC(),
+ * there is no existing MIPS assembly that uses it, and MIPS assembler seems
+ * to like its own .ent/.end notation, which the MIPS include files don't
+ * provide in a MIPS-specific ENTRY/ENDPROC definition. So, we skip these
+ * for now, until somebody complains. */
+
+.align 5
+.globl poly1305_init_mips
+.ent poly1305_init_mips
+poly1305_init_mips:
+ .frame $29,0,$31
+ .set reorder
+
+ sd $0,0($4)
+ sd $0,8($4)
+ sd $0,16($4)
+
+ beqz $5,.Lno_key
+
+#if defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6)
+ ld $8,0($5)
+ ld $9,8($5)
+#else
+ ldl $8,0+MSB($5)
+ ldl $9,8+MSB($5)
+ ldr $8,0+LSB($5)
+ ldr $9,8+LSB($5)
+#endif
+#ifdef __MIPSEB__
+#if defined(CONFIG_CPU_MIPS64_R2) || defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6)
+ dsbh $8,$8 # byte swap
+ dsbh $9,$9
+ dshd $8,$8
+ dshd $9,$9
+#else
+ ori $10,$0,0xFF
+ dsll $1,$10,32
+ or $10,$1 # 0x000000FF000000FF
+
+ and $11,$8,$10 # byte swap
+ and $2,$9,$10
+ dsrl $1,$8,24
+ dsrl $24,$9,24
+ dsll $11,24
+ dsll $2,24
+ and $1,$10
+ and $24,$10
+ dsll $10,8 # 0x0000FF000000FF00
+ or $11,$1
+ or $2,$24
+ and $1,$8,$10
+ and $24,$9,$10
+ dsrl $8,8
+ dsrl $9,8
+ dsll $1,8
+ dsll $24,8
+ and $8,$10
+ and $9,$10
+ or $11,$1
+ or $2,$24
+ or $8,$11
+ or $9,$2
+ dsrl $11,$8,32
+ dsrl $2,$9,32
+ dsll $8,32
+ dsll $9,32
+ or $8,$11
+ or $9,$2
+#endif
+#endif
+ li $10,1
+ dsll $10,32
+ daddiu $10,-63
+ dsll $10,28
+ daddiu $10,-1 # 0ffffffc0fffffff
+
+ and $8,$10
+ daddiu $10,-3 # 0ffffffc0ffffffc
+ and $9,$10
+
+ sd $8,24($4)
+ dsrl $10,$9,2
+ sd $9,32($4)
+ daddu $10,$9 # s1 = r1 + (r1 >> 2)
+ sd $10,40($4)
+
+.Lno_key:
+ li $2,0 # return 0
+ jr $31
+.end poly1305_init_mips
+
+.align 5
+.globl poly1305_blocks_mips
+.ent poly1305_blocks_mips
+poly1305_blocks_mips:
+ .set noreorder
+ dsrl $6,4 # number of complete blocks
+ bnez $6,poly1305_blocks_internal
+ nop
+ jr $31
+ nop
+.end poly1305_blocks_mips
+
+.align 5
+.ent poly1305_blocks_internal
+poly1305_blocks_internal:
+ .frame $29,6*8,$31
+ .mask 0x00030000,-8
+ .set noreorder
+ dsubu $29,6*8
+ sd $17,40($29)
+ sd $16,32($29)
+ .set reorder
+
+ ld $12,0($4) # load hash value
+ ld $13,8($4)
+ ld $14,16($4)
+
+ ld $15,24($4) # load key
+ ld $16,32($4)
+ ld $17,40($4)
+
+.Loop:
+#if defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6)
+ ld $8,0($5) # load input
+ ld $9,8($5)
+#else
+ ldl $8,0+MSB($5) # load input
+ ldl $9,8+MSB($5)
+ ldr $8,0+LSB($5)
+ ldr $9,8+LSB($5)
+#endif
+ daddiu $6,-1
+ daddiu $5,16
+#ifdef __MIPSEB__
+#if defined(CONFIG_CPU_MIPS64_R2) || defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6)
+ dsbh $8,$8 # byte swap
+ dsbh $9,$9
+ dshd $8,$8
+ dshd $9,$9
+#else
+ ori $10,$0,0xFF
+ dsll $1,$10,32
+ or $10,$1 # 0x000000FF000000FF
+
+ and $11,$8,$10 # byte swap
+ and $2,$9,$10
+ dsrl $1,$8,24
+ dsrl $24,$9,24
+ dsll $11,24
+ dsll $2,24
+ and $1,$10
+ and $24,$10
+ dsll $10,8 # 0x0000FF000000FF00
+ or $11,$1
+ or $2,$24
+ and $1,$8,$10
+ and $24,$9,$10
+ dsrl $8,8
+ dsrl $9,8
+ dsll $1,8
+ dsll $24,8
+ and $8,$10
+ and $9,$10
+ or $11,$1
+ or $2,$24
+ or $8,$11
+ or $9,$2
+ dsrl $11,$8,32
+ dsrl $2,$9,32
+ dsll $8,32
+ dsll $9,32
+ or $8,$11
+ or $9,$2
+#endif
+#endif
+ daddu $12,$8 # accumulate input
+ daddu $13,$9
+ sltu $10,$12,$8
+ sltu $11,$13,$9
+ daddu $13,$10
+
+ dmultu ($15,$12) # h0*r0
+ daddu $14,$7
+ sltu $10,$13,$10
+ mflo ($8,$15,$12)
+ mfhi ($9,$15,$12)
+
+ dmultu ($17,$13) # h1*5*r1
+ daddu $10,$11
+ daddu $14,$10
+ mflo ($10,$17,$13)
+ mfhi ($11,$17,$13)
+
+ dmultu ($16,$12) # h0*r1
+ daddu $8,$10
+ daddu $9,$11
+ mflo ($1,$16,$12)
+ mfhi ($25,$16,$12)
+ sltu $10,$8,$10
+ daddu $9,$10
+
+ dmultu ($15,$13) # h1*r0
+ daddu $9,$1
+ sltu $1,$9,$1
+ mflo ($10,$15,$13)
+ mfhi ($11,$15,$13)
+ daddu $25,$1
+
+ dmultu ($17,$14) # h2*5*r1
+ daddu $9,$10
+ daddu $25,$11
+ mflo ($1,$17,$14)
+
+ dmultu ($15,$14) # h2*r0
+ sltu $10,$9,$10
+ daddu $25,$10
+ mflo ($2,$15,$14)
+
+ daddu $9,$1
+ daddu $25,$2
+ sltu $1,$9,$1
+ daddu $25,$1
+
+ li $10,-4 # final reduction
+ and $10,$25
+ dsrl $11,$25,2
+ andi $14,$25,3
+ daddu $10,$11
+ daddu $12,$8,$10
+ sltu $10,$12,$10
+ daddu $13,$9,$10
+ sltu $10,$13,$10
+ daddu $14,$14,$10
+
+ bnez $6,.Loop
+
+ sd $12,0($4) # store hash value
+ sd $13,8($4)
+ sd $14,16($4)
+
+ .set noreorder
+ ld $17,40($29) # epilogue
+ ld $16,32($29)
+ jr $31
+ daddu $29,6*8
+.end poly1305_blocks_internal
+
+.align 5
+.globl poly1305_emit_mips
+.ent poly1305_emit_mips
+poly1305_emit_mips:
+ .frame $29,0,$31
+ .set reorder
+
+ ld $10,0($4)
+ ld $11,8($4)
+ ld $1,16($4)
+
+ daddiu $8,$10,5 # compare to modulus
+ sltiu $2,$8,5
+ daddu $9,$11,$2
+ sltu $2,$9,$2
+ daddu $1,$1,$2
+
+ dsrl $1,2 # see if it carried/borrowed
+ dsubu $1,$0,$1
+ nor $2,$0,$1
+
+ and $8,$1
+ and $10,$2
+ and $9,$1
+ and $11,$2
+ or $8,$10
+ or $9,$11
+
+ lwu $10,0($6) # load nonce
+ lwu $11,4($6)
+ lwu $1,8($6)
+ lwu $2,12($6)
+ dsll $11,32
+ dsll $2,32
+ or $10,$11
+ or $1,$2
+
+ daddu $8,$10 # accumulate nonce
+ daddu $9,$1
+ sltu $10,$8,$10
+ daddu $9,$10
+
+ dsrl $10,$8,8 # write mac value
+ dsrl $11,$8,16
+ dsrl $1,$8,24
+ sb $8,0($5)
+ dsrl $2,$8,32
+ sb $10,1($5)
+ dsrl $10,$8,40
+ sb $11,2($5)
+ dsrl $11,$8,48
+ sb $1,3($5)
+ dsrl $1,$8,56
+ sb $2,4($5)
+ dsrl $2,$9,8
+ sb $10,5($5)
+ dsrl $10,$9,16
+ sb $11,6($5)
+ dsrl $11,$9,24
+ sb $1,7($5)
+
+ sb $9,8($5)
+ dsrl $1,$9,32
+ sb $2,9($5)
+ dsrl $2,$9,40
+ sb $10,10($5)
+ dsrl $10,$9,48
+ sb $11,11($5)
+ dsrl $11,$9,56
+ sb $1,12($5)
+ sb $2,13($5)
+ sb $10,14($5)
+ sb $11,15($5)
+
+ jr $31
+.end poly1305_emit_mips
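As an illustration only, not part of the commit: the "compare to modulus" sequence in the emit routines (add 5, inspect the carry into bit 130, conditionally keep the incremented value, then add the nonce) can be sketched in portable C roughly as below. The function name and three-limb layout are assumptions, unsigned __int128 stands in for the assembly's add-with-carry chains, and h is assumed to have been carried down so that only a couple of bits above 2^128 remain.

    #include <stdint.h>

    /* Final Poly1305 reduction and tag: if h >= p = 2^130 - 5, then h + 5
     * carries into bit 130 and its low 128 bits equal h - p, so selecting
     * g = h + 5 on that carry performs the conditional subtraction.
     * The tag is then (h + nonce) mod 2^128. */
    static void poly1305_tag_sketch(const uint64_t h[3], const uint64_t nonce[2],
                                    uint64_t mac[2])
    {
        unsigned __int128 t;
        uint64_t h0 = h[0], h1 = h[1], g0, g1, g2;

        t  = (unsigned __int128)h0 + 5;
        g0 = (uint64_t)t;
        t  = (unsigned __int128)h1 + (uint64_t)(t >> 64);
        g1 = (uint64_t)t;
        g2 = h[2] + (uint64_t)(t >> 64);

        if (g2 & ~(uint64_t)3) {            /* bit 130 (or above) set: h >= p */
            h0 = g0;
            h1 = g1;
        }

        t      = (unsigned __int128)h0 + nonce[0];
        mac[0] = (uint64_t)t;
        mac[1] = h1 + nonce[1] + (uint64_t)(t >> 64);   /* high half, mod 2^64 */
    }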
diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64-glue.h b/src/crypto/zinc/poly1305/poly1305-x86_64-glue.h
new file mode 100644
index 0000000..e24bee7
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-x86_64-glue.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+asmlinkage void poly1305_init_x86_64(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4]);
+#ifdef CONFIG_AS_AVX
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+#endif
+
+static bool poly1305_use_avx __ro_after_init;
+static bool poly1305_use_avx2 __ro_after_init;
+static bool poly1305_use_avx512 __ro_after_init;
+
+void __init poly1305_fpu_init(void)
+{
+ poly1305_use_avx =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ poly1305_use_avx2 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+ poly1305_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+#endif
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ poly1305_init_x86_64(ctx, key);
+ return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+#ifdef CONFIG_AS_AVX512
+ if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD)
+ poly1305_blocks_avx512(ctx, inp, len, padbit);
+ else
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD)
+ poly1305_blocks_avx2(ctx, inp, len, padbit);
+ else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD)
+ poly1305_blocks_avx(ctx, inp, len, padbit);
+ else
+#endif
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+#ifdef CONFIG_AS_AVX512
+ if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD)
+ poly1305_emit_avx(ctx, mac, nonce);
+ else
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD)
+ poly1305_emit_avx(ctx, mac, nonce);
+ else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD)
+ poly1305_emit_avx(ctx, mac, nonce);
+ else
+#endif
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ return true;
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64.S b/src/crypto/zinc/poly1305/poly1305-x86_64.S
new file mode 100644
index 0000000..4d51f40
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305-x86_64.S
@@ -0,0 +1,2792 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from OpenSSL.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst192.Lconst, "aM", @progbits, 192
+.align 64
+.Lconst:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.long 2,2,2,3,2,0,2,1
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.text
+
+.align 32
+ENTRY(poly1305_init_x86_64)
+ xorq %rax,%rax
+ movq %rax,0(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+
+ cmpq $0,%rsi
+ je .Lno_key
+
+ movq $0x0ffffffc0fffffff,%rax
+ movq $0x0ffffffc0ffffffc,%rcx
+ andq 0(%rsi),%rax
+ andq 8(%rsi),%rcx
+ movq %rax,24(%rdi)
+ movq %rcx,32(%rdi)
+ movl $1,%eax
+.Lno_key:
+ ret
+ENDPROC(poly1305_init_x86_64)
+
+.align 32
+ENTRY(poly1305_blocks_x86_64)
+.Lblocks:
+ shrq $4,%rdx
+ jz .Lno_data
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%r10
+
+ movq %r13,%r12
+ shrq $2,%r13
+ movq %r12,%rax
+ addq %r12,%r13
+ jmp .Loop
+
+.align 32
+.Loop:
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ mulq %r14
+ movq %rax,%r9
+ movq %r11,%rax
+ movq %rdx,%rdi
+
+ mulq %r14
+ movq %rax,%r14
+ movq %r11,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq %r13,%rax
+ adcq %rdx,%rdi
+
+ mulq %rbx
+ movq %r10,%rbx
+ addq %rax,%r14
+ adcq %rdx,%r8
+
+ imulq %r13,%rbx
+ addq %rbx,%r9
+ movq %r8,%rbx
+ adcq $0,%rdi
+
+ imulq %r11,%r10
+ addq %r9,%rbx
+ movq $-4,%rax
+ adcq %r10,%rdi
+
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
+ addq %rax,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r12,%rax
+ decq %r15
+ jnz .Loop
+
+ movq 0(%rsp),%rdi
+
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+.Lno_data:
+.Lblocks_epilogue:
+ ret
+ENDPROC(poly1305_blocks_x86_64)
+
+.align 32
+ENTRY(poly1305_emit_x86_64)
+.Lemit:
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+
+ movq %r8,%rax
+ addq $5,%r8
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10
+ cmovnzq %r8,%rax
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi)
+ movq %rcx,8(%rsi)
+
+ ret
+ENDPROC(poly1305_emit_x86_64)
+
+.macro __poly1305_block
+ mulq %r14
+ movq %rax,%r9
+ movq %r11,%rax
+ movq %rdx,%rdi
+
+ mulq %r14
+ movq %rax,%r14
+ movq %r11,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq %r13,%rax
+ adcq %rdx,%rdi
+
+ mulq %rbx
+ movq %r10,%rbx
+ addq %rax,%r14
+ adcq %rdx,%r8
+
+ imulq %r13,%rbx
+ addq %rbx,%r9
+ movq %r8,%rbx
+ adcq $0,%rdi
+
+ imulq %r11,%r10
+ addq %r9,%rbx
+ movq $-4,%rax
+ adcq %r10,%rdi
+
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
+ addq %rax,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+.endm
+
+.macro __poly1305_init_avx
+ movq %r11,%r14
+ movq %r12,%rbx
+ xorq %r10,%r10
+
+ leaq 48+64(%rdi),%rdi
+
+ movq %r12,%rax
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ movq %r14,%r8
+ andl %r14d,%eax
+ movq %r11,%r9
+ andl %r11d,%edx
+ movl %eax,-64(%rdi)
+ shrq $26,%r8
+ movl %edx,-60(%rdi)
+ shrq $26,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,-48(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,-44(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,-32(%rdi)
+ shrq $26,%r8
+ movl %edx,-28(%rdi)
+ shrq $26,%r9
+
+ movq %rbx,%rax
+ movq %r12,%rdx
+ shlq $12,%rax
+ shlq $12,%rdx
+ orq %r8,%rax
+ orq %r9,%rdx
+ andl $0x3ffffff,%eax
+ andl $0x3ffffff,%edx
+ movl %eax,-16(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,-12(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,0(%rdi)
+ movq %rbx,%r8
+ movl %edx,4(%rdi)
+ movq %r12,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ shrq $14,%r9
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,16(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,20(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,32(%rdi)
+ shrq $26,%r8
+ movl %edx,36(%rdi)
+ shrq $26,%r9
+
+ movq %r10,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,48(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r9d,52(%rdi)
+ leaq (%r9,%r9,4),%r9
+ movl %r8d,64(%rdi)
+ movl %r9d,68(%rdi)
+
+ movq %r12,%rax
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-52(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-36(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-20(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-4(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,12(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,28(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,44(%rdi)
+
+ movq %r10,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,60(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,76(%rdi)
+
+ movq %r12,%rax
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-56(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-40(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-24(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-8(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,8(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,24(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,40(%rdi)
+
+ movq %r10,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,56(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,72(%rdi)
+
+ leaq -48-64(%rdi),%rdi
+.endm
+
+#ifdef CONFIG_AS_AVX
+.align 32
+ENTRY(poly1305_blocks_avx)
+
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ andq $-16,%rdx
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ testq $31,%rdx
+ jz .Leven_avx
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_avx_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
+
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ subq $16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ ret
+
+.align 32
+.Lbase2_64_avx:
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $31,%rdx
+ jz .Linit_avx
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+.Linit_avx:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+
+
+.align 32
+.Leven_avx:
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx:
+ leaq 8(%rsp),%r10
+ andq $-32,%rsp
+ subq $8,%rsp
+ leaq -88(%rsp),%r11
+ subq $0x178,%rsp
+ subq $64,%rdx
+ leaq -32(%rsi),%rax
+ cmovcq %rax,%rsi
+
+ vmovdqu 48(%rdi),%xmm14
+ leaq 112(%rdi),%rdi
+ leaq .Lconst(%rip),%rcx
+
+ vmovdqu 32(%rsi),%xmm5
+ vmovdqu 48(%rsi),%xmm6
+ vmovdqa 64(%rcx),%xmm15
+
+ vpsrldq $6,%xmm5,%xmm7
+ vpsrldq $6,%xmm6,%xmm8
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+ vpsrlq $40,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+ jbe .Lskip_loop_avx
+
+
+ vmovdqu -48(%rdi),%xmm11
+ vmovdqu -32(%rdi),%xmm12
+ vpshufd $0xEE,%xmm14,%xmm13
+ vpshufd $0x44,%xmm14,%xmm10
+ vmovdqa %xmm13,-144(%r11)
+ vmovdqa %xmm10,0(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vmovdqu -16(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-128(%r11)
+ vmovdqa %xmm11,16(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqu 0(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-112(%r11)
+ vmovdqa %xmm12,32(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm14
+ vmovdqu 16(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm14,-96(%r11)
+ vmovdqa %xmm10,48(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm13
+ vmovdqu 32(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm13,-80(%r11)
+ vmovdqa %xmm11,64(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm14
+ vmovdqu 48(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm14,-64(%r11)
+ vmovdqa %xmm12,80(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm13
+ vmovdqu 64(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm13,-48(%r11)
+ vmovdqa %xmm10,96(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-32(%r11)
+ vmovdqa %xmm11,112(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqa 0(%rsp),%xmm14
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-16(%r11)
+ vmovdqa %xmm12,128(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vmovdqa %xmm2,32(%r11)
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vmovdqa 16(%rsp),%xmm2
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vmovdqa %xmm0,0(%r11)
+ vpmuludq 32(%rsp),%xmm9,%xmm0
+ vmovdqa %xmm1,16(%r11)
+ vpmuludq %xmm8,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm10,%xmm10
+ vpaddq %xmm1,%xmm14,%xmm14
+ vmovdqa %xmm3,48(%r11)
+ vpmuludq %xmm7,%xmm2,%xmm0
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm13,%xmm13
+ vmovdqa 48(%rsp),%xmm3
+ vpaddq %xmm1,%xmm12,%xmm12
+ vmovdqa %xmm4,64(%r11)
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpmuludq %xmm7,%xmm3,%xmm0
+ vpaddq %xmm2,%xmm11,%xmm11
+
+ vmovdqa 64(%rsp),%xmm4
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm3,%xmm1
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm1,%xmm13,%xmm13
+ vmovdqa 80(%rsp),%xmm2
+ vpaddq %xmm3,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm4,%xmm0
+ vpmuludq %xmm8,%xmm4,%xmm4
+ vpaddq %xmm0,%xmm11,%xmm11
+ vmovdqa 96(%rsp),%xmm3
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vmovdqa 128(%rsp),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpaddq %xmm2,%xmm13,%xmm13
+ vpmuludq %xmm9,%xmm3,%xmm0
+ vpmuludq %xmm8,%xmm3,%xmm1
+ vpaddq %xmm0,%xmm12,%xmm12
+ vmovdqu 0(%rsi),%xmm0
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm3,%xmm3
+ vpmuludq %xmm7,%xmm4,%xmm7
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vmovdqu 16(%rsi),%xmm1
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm8,%xmm4,%xmm8
+ vpmuludq %xmm9,%xmm4,%xmm9
+ vpsrldq $6,%xmm0,%xmm2
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm9,%xmm13,%xmm13
+ vpsrldq $6,%xmm1,%xmm3
+ vpmuludq 112(%rsp),%xmm5,%xmm9
+ vpmuludq %xmm6,%xmm4,%xmm5
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpaddq %xmm9,%xmm14,%xmm14
+ vmovdqa -144(%r11),%xmm9
+ vpaddq %xmm5,%xmm10,%xmm10
+
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+
+ vpsrldq $5,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpand 0(%rcx),%xmm4,%xmm4
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+ leaq 32(%rsi),%rax
+ leaq 64(%rsi),%rsi
+ subq $64,%rdx
+ cmovcq %rax,%rsi
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vmovdqa -128(%r11),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpmuludq -112(%r11),%xmm4,%xmm5
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -96(%r11),%xmm8
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpaddq %xmm7,%xmm11,%xmm11
+
+ vmovdqa -80(%r11),%xmm9
+ vpmuludq %xmm2,%xmm8,%xmm5
+ vpmuludq %xmm1,%xmm8,%xmm6
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -64(%r11),%xmm7
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpmuludq %xmm4,%xmm9,%xmm5
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm5,%xmm11,%xmm11
+ vmovdqa -48(%r11),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vmovdqa -16(%r11),%xmm9
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpmuludq %xmm4,%xmm8,%xmm5
+ vpaddq %xmm7,%xmm13,%xmm13
+ vpaddq %xmm5,%xmm12,%xmm12
+ vmovdqu 32(%rsi),%xmm5
+ vpmuludq %xmm3,%xmm8,%xmm7
+ vpmuludq %xmm2,%xmm8,%xmm8
+ vpaddq %xmm7,%xmm11,%xmm11
+ vmovdqu 48(%rsi),%xmm6
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpmuludq %xmm2,%xmm9,%xmm2
+ vpmuludq %xmm3,%xmm9,%xmm3
+ vpsrldq $6,%xmm5,%xmm7
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm9,%xmm4
+ vpsrldq $6,%xmm6,%xmm8
+ vpaddq %xmm3,%xmm12,%xmm2
+ vpaddq %xmm4,%xmm13,%xmm3
+ vpmuludq -32(%r11),%xmm0,%xmm4
+ vpmuludq %xmm1,%xmm9,%xmm0
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpaddq %xmm4,%xmm14,%xmm4
+ vpaddq %xmm0,%xmm10,%xmm0
+
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+
+ vpsrldq $5,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vmovdqa 0(%rsp),%xmm14
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpand 0(%rcx),%xmm9,%xmm9
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm11,%xmm1
+
+ vpsrlq $26,%xmm4,%xmm10
+ vpand %xmm15,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm1,%xmm11
+ vpand %xmm15,%xmm1,%xmm1
+ vpaddq %xmm11,%xmm2,%xmm2
+
+ vpaddq %xmm10,%xmm0,%xmm0
+ vpsllq $2,%xmm10,%xmm10
+ vpaddq %xmm10,%xmm0,%xmm0
+
+ vpsrlq $26,%xmm2,%xmm12
+ vpand %xmm15,%xmm2,%xmm2
+ vpaddq %xmm12,%xmm3,%xmm3
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm1,%xmm1
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+ vpshufd $0x10,%xmm14,%xmm14
+ addq $32,%rdx
+ jnz .Long_tail_avx
+
+ vpaddq %xmm2,%xmm7,%xmm7
+ vpaddq %xmm0,%xmm5,%xmm5
+ vpaddq %xmm1,%xmm6,%xmm6
+ vpaddq %xmm3,%xmm8,%xmm8
+ vpaddq %xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+ vmovdqa %xmm2,32(%r11)
+ vmovdqa %xmm0,0(%r11)
+ vmovdqa %xmm1,16(%r11)
+ vmovdqa %xmm3,48(%r11)
+ vmovdqa %xmm4,64(%r11)
+
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpshufd $0x10,-48(%rdi),%xmm2
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm8,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpshufd $0x10,-32(%rdi),%xmm3
+ vpmuludq %xmm7,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpshufd $0x10,-16(%rdi),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm9,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vpshufd $0x10,0(%rdi),%xmm2
+ vpmuludq %xmm7,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm13,%xmm13
+ vpshufd $0x10,16(%rdi),%xmm3
+ vpmuludq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpshufd $0x10,32(%rdi),%xmm4
+ vpmuludq %xmm8,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm10,%xmm10
+
+ vpmuludq %xmm6,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm13,%xmm13
+ vpshufd $0x10,48(%rdi),%xmm2
+ vpmuludq %xmm9,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm12,%xmm12
+ vpshufd $0x10,64(%rdi),%xmm3
+ vpmuludq %xmm8,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm14,%xmm14
+ vpmuludq %xmm9,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpmuludq %xmm8,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm7,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm6,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+
+ vpsrldq $6,%xmm0,%xmm2
+ vpsrldq $6,%xmm1,%xmm3
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+ vpsrlq $40,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpshufd $0x32,-64(%rdi),%xmm9
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpshufd $0x32,-48(%rdi),%xmm7
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpshufd $0x32,-32(%rdi),%xmm8
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpshufd $0x32,-16(%rdi),%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpshufd $0x32,0(%rdi),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm13,%xmm13
+ vpshufd $0x32,16(%rdi),%xmm8
+ vpmuludq %xmm0,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm12,%xmm12
+ vpmuludq %xmm4,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpshufd $0x32,32(%rdi),%xmm9
+ vpmuludq %xmm3,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm10,%xmm10
+
+ vpmuludq %xmm1,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpshufd $0x32,48(%rdi),%xmm7
+ vpmuludq %xmm4,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpshufd $0x32,64(%rdi),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm14,%xmm14
+ vpmuludq %xmm4,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm3,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm2,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm1,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
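+ /* add the high and low 64-bit halves of each accumulator together */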
+ vpsrldq $8,%xmm14,%xmm9
+ vpsrldq $8,%xmm13,%xmm8
+ vpsrldq $8,%xmm11,%xmm6
+ vpsrldq $8,%xmm10,%xmm5
+ vpsrldq $8,%xmm12,%xmm7
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpaddq %xmm9,%xmm14,%xmm14
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpaddq %xmm7,%xmm12,%xmm12
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm14,%xmm4
+ vpand %xmm15,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm11,%xmm1
+ vpand %xmm15,%xmm11,%xmm11
+ vpaddq %xmm1,%xmm12,%xmm12
+
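+ /* the carry out of the top limb wraps to the bottom limb times 5, since 2^130 = 5 (mod 2^130 - 5) */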
+ vpaddq %xmm4,%xmm10,%xmm10
+ vpsllq $2,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpsrlq $26,%xmm12,%xmm2
+ vpand %xmm15,%xmm12,%xmm12
+ vpaddq %xmm2,%xmm13,%xmm13
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vmovd %xmm10,-112(%rdi)
+ vmovd %xmm11,-108(%rdi)
+ vmovd %xmm12,-104(%rdi)
+ vmovd %xmm13,-100(%rdi)
+ vmovd %xmm14,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+ENDPROC(poly1305_blocks_avx)
+
+.align 32
+ENTRY(poly1305_emit_avx)
+ cmpl $0,20(%rdi)
+ je .Lemit
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ecx
+ movl 8(%rdi),%r8d
+ movl 12(%rdi),%r11d
+ movl 16(%rdi),%r10d
+
+ shlq $26,%rcx
+ movq %r8,%r9
+ shlq $52,%r8
+ addq %rcx,%rax
+ shrq $12,%r9
+ addq %rax,%r8
+ adcq $0,%r9
+
+ shlq $14,%r11
+ movq %r10,%rax
+ shrq $24,%r10
+ addq %r11,%r9
+ shlq $40,%rax
+ addq %rax,%r9
+ adcq $0,%r10
+
+ movq %r10,%rax
+ movq %r10,%rcx
+ andq $3,%r10
+ shrq $2,%rax
+ andq $-4,%rcx
+ addq %rcx,%rax
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+
+ movq %r8,%rax
+ addq $5,%r8
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10
+ cmovnzq %r8,%rax
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi)
+ movq %rcx,8(%rsi)
+
+ ret
+ENDPROC(poly1305_emit_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+.align 32
+ENTRY(poly1305_blocks_avx2)
+
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2:
+ andq $-16,%rdx
+ jz .Lno_data_avx2
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2
+
+ testq $63,%rdx
+ jz .Leven_avx2
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_avx2_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
+
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx2:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+ ret
+
+
+.align 32
+.Lbase2_64_avx2:
+
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx2_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx2:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue:
+ jmp .Ldo_avx2
+
+
+.align 32
+.Leven_avx2:
+
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2:
+ leaq 8(%rsp),%r10
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5
+
+
+
+ vmovdqu 0(%rsi),%xmm7
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ vpaddq %ymm2,%ymm9,%ymm2
+ subq $64,%rdx
+ jz .Ltail_avx2
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpsrlq $30,%ymm10,%ymm10
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ subq $64,%rdx
+ jnz .Loop_avx2
+
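+/* 0x66,0x90 below encodes a two-byte NOP (padding between the loop and the tail) */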
+.byte 0x66,0x90
+.Ltail_avx2:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqu 4(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrldq $8,%ymm12,%ymm8
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+
+ENDPROC(poly1305_blocks_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align 32
+ENTRY(poly1305_blocks_avx512)
+
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2_512
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2_512:
+ andq $-16,%rdx
+ jz .Lno_data_avx2_512
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2_512
+
+ testq $63,%rdx
+ jz .Leven_avx2_512
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_avx2_body_512:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
+
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2_512
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2_512
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2_512
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx2_512
+
+.align 32
+.Lstore_base2_64_avx2_512:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx2_512
+
+.align 16
+.Lstore_base2_26_avx2_512:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx2_512:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx2_512:
+.Lblocks_avx2_epilogue_512:
+ ret
+
+
+.align 32
+.Lbase2_64_avx2_512:
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx2_body_512:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2_512
+
+.Lbase2_64_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2_512
+
+.Linit_avx2_512:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx2_512:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue_512:
+ jmp .Ldo_avx2_512
+
+
+.align 32
+.Leven_avx2_512:
+
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2_512:
+ cmpq $512,%rdx
+ jae .Lblocks_avx512
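+ /* inputs shorter than 512 bytes fall through to the AVX2-style code below */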
+.Lskip_avx512:
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5
+
+
+
+ vmovdqu 0(%rsi),%xmm7
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ vpaddq %ymm2,%ymm9,%ymm2
+ subq $64,%rdx
+ jz .Ltail_avx2_512
+ jmp .Loop_avx2_512
+
+.align 32
+.Loop_avx2_512:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpsrlq $30,%ymm10,%ymm10
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ subq $64,%rdx
+ jnz .Loop_avx2_512
+
+.byte 0x66,0x90
+.Ltail_avx2_512:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqu 4(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrldq $8,%ymm12,%ymm8
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+
+.Lblocks_avx512:
+
+ movl $15,%eax
+ kmovw %eax,%k2
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm9
+
+ vmovdqu32 -64(%rdi),%zmm16{%k2}{z}
+ andq $-512,%rsp
+ vmovdqu32 -48(%rdi),%zmm17{%k2}{z}
+ movq $0x20,%rax
+ vmovdqu32 -32(%rdi),%zmm21{%k2}{z}
+ vmovdqu32 -16(%rdi),%zmm18{%k2}{z}
+ vmovdqu32 0(%rdi),%zmm22{%k2}{z}
+ vmovdqu32 16(%rdi),%zmm19{%k2}{z}
+ vmovdqu32 32(%rdi),%zmm23{%k2}{z}
+ vmovdqu32 48(%rdi),%zmm20{%k2}{z}
+ vmovdqu32 64(%rdi),%zmm24{%k2}{z}
+ vpermd %zmm16,%zmm9,%zmm16
+ vpbroadcastq 64(%rcx),%zmm5
+ vpermd %zmm17,%zmm9,%zmm17
+ vpermd %zmm21,%zmm9,%zmm21
+ vpermd %zmm18,%zmm9,%zmm18
+ vmovdqa64 %zmm16,0(%rsp){%k2}
+ vpsrlq $32,%zmm16,%zmm7
+ vpermd %zmm22,%zmm9,%zmm22
+ vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
+ vpsrlq $32,%zmm17,%zmm8
+ vpermd %zmm19,%zmm9,%zmm19
+ vmovdqa64 %zmm21,64(%rsp){%k2}
+ vpermd %zmm23,%zmm9,%zmm23
+ vpermd %zmm20,%zmm9,%zmm20
+ vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
+ vpermd %zmm24,%zmm9,%zmm24
+ vmovdqa64 %zmm22,128(%rsp){%k2}
+ vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm23,192(%rsp){%k2}
+ vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm24,256(%rsp){%k2}
+
+ vpmuludq %zmm7,%zmm16,%zmm11
+ vpmuludq %zmm7,%zmm17,%zmm12
+ vpmuludq %zmm7,%zmm18,%zmm13
+ vpmuludq %zmm7,%zmm19,%zmm14
+ vpmuludq %zmm7,%zmm20,%zmm15
+ vpsrlq $32,%zmm18,%zmm9
+
+ vpmuludq %zmm8,%zmm24,%zmm25
+ vpmuludq %zmm8,%zmm16,%zmm26
+ vpmuludq %zmm8,%zmm17,%zmm27
+ vpmuludq %zmm8,%zmm18,%zmm28
+ vpmuludq %zmm8,%zmm19,%zmm29
+ vpsrlq $32,%zmm19,%zmm10
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+
+ vpmuludq %zmm9,%zmm23,%zmm25
+ vpmuludq %zmm9,%zmm24,%zmm26
+ vpmuludq %zmm9,%zmm17,%zmm28
+ vpmuludq %zmm9,%zmm18,%zmm29
+ vpmuludq %zmm9,%zmm16,%zmm27
+ vpsrlq $32,%zmm20,%zmm6
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm10,%zmm22,%zmm25
+ vpmuludq %zmm10,%zmm16,%zmm28
+ vpmuludq %zmm10,%zmm17,%zmm29
+ vpmuludq %zmm10,%zmm23,%zmm26
+ vpmuludq %zmm10,%zmm24,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm6,%zmm24,%zmm28
+ vpmuludq %zmm6,%zmm16,%zmm29
+ vpmuludq %zmm6,%zmm21,%zmm25
+ vpmuludq %zmm6,%zmm22,%zmm26
+ vpmuludq %zmm6,%zmm23,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vmovdqu64 0(%rsi),%zmm10
+ vmovdqu64 64(%rsi),%zmm6
+ leaq 128(%rsi),%rsi
+
+ vpsrlq $26,%zmm14,%zmm28
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm28,%zmm15,%zmm15
+
+ vpsrlq $26,%zmm11,%zmm25
+ vpandq %zmm5,%zmm11,%zmm11
+ vpaddq %zmm25,%zmm12,%zmm12
+
+ vpsrlq $26,%zmm15,%zmm29
+ vpandq %zmm5,%zmm15,%zmm15
+
+ vpsrlq $26,%zmm12,%zmm26
+ vpandq %zmm5,%zmm12,%zmm12
+ vpaddq %zmm26,%zmm13,%zmm13
+
+ vpaddq %zmm29,%zmm11,%zmm11
+ vpsllq $2,%zmm29,%zmm29
+ vpaddq %zmm29,%zmm11,%zmm11
+
+ vpsrlq $26,%zmm13,%zmm27
+ vpandq %zmm5,%zmm13,%zmm13
+ vpaddq %zmm27,%zmm14,%zmm14
+
+ vpsrlq $26,%zmm11,%zmm25
+ vpandq %zmm5,%zmm11,%zmm11
+ vpaddq %zmm25,%zmm12,%zmm12
+
+ vpsrlq $26,%zmm14,%zmm28
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm28,%zmm15,%zmm15
+
+ vpunpcklqdq %zmm6,%zmm10,%zmm7
+ vpunpckhqdq %zmm6,%zmm10,%zmm6
+
+ vmovdqa32 128(%rcx),%zmm25
+ movl $0x7777,%eax
+ kmovw %eax,%k1
+
+ vpermd %zmm16,%zmm25,%zmm16
+ vpermd %zmm17,%zmm25,%zmm17
+ vpermd %zmm18,%zmm25,%zmm18
+ vpermd %zmm19,%zmm25,%zmm19
+ vpermd %zmm20,%zmm25,%zmm20
+
+ vpermd %zmm11,%zmm25,%zmm16{%k1}
+ vpermd %zmm12,%zmm25,%zmm17{%k1}
+ vpermd %zmm13,%zmm25,%zmm18{%k1}
+ vpermd %zmm14,%zmm25,%zmm19{%k1}
+ vpermd %zmm15,%zmm25,%zmm20{%k1}
+
+ vpslld $2,%zmm17,%zmm21
+ vpslld $2,%zmm18,%zmm22
+ vpslld $2,%zmm19,%zmm23
+ vpslld $2,%zmm20,%zmm24
+ vpaddd %zmm17,%zmm21,%zmm21
+ vpaddd %zmm18,%zmm22,%zmm22
+ vpaddd %zmm19,%zmm23,%zmm23
+ vpaddd %zmm20,%zmm24,%zmm24
+
+ vpbroadcastq 32(%rcx),%zmm30
+
+ vpsrlq $52,%zmm7,%zmm9
+ vpsllq $12,%zmm6,%zmm10
+ vporq %zmm10,%zmm9,%zmm9
+ vpsrlq $26,%zmm7,%zmm8
+ vpsrlq $14,%zmm6,%zmm10
+ vpsrlq $40,%zmm6,%zmm6
+ vpandq %zmm5,%zmm9,%zmm9
+ vpandq %zmm5,%zmm7,%zmm7
+
+ vpaddq %zmm2,%zmm9,%zmm2
+ subq $192,%rdx
+ jbe .Ltail_avx512
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+
+ vpmuludq %zmm2,%zmm17,%zmm14
+ vpaddq %zmm0,%zmm7,%zmm0
+ vpmuludq %zmm2,%zmm18,%zmm15
+ vpandq %zmm5,%zmm8,%zmm8
+ vpmuludq %zmm2,%zmm23,%zmm11
+ vpandq %zmm5,%zmm10,%zmm10
+ vpmuludq %zmm2,%zmm24,%zmm12
+ vporq %zmm30,%zmm6,%zmm6
+ vpmuludq %zmm2,%zmm16,%zmm13
+ vpaddq %zmm1,%zmm8,%zmm1
+ vpaddq %zmm3,%zmm10,%zmm3
+ vpaddq %zmm4,%zmm6,%zmm4
+
+ vmovdqu64 0(%rsi),%zmm10
+ vmovdqu64 64(%rsi),%zmm6
+ leaq 128(%rsi),%rsi
+ vpmuludq %zmm0,%zmm19,%zmm28
+ vpmuludq %zmm0,%zmm20,%zmm29
+ vpmuludq %zmm0,%zmm16,%zmm25
+ vpmuludq %zmm0,%zmm17,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+
+ vpmuludq %zmm1,%zmm18,%zmm28
+ vpmuludq %zmm1,%zmm19,%zmm29
+ vpmuludq %zmm1,%zmm24,%zmm25
+ vpmuludq %zmm0,%zmm18,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpunpcklqdq %zmm6,%zmm10,%zmm7
+ vpunpckhqdq %zmm6,%zmm10,%zmm6
+
+ vpmuludq %zmm3,%zmm16,%zmm28
+ vpmuludq %zmm3,%zmm17,%zmm29
+ vpmuludq %zmm1,%zmm16,%zmm26
+ vpmuludq %zmm1,%zmm17,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm24,%zmm28
+ vpmuludq %zmm4,%zmm16,%zmm29
+ vpmuludq %zmm3,%zmm22,%zmm25
+ vpmuludq %zmm3,%zmm23,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpmuludq %zmm3,%zmm24,%zmm27
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm21,%zmm25
+ vpmuludq %zmm4,%zmm22,%zmm26
+ vpmuludq %zmm4,%zmm23,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm0
+ vpaddq %zmm26,%zmm12,%zmm1
+ vpaddq %zmm27,%zmm13,%zmm2
+
+ vpsrlq $52,%zmm7,%zmm9
+ vpsllq $12,%zmm6,%zmm10
+
+ vpsrlq $26,%zmm14,%zmm3
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm3,%zmm15,%zmm4
+
+ vporq %zmm10,%zmm9,%zmm9
+
+ vpsrlq $26,%zmm0,%zmm11
+ vpandq %zmm5,%zmm0,%zmm0
+ vpaddq %zmm11,%zmm1,%zmm1
+
+ vpandq %zmm5,%zmm9,%zmm9
+
+ vpsrlq $26,%zmm4,%zmm15
+ vpandq %zmm5,%zmm4,%zmm4
+
+ vpsrlq $26,%zmm1,%zmm12
+ vpandq %zmm5,%zmm1,%zmm1
+ vpaddq %zmm12,%zmm2,%zmm2
+
+ vpaddq %zmm15,%zmm0,%zmm0
+ vpsllq $2,%zmm15,%zmm15
+ vpaddq %zmm15,%zmm0,%zmm0
+
+ vpaddq %zmm9,%zmm2,%zmm2
+ vpsrlq $26,%zmm7,%zmm8
+
+ vpsrlq $26,%zmm2,%zmm13
+ vpandq %zmm5,%zmm2,%zmm2
+ vpaddq %zmm13,%zmm14,%zmm3
+
+ vpsrlq $14,%zmm6,%zmm10
+
+ vpsrlq $26,%zmm0,%zmm11
+ vpandq %zmm5,%zmm0,%zmm0
+ vpaddq %zmm11,%zmm1,%zmm1
+
+ vpsrlq $40,%zmm6,%zmm6
+
+ vpsrlq $26,%zmm3,%zmm14
+ vpandq %zmm5,%zmm3,%zmm3
+ vpaddq %zmm14,%zmm4,%zmm4
+
+ vpandq %zmm5,%zmm7,%zmm7
+
+ subq $128,%rdx
+ ja .Loop_avx512
+
+.Ltail_avx512:
+
+ vpsrlq $32,%zmm16,%zmm16
+ vpsrlq $32,%zmm17,%zmm17
+ vpsrlq $32,%zmm18,%zmm18
+ vpsrlq $32,%zmm23,%zmm23
+ vpsrlq $32,%zmm24,%zmm24
+ vpsrlq $32,%zmm19,%zmm19
+ vpsrlq $32,%zmm20,%zmm20
+ vpsrlq $32,%zmm21,%zmm21
+ vpsrlq $32,%zmm22,%zmm22
+
+ leaq (%rsi,%rdx,1),%rsi
+
+ vpaddq %zmm0,%zmm7,%zmm0
+
+ vpmuludq %zmm2,%zmm17,%zmm14
+ vpmuludq %zmm2,%zmm18,%zmm15
+ vpmuludq %zmm2,%zmm23,%zmm11
+ vpandq %zmm5,%zmm8,%zmm8
+ vpmuludq %zmm2,%zmm24,%zmm12
+ vpandq %zmm5,%zmm10,%zmm10
+ vpmuludq %zmm2,%zmm16,%zmm13
+ vporq %zmm30,%zmm6,%zmm6
+ vpaddq %zmm1,%zmm8,%zmm1
+ vpaddq %zmm3,%zmm10,%zmm3
+ vpaddq %zmm4,%zmm6,%zmm4
+
+ vmovdqu 0(%rsi),%xmm7
+ vpmuludq %zmm0,%zmm19,%zmm28
+ vpmuludq %zmm0,%zmm20,%zmm29
+ vpmuludq %zmm0,%zmm16,%zmm25
+ vpmuludq %zmm0,%zmm17,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+
+ vmovdqu 16(%rsi),%xmm8
+ vpmuludq %zmm1,%zmm18,%zmm28
+ vpmuludq %zmm1,%zmm19,%zmm29
+ vpmuludq %zmm1,%zmm24,%zmm25
+ vpmuludq %zmm0,%zmm18,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vpmuludq %zmm3,%zmm16,%zmm28
+ vpmuludq %zmm3,%zmm17,%zmm29
+ vpmuludq %zmm1,%zmm16,%zmm26
+ vpmuludq %zmm1,%zmm17,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ vpmuludq %zmm4,%zmm24,%zmm28
+ vpmuludq %zmm4,%zmm16,%zmm29
+ vpmuludq %zmm3,%zmm22,%zmm25
+ vpmuludq %zmm3,%zmm23,%zmm26
+ vpmuludq %zmm3,%zmm24,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm3
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm21,%zmm25
+ vpmuludq %zmm4,%zmm22,%zmm26
+ vpmuludq %zmm4,%zmm23,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm0
+ vpaddq %zmm26,%zmm12,%zmm1
+ vpaddq %zmm27,%zmm13,%zmm2
+
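+ /* collapse the eight 64-bit lanes of each accumulator into lane 0 by repeated swap-and-add */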
+ movl $1,%eax
+ vpermq $0xb1,%zmm3,%zmm14
+ vpermq $0xb1,%zmm15,%zmm4
+ vpermq $0xb1,%zmm0,%zmm11
+ vpermq $0xb1,%zmm1,%zmm12
+ vpermq $0xb1,%zmm2,%zmm13
+ vpaddq %zmm14,%zmm3,%zmm3
+ vpaddq %zmm15,%zmm4,%zmm4
+ vpaddq %zmm11,%zmm0,%zmm0
+ vpaddq %zmm12,%zmm1,%zmm1
+ vpaddq %zmm13,%zmm2,%zmm2
+
+ kmovw %eax,%k3
+ vpermq $0x2,%zmm3,%zmm14
+ vpermq $0x2,%zmm4,%zmm15
+ vpermq $0x2,%zmm0,%zmm11
+ vpermq $0x2,%zmm1,%zmm12
+ vpermq $0x2,%zmm2,%zmm13
+ vpaddq %zmm14,%zmm3,%zmm3
+ vpaddq %zmm15,%zmm4,%zmm4
+ vpaddq %zmm11,%zmm0,%zmm0
+ vpaddq %zmm12,%zmm1,%zmm1
+ vpaddq %zmm13,%zmm2,%zmm2
+
+ vextracti64x4 $0x1,%zmm3,%ymm14
+ vextracti64x4 $0x1,%zmm4,%ymm15
+ vextracti64x4 $0x1,%zmm0,%ymm11
+ vextracti64x4 $0x1,%zmm1,%ymm12
+ vextracti64x4 $0x1,%zmm2,%ymm13
+ vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
+ vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
+ vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
+ vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
+ vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm2,%ymm9,%ymm2
+ vpand %ymm5,%ymm8,%ymm8
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ leaq 144(%rsp),%rax
+ addq $64,%rdx
+ jnz .Ltail_avx2_512
+
+ vpsubq %ymm9,%ymm2,%ymm2
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ vzeroall
+ leaq -8(%r10),%rsp
+
+ ret
+
+ENDPROC(poly1305_blocks_avx512)
+#endif /* CONFIG_AS_AVX512 */
diff --git a/src/crypto/zinc/poly1305/poly1305.c b/src/crypto/zinc/poly1305/poly1305.c
new file mode 100644
index 0000000..4b90523
--- /dev/null
+++ b/src/crypto/zinc/poly1305/poly1305.c
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Implementation of the Poly1305 message authenticator.
+ *
+ * Information: https://cr.yp.to/mac.html
+ */
+
+#include <zinc/poly1305.h>
+
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ return false;
+}
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+ return false;
+}
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+ return false;
+}
+void __init poly1305_fpu_init(void)
+{
+}
+#endif
+
+struct poly1305_internal {
+ u32 h[5];
+ u32 r[4];
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ st->r[0] = get_unaligned_le32(&key[ 0]) & 0x0fffffff;
+ st->r[1] = get_unaligned_le32(&key[ 4]) & 0x0ffffffc;
+ st->r[2] = get_unaligned_le32(&key[ 8]) & 0x0ffffffc;
+ st->r[3] = get_unaligned_le32(&key[12]) & 0x0ffffffc;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len,
+ const u32 padbit)
+{
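+/*
+ * CONSTANT_TIME_CARRY(a, b) evaluates to 1 when a < b, i.e. when the
+ * addition that produced a carried out, without a data-dependent branch.
+ */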
+#define CONSTANT_TIME_CARRY(a, b) \
+ ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u32 r0, r1, r2, r3;
+ u32 s1, s2, s3;
+ u32 h0, h1, h2, h3, h4, c;
+ u64 d0, d1, d2, d3;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+
+ s1 = r1 + (r1 >> 2);
+ s2 = r2 + (r2 >> 2);
+ s3 = r3 + (r3 >> 2);
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (len >= POLY1305_BLOCK_SIZE) {
+ /* h += m[i] */
+ h0 = (u32)(d0 = (u64)h0 + (0 ) + get_unaligned_le32(&inp[ 0]));
+ h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + get_unaligned_le32(&inp[ 4]));
+ h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + get_unaligned_le32(&inp[ 8]));
+ h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + get_unaligned_le32(&inp[12]));
+ h4 += (u32)(d3 >> 32) + padbit;
+
+ /* h *= r "%" p, where "%" stands for "partial remainder" */
+ d0 = ((u64)h0 * r0) +
+ ((u64)h1 * s3) +
+ ((u64)h2 * s2) +
+ ((u64)h3 * s1);
+ d1 = ((u64)h0 * r1) +
+ ((u64)h1 * r0) +
+ ((u64)h2 * s3) +
+ ((u64)h3 * s2) +
+ (h4 * s1);
+ d2 = ((u64)h0 * r2) +
+ ((u64)h1 * r1) +
+ ((u64)h2 * r0) +
+ ((u64)h3 * s3) +
+ (h4 * s2);
+ d3 = ((u64)h0 * r3) +
+ ((u64)h1 * r2) +
+ ((u64)h2 * r1) +
+ ((u64)h3 * r0) +
+ (h4 * s3);
+ h4 = (h4 * r0);
+
+ /* last reduction step: */
+ /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */
+ h0 = (u32)d0;
+ h1 = (u32)(d1 += d0 >> 32);
+ h2 = (u32)(d2 += d1 >> 32);
+ h3 = (u32)(d3 += d2 >> 32);
+ h4 += (u32)(d3 >> 32);
+ /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */
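+ /* 5 * (h4 >> 2) == (h4 >> 2) + ((h4 >> 2) << 2) == (h4 >> 2) + (h4 & ~3U) */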
+ c = (h4 >> 2) + (h4 & ~3U);
+ h4 &= 3;
+ h0 += c;
+ h1 += (c = CONSTANT_TIME_CARRY(h0, c));
+ h2 += (c = CONSTANT_TIME_CARRY(h1, c));
+ h3 += (c = CONSTANT_TIME_CARRY(h2, c));
+ h4 += CONSTANT_TIME_CARRY(h3, c);
+ /*
+ * Occasional overflows into the 3rd bit of h4 are taken care of
+ * "naturally". If after this point we end up at the top of this
+ * loop, then the overflow bit will be accounted for in the next
+ * iteration. If we end up in poly1305_emit, then the comparison
+ * to the modulus below will still count as a "carry into the
+ * 131st bit", so the properly reduced value will be picked by
+ * the conditional move.
+ */
+
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+#undef CONSTANT_TIME_CARRY
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u32 h0, h1, h2, h3, h4;
+ u32 g0, g1, g2, g3, g4;
+ u64 t;
+ u32 mask;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ /* compare to modulus by computing h + -p */
+ g0 = (u32)(t = (u64)h0 + 5);
+ g1 = (u32)(t = (u64)h1 + (t >> 32));
+ g2 = (u32)(t = (u64)h2 + (t >> 32));
+ g3 = (u32)(t = (u64)h3 + (t >> 32));
+ g4 = h4 + (u32)(t >> 32);
+
+ /* if there was carry into 131st bit, h3:h0 = g3:g0 */
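+ /* g4 >> 2 extracts that carry, so mask is all-ones exactly when h >= p */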
+ mask = 0 - (g4 >> 2);
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ mask = ~mask;
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+
+ /* mac = (h + nonce) % (2^128) */
+ h0 = (u32)(t = (u64)h0 + nonce[0]);
+ h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);
+ h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);
+ h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);
+
+ put_unaligned_le32(h0, &mac[ 0]);
+ put_unaligned_le32(h1, &mac[ 4]);
+ put_unaligned_le32(h2, &mac[ 8]);
+ put_unaligned_le32(h3, &mac[12]);
+}
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ ctx->nonce[0] = get_unaligned_le32(&key[16]);
+ ctx->nonce[1] = get_unaligned_le32(&key[20]);
+ ctx->nonce[2] = get_unaligned_le32(&key[24]);
+ ctx->nonce[3] = get_unaligned_le32(&key[28]);
+
+ if (!poly1305_init_arch(ctx->opaque, key, simd_context))
+ poly1305_init_generic(ctx->opaque, key);
+ ctx->num = 0;
+}
+EXPORT_SYMBOL(poly1305_init);
+
+static inline void poly1305_blocks(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit,
+ simd_context_t simd_context)
+{
+ if (!poly1305_blocks_arch(ctx, inp, len, padbit, simd_context))
+ poly1305_blocks_generic(ctx, inp, len, padbit);
+}
+
+static inline void poly1305_emit(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+ if (!poly1305_emit_arch(ctx, mac, nonce, simd_context))
+ poly1305_emit_generic(ctx, mac, nonce);
+}
+
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, size_t len,
+ simd_context_t simd_context)
+{
+ const size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+ size_t rem;
+
+ if (num) {
+ rem = POLY1305_BLOCK_SIZE - num;
+ if (len >= rem) {
+ memcpy(ctx->data + num, inp, rem);
+ poly1305_blocks(ctx->opaque, ctx->data,
+ POLY1305_BLOCK_SIZE, 1, simd_context);
+ inp += rem;
+ len -= rem;
+ } else {
+ /* Still not enough data to process a block. */
+ memcpy(ctx->data + num, inp, len);
+ ctx->num = num + len;
+ return;
+ }
+ }
+
+ rem = len % POLY1305_BLOCK_SIZE;
+ len -= rem;
+
+ if (len >= POLY1305_BLOCK_SIZE) {
+ poly1305_blocks(ctx->opaque, inp, len, 1, simd_context);
+ inp += len;
+ }
+
+ if (rem)
+ memcpy(ctx->data, inp, rem);
+
+ ctx->num = rem;
+}
+EXPORT_SYMBOL(poly1305_update);
+
+void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
+ simd_context_t simd_context)
+{
+ size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+
+ if (num) {
+ ctx->data[num++] = 1; /* pad bit */
+ while (num < POLY1305_BLOCK_SIZE)
+ ctx->data[num++] = 0;
+ poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0,
+ simd_context);
+ }
+
+ poly1305_emit(ctx->opaque, mac, ctx->nonce, simd_context);
+
+ /* zero out the state */
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(poly1305_finish);
+
+#include "../selftest/poly1305.h"
diff --git a/src/crypto/zinc/selftest/blake2s.h b/src/crypto/zinc/selftest/blake2s.h
new file mode 100644
index 0000000..ddd6a86
--- /dev/null
+++ b/src/crypto/zinc/selftest/blake2s.h
@@ -0,0 +1,2095 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifdef DEBUG
+static const u8 blake2s_testvecs[][BLAKE2S_OUTBYTES] __initconst = {
+ { 0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94,
+ 0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c,
+ 0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e,
+ 0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9 },
+ { 0xe3, 0x4d, 0x74, 0xdb, 0xaf, 0x4f, 0xf4, 0xc6,
+ 0xab, 0xd8, 0x71, 0xcc, 0x22, 0x04, 0x51, 0xd2,
+ 0xea, 0x26, 0x48, 0x84, 0x6c, 0x77, 0x57, 0xfb,
+ 0xaa, 0xc8, 0x2f, 0xe5, 0x1a, 0xd6, 0x4b, 0xea },
+ { 0xdd, 0xad, 0x9a, 0xb1, 0x5d, 0xac, 0x45, 0x49,
+ 0xba, 0x42, 0xf4, 0x9d, 0x26, 0x24, 0x96, 0xbe,
+ 0xf6, 0xc0, 0xba, 0xe1, 0xdd, 0x34, 0x2a, 0x88,
+ 0x08, 0xf8, 0xea, 0x26, 0x7c, 0x6e, 0x21, 0x0c },
+ { 0xe8, 0xf9, 0x1c, 0x6e, 0xf2, 0x32, 0xa0, 0x41,
+ 0x45, 0x2a, 0xb0, 0xe1, 0x49, 0x07, 0x0c, 0xdd,
+ 0x7d, 0xd1, 0x76, 0x9e, 0x75, 0xb3, 0xa5, 0x92,
+ 0x1b, 0xe3, 0x78, 0x76, 0xc4, 0x5c, 0x99, 0x00 },
+ { 0x0c, 0xc7, 0x0e, 0x00, 0x34, 0x8b, 0x86, 0xba,
+ 0x29, 0x44, 0xd0, 0xc3, 0x20, 0x38, 0xb2, 0x5c,
+ 0x55, 0x58, 0x4f, 0x90, 0xdf, 0x23, 0x04, 0xf5,
+ 0x5f, 0xa3, 0x32, 0xaf, 0x5f, 0xb0, 0x1e, 0x20 },
+ { 0xec, 0x19, 0x64, 0x19, 0x10, 0x87, 0xa4, 0xfe,
+ 0x9d, 0xf1, 0xc7, 0x95, 0x34, 0x2a, 0x02, 0xff,
+ 0xc1, 0x91, 0xa5, 0xb2, 0x51, 0x76, 0x48, 0x56,
+ 0xae, 0x5b, 0x8b, 0x57, 0x69, 0xf0, 0xc6, 0xcd },
+ { 0xe1, 0xfa, 0x51, 0x61, 0x8d, 0x7d, 0xf4, 0xeb,
+ 0x70, 0xcf, 0x0d, 0x5a, 0x9e, 0x90, 0x6f, 0x80,
+ 0x6e, 0x9d, 0x19, 0xf7, 0xf4, 0xf0, 0x1e, 0x3b,
+ 0x62, 0x12, 0x88, 0xe4, 0x12, 0x04, 0x05, 0xd6 },
+ { 0x59, 0x80, 0x01, 0xfa, 0xfb, 0xe8, 0xf9, 0x4e,
+ 0xc6, 0x6d, 0xc8, 0x27, 0xd0, 0x12, 0xcf, 0xcb,
+ 0xba, 0x22, 0x28, 0x56, 0x9f, 0x44, 0x8e, 0x89,
+ 0xea, 0x22, 0x08, 0xc8, 0xbf, 0x76, 0x92, 0x93 },
+ { 0xc7, 0xe8, 0x87, 0xb5, 0x46, 0x62, 0x36, 0x35,
+ 0xe9, 0x3e, 0x04, 0x95, 0x59, 0x8f, 0x17, 0x26,
+ 0x82, 0x19, 0x96, 0xc2, 0x37, 0x77, 0x05, 0xb9,
+ 0x3a, 0x1f, 0x63, 0x6f, 0x87, 0x2b, 0xfa, 0x2d },
+ { 0xc3, 0x15, 0xa4, 0x37, 0xdd, 0x28, 0x06, 0x2a,
+ 0x77, 0x0d, 0x48, 0x19, 0x67, 0x13, 0x6b, 0x1b,
+ 0x5e, 0xb8, 0x8b, 0x21, 0xee, 0x53, 0xd0, 0x32,
+ 0x9c, 0x58, 0x97, 0x12, 0x6e, 0x9d, 0xb0, 0x2c },
+ { 0xbb, 0x47, 0x3d, 0xed, 0xdc, 0x05, 0x5f, 0xea,
+ 0x62, 0x28, 0xf2, 0x07, 0xda, 0x57, 0x53, 0x47,
+ 0xbb, 0x00, 0x40, 0x4c, 0xd3, 0x49, 0xd3, 0x8c,
+ 0x18, 0x02, 0x63, 0x07, 0xa2, 0x24, 0xcb, 0xff },
+ { 0x68, 0x7e, 0x18, 0x73, 0xa8, 0x27, 0x75, 0x91,
+ 0xbb, 0x33, 0xd9, 0xad, 0xf9, 0xa1, 0x39, 0x12,
+ 0xef, 0xef, 0xe5, 0x57, 0xca, 0xfc, 0x39, 0xa7,
+ 0x95, 0x26, 0x23, 0xe4, 0x72, 0x55, 0xf1, 0x6d },
+ { 0x1a, 0xc7, 0xba, 0x75, 0x4d, 0x6e, 0x2f, 0x94,
+ 0xe0, 0xe8, 0x6c, 0x46, 0xbf, 0xb2, 0x62, 0xab,
+ 0xbb, 0x74, 0xf4, 0x50, 0xef, 0x45, 0x6d, 0x6b,
+ 0x4d, 0x97, 0xaa, 0x80, 0xce, 0x6d, 0xa7, 0x67 },
+ { 0x01, 0x2c, 0x97, 0x80, 0x96, 0x14, 0x81, 0x6b,
+ 0x5d, 0x94, 0x94, 0x47, 0x7d, 0x4b, 0x68, 0x7d,
+ 0x15, 0xb9, 0x6e, 0xb6, 0x9c, 0x0e, 0x80, 0x74,
+ 0xa8, 0x51, 0x6f, 0x31, 0x22, 0x4b, 0x5c, 0x98 },
+ { 0x91, 0xff, 0xd2, 0x6c, 0xfa, 0x4d, 0xa5, 0x13,
+ 0x4c, 0x7e, 0xa2, 0x62, 0xf7, 0x88, 0x9c, 0x32,
+ 0x9f, 0x61, 0xf6, 0xa6, 0x57, 0x22, 0x5c, 0xc2,
+ 0x12, 0xf4, 0x00, 0x56, 0xd9, 0x86, 0xb3, 0xf4 },
+ { 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21,
+ 0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67,
+ 0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04,
+ 0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d },
+ { 0xef, 0xc0, 0x4c, 0xdc, 0x39, 0x1c, 0x7e, 0x91,
+ 0x19, 0xbd, 0x38, 0x66, 0x8a, 0x53, 0x4e, 0x65,
+ 0xfe, 0x31, 0x03, 0x6d, 0x6a, 0x62, 0x11, 0x2e,
+ 0x44, 0xeb, 0xeb, 0x11, 0xf9, 0xc5, 0x70, 0x80 },
+ { 0x99, 0x2c, 0xf5, 0xc0, 0x53, 0x44, 0x2a, 0x5f,
+ 0xbc, 0x4f, 0xaf, 0x58, 0x3e, 0x04, 0xe5, 0x0b,
+ 0xb7, 0x0d, 0x2f, 0x39, 0xfb, 0xb6, 0xa5, 0x03,
+ 0xf8, 0x9e, 0x56, 0xa6, 0x3e, 0x18, 0x57, 0x8a },
+ { 0x38, 0x64, 0x0e, 0x9f, 0x21, 0x98, 0x3e, 0x67,
+ 0xb5, 0x39, 0xca, 0xcc, 0xae, 0x5e, 0xcf, 0x61,
+ 0x5a, 0xe2, 0x76, 0x4f, 0x75, 0xa0, 0x9c, 0x9c,
+ 0x59, 0xb7, 0x64, 0x83, 0xc1, 0xfb, 0xc7, 0x35 },
+ { 0x21, 0x3d, 0xd3, 0x4c, 0x7e, 0xfe, 0x4f, 0xb2,
+ 0x7a, 0x6b, 0x35, 0xf6, 0xb4, 0x00, 0x0d, 0x1f,
+ 0xe0, 0x32, 0x81, 0xaf, 0x3c, 0x72, 0x3e, 0x5c,
+ 0x9f, 0x94, 0x74, 0x7a, 0x5f, 0x31, 0xcd, 0x3b },
+ { 0xec, 0x24, 0x6e, 0xee, 0xb9, 0xce, 0xd3, 0xf7,
+ 0xad, 0x33, 0xed, 0x28, 0x66, 0x0d, 0xd9, 0xbb,
+ 0x07, 0x32, 0x51, 0x3d, 0xb4, 0xe2, 0xfa, 0x27,
+ 0x8b, 0x60, 0xcd, 0xe3, 0x68, 0x2a, 0x4c, 0xcd },
+ { 0xac, 0x9b, 0x61, 0xd4, 0x46, 0x64, 0x8c, 0x30,
+ 0x05, 0xd7, 0x89, 0x2b, 0xf3, 0xa8, 0x71, 0x9f,
+ 0x4c, 0x81, 0x81, 0xcf, 0xdc, 0xbc, 0x2b, 0x79,
+ 0xfe, 0xf1, 0x0a, 0x27, 0x9b, 0x91, 0x10, 0x95 },
+ { 0x7b, 0xf8, 0xb2, 0x29, 0x59, 0xe3, 0x4e, 0x3a,
+ 0x43, 0xf7, 0x07, 0x92, 0x23, 0xe8, 0x3a, 0x97,
+ 0x54, 0x61, 0x7d, 0x39, 0x1e, 0x21, 0x3d, 0xfd,
+ 0x80, 0x8e, 0x41, 0xb9, 0xbe, 0xad, 0x4c, 0xe7 },
+ { 0x68, 0xd4, 0xb5, 0xd4, 0xfa, 0x0e, 0x30, 0x2b,
+ 0x64, 0xcc, 0xc5, 0xaf, 0x79, 0x29, 0x13, 0xac,
+ 0x4c, 0x88, 0xec, 0x95, 0xc0, 0x7d, 0xdf, 0x40,
+ 0x69, 0x42, 0x56, 0xeb, 0x88, 0xce, 0x9f, 0x3d },
+ { 0xb2, 0xc2, 0x42, 0x0f, 0x05, 0xf9, 0xab, 0xe3,
+ 0x63, 0x15, 0x91, 0x93, 0x36, 0xb3, 0x7e, 0x4e,
+ 0x0f, 0xa3, 0x3f, 0xf7, 0xe7, 0x6a, 0x49, 0x27,
+ 0x67, 0x00, 0x6f, 0xdb, 0x5d, 0x93, 0x54, 0x62 },
+ { 0x13, 0x4f, 0x61, 0xbb, 0xd0, 0xbb, 0xb6, 0x9a,
+ 0xed, 0x53, 0x43, 0x90, 0x45, 0x51, 0xa3, 0xe6,
+ 0xc1, 0xaa, 0x7d, 0xcd, 0xd7, 0x7e, 0x90, 0x3e,
+ 0x70, 0x23, 0xeb, 0x7c, 0x60, 0x32, 0x0a, 0xa7 },
+ { 0x46, 0x93, 0xf9, 0xbf, 0xf7, 0xd4, 0xf3, 0x98,
+ 0x6a, 0x7d, 0x17, 0x6e, 0x6e, 0x06, 0xf7, 0x2a,
+ 0xd1, 0x49, 0x0d, 0x80, 0x5c, 0x99, 0xe2, 0x53,
+ 0x47, 0xb8, 0xde, 0x77, 0xb4, 0xdb, 0x6d, 0x9b },
+ { 0x85, 0x3e, 0x26, 0xf7, 0x41, 0x95, 0x3b, 0x0f,
+ 0xd5, 0xbd, 0xb4, 0x24, 0xe8, 0xab, 0x9e, 0x8b,
+ 0x37, 0x50, 0xea, 0xa8, 0xef, 0x61, 0xe4, 0x79,
+ 0x02, 0xc9, 0x1e, 0x55, 0x4e, 0x9c, 0x73, 0xb9 },
+ { 0xf7, 0xde, 0x53, 0x63, 0x61, 0xab, 0xaa, 0x0e,
+ 0x15, 0x81, 0x56, 0xcf, 0x0e, 0xa4, 0xf6, 0x3a,
+ 0x99, 0xb5, 0xe4, 0x05, 0x4f, 0x8f, 0xa4, 0xc9,
+ 0xd4, 0x5f, 0x62, 0x85, 0xca, 0xd5, 0x56, 0x94 },
+ { 0x4c, 0x23, 0x06, 0x08, 0x86, 0x0a, 0x99, 0xae,
+ 0x8d, 0x7b, 0xd5, 0xc2, 0xcc, 0x17, 0xfa, 0x52,
+ 0x09, 0x6b, 0x9a, 0x61, 0xbe, 0xdb, 0x17, 0xcb,
+ 0x76, 0x17, 0x86, 0x4a, 0xd2, 0x9c, 0xa7, 0xa6 },
+ { 0xae, 0xb9, 0x20, 0xea, 0x87, 0x95, 0x2d, 0xad,
+ 0xb1, 0xfb, 0x75, 0x92, 0x91, 0xe3, 0x38, 0x81,
+ 0x39, 0xa8, 0x72, 0x86, 0x50, 0x01, 0x88, 0x6e,
+ 0xd8, 0x47, 0x52, 0xe9, 0x3c, 0x25, 0x0c, 0x2a },
+ { 0xab, 0xa4, 0xad, 0x9b, 0x48, 0x0b, 0x9d, 0xf3,
+ 0xd0, 0x8c, 0xa5, 0xe8, 0x7b, 0x0c, 0x24, 0x40,
+ 0xd4, 0xe4, 0xea, 0x21, 0x22, 0x4c, 0x2e, 0xb4,
+ 0x2c, 0xba, 0xe4, 0x69, 0xd0, 0x89, 0xb9, 0x31 },
+ { 0x05, 0x82, 0x56, 0x07, 0xd7, 0xfd, 0xf2, 0xd8,
+ 0x2e, 0xf4, 0xc3, 0xc8, 0xc2, 0xae, 0xa9, 0x61,
+ 0xad, 0x98, 0xd6, 0x0e, 0xdf, 0xf7, 0xd0, 0x18,
+ 0x98, 0x3e, 0x21, 0x20, 0x4c, 0x0d, 0x93, 0xd1 },
+ { 0xa7, 0x42, 0xf8, 0xb6, 0xaf, 0x82, 0xd8, 0xa6,
+ 0xca, 0x23, 0x57, 0xc5, 0xf1, 0xcf, 0x91, 0xde,
+ 0xfb, 0xd0, 0x66, 0x26, 0x7d, 0x75, 0xc0, 0x48,
+ 0xb3, 0x52, 0x36, 0x65, 0x85, 0x02, 0x59, 0x62 },
+ { 0x2b, 0xca, 0xc8, 0x95, 0x99, 0x00, 0x0b, 0x42,
+ 0xc9, 0x5a, 0xe2, 0x38, 0x35, 0xa7, 0x13, 0x70,
+ 0x4e, 0xd7, 0x97, 0x89, 0xc8, 0x4f, 0xef, 0x14,
+ 0x9a, 0x87, 0x4f, 0xf7, 0x33, 0xf0, 0x17, 0xa2 },
+ { 0xac, 0x1e, 0xd0, 0x7d, 0x04, 0x8f, 0x10, 0x5a,
+ 0x9e, 0x5b, 0x7a, 0xb8, 0x5b, 0x09, 0xa4, 0x92,
+ 0xd5, 0xba, 0xff, 0x14, 0xb8, 0xbf, 0xb0, 0xe9,
+ 0xfd, 0x78, 0x94, 0x86, 0xee, 0xa2, 0xb9, 0x74 },
+ { 0xe4, 0x8d, 0x0e, 0xcf, 0xaf, 0x49, 0x7d, 0x5b,
+ 0x27, 0xc2, 0x5d, 0x99, 0xe1, 0x56, 0xcb, 0x05,
+ 0x79, 0xd4, 0x40, 0xd6, 0xe3, 0x1f, 0xb6, 0x24,
+ 0x73, 0x69, 0x6d, 0xbf, 0x95, 0xe0, 0x10, 0xe4 },
+ { 0x12, 0xa9, 0x1f, 0xad, 0xf8, 0xb2, 0x16, 0x44,
+ 0xfd, 0x0f, 0x93, 0x4f, 0x3c, 0x4a, 0x8f, 0x62,
+ 0xba, 0x86, 0x2f, 0xfd, 0x20, 0xe8, 0xe9, 0x61,
+ 0x15, 0x4c, 0x15, 0xc1, 0x38, 0x84, 0xed, 0x3d },
+ { 0x7c, 0xbe, 0xe9, 0x6e, 0x13, 0x98, 0x97, 0xdc,
+ 0x98, 0xfb, 0xef, 0x3b, 0xe8, 0x1a, 0xd4, 0xd9,
+ 0x64, 0xd2, 0x35, 0xcb, 0x12, 0x14, 0x1f, 0xb6,
+ 0x67, 0x27, 0xe6, 0xe5, 0xdf, 0x73, 0xa8, 0x78 },
+ { 0xeb, 0xf6, 0x6a, 0xbb, 0x59, 0x7a, 0xe5, 0x72,
+ 0xa7, 0x29, 0x7c, 0xb0, 0x87, 0x1e, 0x35, 0x5a,
+ 0xcc, 0xaf, 0xad, 0x83, 0x77, 0xb8, 0xe7, 0x8b,
+ 0xf1, 0x64, 0xce, 0x2a, 0x18, 0xde, 0x4b, 0xaf },
+ { 0x71, 0xb9, 0x33, 0xb0, 0x7e, 0x4f, 0xf7, 0x81,
+ 0x8c, 0xe0, 0x59, 0xd0, 0x08, 0x82, 0x9e, 0x45,
+ 0x3c, 0x6f, 0xf0, 0x2e, 0xc0, 0xa7, 0xdb, 0x39,
+ 0x3f, 0xc2, 0xd8, 0x70, 0xf3, 0x7a, 0x72, 0x86 },
+ { 0x7c, 0xf7, 0xc5, 0x13, 0x31, 0x22, 0x0b, 0x8d,
+ 0x3e, 0xba, 0xed, 0x9c, 0x29, 0x39, 0x8a, 0x16,
+ 0xd9, 0x81, 0x56, 0xe2, 0x61, 0x3c, 0xb0, 0x88,
+ 0xf2, 0xb0, 0xe0, 0x8a, 0x1b, 0xe4, 0xcf, 0x4f },
+ { 0x3e, 0x41, 0xa1, 0x08, 0xe0, 0xf6, 0x4a, 0xd2,
+ 0x76, 0xb9, 0x79, 0xe1, 0xce, 0x06, 0x82, 0x79,
+ 0xe1, 0x6f, 0x7b, 0xc7, 0xe4, 0xaa, 0x1d, 0x21,
+ 0x1e, 0x17, 0xb8, 0x11, 0x61, 0xdf, 0x16, 0x02 },
+ { 0x88, 0x65, 0x02, 0xa8, 0x2a, 0xb4, 0x7b, 0xa8,
+ 0xd8, 0x67, 0x10, 0xaa, 0x9d, 0xe3, 0xd4, 0x6e,
+ 0xa6, 0x5c, 0x47, 0xaf, 0x6e, 0xe8, 0xde, 0x45,
+ 0x0c, 0xce, 0xb8, 0xb1, 0x1b, 0x04, 0x5f, 0x50 },
+ { 0xc0, 0x21, 0xbc, 0x5f, 0x09, 0x54, 0xfe, 0xe9,
+ 0x4f, 0x46, 0xea, 0x09, 0x48, 0x7e, 0x10, 0xa8,
+ 0x48, 0x40, 0xd0, 0x2f, 0x64, 0x81, 0x0b, 0xc0,
+ 0x8d, 0x9e, 0x55, 0x1f, 0x7d, 0x41, 0x68, 0x14 },
+ { 0x20, 0x30, 0x51, 0x6e, 0x8a, 0x5f, 0xe1, 0x9a,
+ 0xe7, 0x9c, 0x33, 0x6f, 0xce, 0x26, 0x38, 0x2a,
+ 0x74, 0x9d, 0x3f, 0xd0, 0xec, 0x91, 0xe5, 0x37,
+ 0xd4, 0xbd, 0x23, 0x58, 0xc1, 0x2d, 0xfb, 0x22 },
+ { 0x55, 0x66, 0x98, 0xda, 0xc8, 0x31, 0x7f, 0xd3,
+ 0x6d, 0xfb, 0xdf, 0x25, 0xa7, 0x9c, 0xb1, 0x12,
+ 0xd5, 0x42, 0x58, 0x60, 0x60, 0x5c, 0xba, 0xf5,
+ 0x07, 0xf2, 0x3b, 0xf7, 0xe9, 0xf4, 0x2a, 0xfe },
+ { 0x2f, 0x86, 0x7b, 0xa6, 0x77, 0x73, 0xfd, 0xc3,
+ 0xe9, 0x2f, 0xce, 0xd9, 0x9a, 0x64, 0x09, 0xad,
+ 0x39, 0xd0, 0xb8, 0x80, 0xfd, 0xe8, 0xf1, 0x09,
+ 0xa8, 0x17, 0x30, 0xc4, 0x45, 0x1d, 0x01, 0x78 },
+ { 0x17, 0x2e, 0xc2, 0x18, 0xf1, 0x19, 0xdf, 0xae,
+ 0x98, 0x89, 0x6d, 0xff, 0x29, 0xdd, 0x98, 0x76,
+ 0xc9, 0x4a, 0xf8, 0x74, 0x17, 0xf9, 0xae, 0x4c,
+ 0x70, 0x14, 0xbb, 0x4e, 0x4b, 0x96, 0xaf, 0xc7 },
+ { 0x3f, 0x85, 0x81, 0x4a, 0x18, 0x19, 0x5f, 0x87,
+ 0x9a, 0xa9, 0x62, 0xf9, 0x5d, 0x26, 0xbd, 0x82,
+ 0xa2, 0x78, 0xf2, 0xb8, 0x23, 0x20, 0x21, 0x8f,
+ 0x6b, 0x3b, 0xd6, 0xf7, 0xf6, 0x67, 0xa6, 0xd9 },
+ { 0x1b, 0x61, 0x8f, 0xba, 0xa5, 0x66, 0xb3, 0xd4,
+ 0x98, 0xc1, 0x2e, 0x98, 0x2c, 0x9e, 0xc5, 0x2e,
+ 0x4d, 0xa8, 0x5a, 0x8c, 0x54, 0xf3, 0x8f, 0x34,
+ 0xc0, 0x90, 0x39, 0x4f, 0x23, 0xc1, 0x84, 0xc1 },
+ { 0x0c, 0x75, 0x8f, 0xb5, 0x69, 0x2f, 0xfd, 0x41,
+ 0xa3, 0x57, 0x5d, 0x0a, 0xf0, 0x0c, 0xc7, 0xfb,
+ 0xf2, 0xcb, 0xe5, 0x90, 0x5a, 0x58, 0x32, 0x3a,
+ 0x88, 0xae, 0x42, 0x44, 0xf6, 0xe4, 0xc9, 0x93 },
+ { 0xa9, 0x31, 0x36, 0x0c, 0xad, 0x62, 0x8c, 0x7f,
+ 0x12, 0xa6, 0xc1, 0xc4, 0xb7, 0x53, 0xb0, 0xf4,
+ 0x06, 0x2a, 0xef, 0x3c, 0xe6, 0x5a, 0x1a, 0xe3,
+ 0xf1, 0x93, 0x69, 0xda, 0xdf, 0x3a, 0xe2, 0x3d },
+ { 0xcb, 0xac, 0x7d, 0x77, 0x3b, 0x1e, 0x3b, 0x3c,
+ 0x66, 0x91, 0xd7, 0xab, 0xb7, 0xe9, 0xdf, 0x04,
+ 0x5c, 0x8b, 0xa1, 0x92, 0x68, 0xde, 0xd1, 0x53,
+ 0x20, 0x7f, 0x5e, 0x80, 0x43, 0x52, 0xec, 0x5d },
+ { 0x23, 0xa1, 0x96, 0xd3, 0x80, 0x2e, 0xd3, 0xc1,
+ 0xb3, 0x84, 0x01, 0x9a, 0x82, 0x32, 0x58, 0x40,
+ 0xd3, 0x2f, 0x71, 0x95, 0x0c, 0x45, 0x80, 0xb0,
+ 0x34, 0x45, 0xe0, 0x89, 0x8e, 0x14, 0x05, 0x3c },
+ { 0xf4, 0x49, 0x54, 0x70, 0xf2, 0x26, 0xc8, 0xc2,
+ 0x14, 0xbe, 0x08, 0xfd, 0xfa, 0xd4, 0xbc, 0x4a,
+ 0x2a, 0x9d, 0xbe, 0xa9, 0x13, 0x6a, 0x21, 0x0d,
+ 0xf0, 0xd4, 0xb6, 0x49, 0x29, 0xe6, 0xfc, 0x14 },
+ { 0xe2, 0x90, 0xdd, 0x27, 0x0b, 0x46, 0x7f, 0x34,
+ 0xab, 0x1c, 0x00, 0x2d, 0x34, 0x0f, 0xa0, 0x16,
+ 0x25, 0x7f, 0xf1, 0x9e, 0x58, 0x33, 0xfd, 0xbb,
+ 0xf2, 0xcb, 0x40, 0x1c, 0x3b, 0x28, 0x17, 0xde },
+ { 0x9f, 0xc7, 0xb5, 0xde, 0xd3, 0xc1, 0x50, 0x42,
+ 0xb2, 0xa6, 0x58, 0x2d, 0xc3, 0x9b, 0xe0, 0x16,
+ 0xd2, 0x4a, 0x68, 0x2d, 0x5e, 0x61, 0xad, 0x1e,
+ 0xff, 0x9c, 0x63, 0x30, 0x98, 0x48, 0xf7, 0x06 },
+ { 0x8c, 0xca, 0x67, 0xa3, 0x6d, 0x17, 0xd5, 0xe6,
+ 0x34, 0x1c, 0xb5, 0x92, 0xfd, 0x7b, 0xef, 0x99,
+ 0x26, 0xc9, 0xe3, 0xaa, 0x10, 0x27, 0xea, 0x11,
+ 0xa7, 0xd8, 0xbd, 0x26, 0x0b, 0x57, 0x6e, 0x04 },
+ { 0x40, 0x93, 0x92, 0xf5, 0x60, 0xf8, 0x68, 0x31,
+ 0xda, 0x43, 0x73, 0xee, 0x5e, 0x00, 0x74, 0x26,
+ 0x05, 0x95, 0xd7, 0xbc, 0x24, 0x18, 0x3b, 0x60,
+ 0xed, 0x70, 0x0d, 0x45, 0x83, 0xd3, 0xf6, 0xf0 },
+ { 0x28, 0x02, 0x16, 0x5d, 0xe0, 0x90, 0x91, 0x55,
+ 0x46, 0xf3, 0x39, 0x8c, 0xd8, 0x49, 0x16, 0x4a,
+ 0x19, 0xf9, 0x2a, 0xdb, 0xc3, 0x61, 0xad, 0xc9,
+ 0x9b, 0x0f, 0x20, 0xc8, 0xea, 0x07, 0x10, 0x54 },
+ { 0xad, 0x83, 0x91, 0x68, 0xd9, 0xf8, 0xa4, 0xbe,
+ 0x95, 0xba, 0x9e, 0xf9, 0xa6, 0x92, 0xf0, 0x72,
+ 0x56, 0xae, 0x43, 0xfe, 0x6f, 0x98, 0x64, 0xe2,
+ 0x90, 0x69, 0x1b, 0x02, 0x56, 0xce, 0x50, 0xa9 },
+ { 0x75, 0xfd, 0xaa, 0x50, 0x38, 0xc2, 0x84, 0xb8,
+ 0x6d, 0x6e, 0x8a, 0xff, 0xe8, 0xb2, 0x80, 0x7e,
+ 0x46, 0x7b, 0x86, 0x60, 0x0e, 0x79, 0xaf, 0x36,
+ 0x89, 0xfb, 0xc0, 0x63, 0x28, 0xcb, 0xf8, 0x94 },
+ { 0xe5, 0x7c, 0xb7, 0x94, 0x87, 0xdd, 0x57, 0x90,
+ 0x24, 0x32, 0xb2, 0x50, 0x73, 0x38, 0x13, 0xbd,
+ 0x96, 0xa8, 0x4e, 0xfc, 0xe5, 0x9f, 0x65, 0x0f,
+ 0xac, 0x26, 0xe6, 0x69, 0x6a, 0xef, 0xaf, 0xc3 },
+ { 0x56, 0xf3, 0x4e, 0x8b, 0x96, 0x55, 0x7e, 0x90,
+ 0xc1, 0xf2, 0x4b, 0x52, 0xd0, 0xc8, 0x9d, 0x51,
+ 0x08, 0x6a, 0xcf, 0x1b, 0x00, 0xf6, 0x34, 0xcf,
+ 0x1d, 0xde, 0x92, 0x33, 0xb8, 0xea, 0xaa, 0x3e },
+ { 0x1b, 0x53, 0xee, 0x94, 0xaa, 0xf3, 0x4e, 0x4b,
+ 0x15, 0x9d, 0x48, 0xde, 0x35, 0x2c, 0x7f, 0x06,
+ 0x61, 0xd0, 0xa4, 0x0e, 0xdf, 0xf9, 0x5a, 0x0b,
+ 0x16, 0x39, 0xb4, 0x09, 0x0e, 0x97, 0x44, 0x72 },
+ { 0x05, 0x70, 0x5e, 0x2a, 0x81, 0x75, 0x7c, 0x14,
+ 0xbd, 0x38, 0x3e, 0xa9, 0x8d, 0xda, 0x54, 0x4e,
+ 0xb1, 0x0e, 0x6b, 0xc0, 0x7b, 0xae, 0x43, 0x5e,
+ 0x25, 0x18, 0xdb, 0xe1, 0x33, 0x52, 0x53, 0x75 },
+ { 0xd8, 0xb2, 0x86, 0x6e, 0x8a, 0x30, 0x9d, 0xb5,
+ 0x3e, 0x52, 0x9e, 0xc3, 0x29, 0x11, 0xd8, 0x2f,
+ 0x5c, 0xa1, 0x6c, 0xff, 0x76, 0x21, 0x68, 0x91,
+ 0xa9, 0x67, 0x6a, 0xa3, 0x1a, 0xaa, 0x6c, 0x42 },
+ { 0xf5, 0x04, 0x1c, 0x24, 0x12, 0x70, 0xeb, 0x04,
+ 0xc7, 0x1e, 0xc2, 0xc9, 0x5d, 0x4c, 0x38, 0xd8,
+ 0x03, 0xb1, 0x23, 0x7b, 0x0f, 0x29, 0xfd, 0x4d,
+ 0xb3, 0xeb, 0x39, 0x76, 0x69, 0xe8, 0x86, 0x99 },
+ { 0x9a, 0x4c, 0xe0, 0x77, 0xc3, 0x49, 0x32, 0x2f,
+ 0x59, 0x5e, 0x0e, 0xe7, 0x9e, 0xd0, 0xda, 0x5f,
+ 0xab, 0x66, 0x75, 0x2c, 0xbf, 0xef, 0x8f, 0x87,
+ 0xd0, 0xe9, 0xd0, 0x72, 0x3c, 0x75, 0x30, 0xdd },
+ { 0x65, 0x7b, 0x09, 0xf3, 0xd0, 0xf5, 0x2b, 0x5b,
+ 0x8f, 0x2f, 0x97, 0x16, 0x3a, 0x0e, 0xdf, 0x0c,
+ 0x04, 0xf0, 0x75, 0x40, 0x8a, 0x07, 0xbb, 0xeb,
+ 0x3a, 0x41, 0x01, 0xa8, 0x91, 0x99, 0x0d, 0x62 },
+ { 0x1e, 0x3f, 0x7b, 0xd5, 0xa5, 0x8f, 0xa5, 0x33,
+ 0x34, 0x4a, 0xa8, 0xed, 0x3a, 0xc1, 0x22, 0xbb,
+ 0x9e, 0x70, 0xd4, 0xef, 0x50, 0xd0, 0x04, 0x53,
+ 0x08, 0x21, 0x94, 0x8f, 0x5f, 0xe6, 0x31, 0x5a },
+ { 0x80, 0xdc, 0xcf, 0x3f, 0xd8, 0x3d, 0xfd, 0x0d,
+ 0x35, 0xaa, 0x28, 0x58, 0x59, 0x22, 0xab, 0x89,
+ 0xd5, 0x31, 0x39, 0x97, 0x67, 0x3e, 0xaf, 0x90,
+ 0x5c, 0xea, 0x9c, 0x0b, 0x22, 0x5c, 0x7b, 0x5f },
+ { 0x8a, 0x0d, 0x0f, 0xbf, 0x63, 0x77, 0xd8, 0x3b,
+ 0xb0, 0x8b, 0x51, 0x4b, 0x4b, 0x1c, 0x43, 0xac,
+ 0xc9, 0x5d, 0x75, 0x17, 0x14, 0xf8, 0x92, 0x56,
+ 0x45, 0xcb, 0x6b, 0xc8, 0x56, 0xca, 0x15, 0x0a },
+ { 0x9f, 0xa5, 0xb4, 0x87, 0x73, 0x8a, 0xd2, 0x84,
+ 0x4c, 0xc6, 0x34, 0x8a, 0x90, 0x19, 0x18, 0xf6,
+ 0x59, 0xa3, 0xb8, 0x9e, 0x9c, 0x0d, 0xfe, 0xea,
+ 0xd3, 0x0d, 0xd9, 0x4b, 0xcf, 0x42, 0xef, 0x8e },
+ { 0x80, 0x83, 0x2c, 0x4a, 0x16, 0x77, 0xf5, 0xea,
+ 0x25, 0x60, 0xf6, 0x68, 0xe9, 0x35, 0x4d, 0xd3,
+ 0x69, 0x97, 0xf0, 0x37, 0x28, 0xcf, 0xa5, 0x5e,
+ 0x1b, 0x38, 0x33, 0x7c, 0x0c, 0x9e, 0xf8, 0x18 },
+ { 0xab, 0x37, 0xdd, 0xb6, 0x83, 0x13, 0x7e, 0x74,
+ 0x08, 0x0d, 0x02, 0x6b, 0x59, 0x0b, 0x96, 0xae,
+ 0x9b, 0xb4, 0x47, 0x72, 0x2f, 0x30, 0x5a, 0x5a,
+ 0xc5, 0x70, 0xec, 0x1d, 0xf9, 0xb1, 0x74, 0x3c },
+ { 0x3e, 0xe7, 0x35, 0xa6, 0x94, 0xc2, 0x55, 0x9b,
+ 0x69, 0x3a, 0xa6, 0x86, 0x29, 0x36, 0x1e, 0x15,
+ 0xd1, 0x22, 0x65, 0xad, 0x6a, 0x3d, 0xed, 0xf4,
+ 0x88, 0xb0, 0xb0, 0x0f, 0xac, 0x97, 0x54, 0xba },
+ { 0xd6, 0xfc, 0xd2, 0x32, 0x19, 0xb6, 0x47, 0xe4,
+ 0xcb, 0xd5, 0xeb, 0x2d, 0x0a, 0xd0, 0x1e, 0xc8,
+ 0x83, 0x8a, 0x4b, 0x29, 0x01, 0xfc, 0x32, 0x5c,
+ 0xc3, 0x70, 0x19, 0x81, 0xca, 0x6c, 0x88, 0x8b },
+ { 0x05, 0x20, 0xec, 0x2f, 0x5b, 0xf7, 0xa7, 0x55,
+ 0xda, 0xcb, 0x50, 0xc6, 0xbf, 0x23, 0x3e, 0x35,
+ 0x15, 0x43, 0x47, 0x63, 0xdb, 0x01, 0x39, 0xcc,
+ 0xd9, 0xfa, 0xef, 0xbb, 0x82, 0x07, 0x61, 0x2d },
+ { 0xaf, 0xf3, 0xb7, 0x5f, 0x3f, 0x58, 0x12, 0x64,
+ 0xd7, 0x66, 0x16, 0x62, 0xb9, 0x2f, 0x5a, 0xd3,
+ 0x7c, 0x1d, 0x32, 0xbd, 0x45, 0xff, 0x81, 0xa4,
+ 0xed, 0x8a, 0xdc, 0x9e, 0xf3, 0x0d, 0xd9, 0x89 },
+ { 0xd0, 0xdd, 0x65, 0x0b, 0xef, 0xd3, 0xba, 0x63,
+ 0xdc, 0x25, 0x10, 0x2c, 0x62, 0x7c, 0x92, 0x1b,
+ 0x9c, 0xbe, 0xb0, 0xb1, 0x30, 0x68, 0x69, 0x35,
+ 0xb5, 0xc9, 0x27, 0xcb, 0x7c, 0xcd, 0x5e, 0x3b },
+ { 0xe1, 0x14, 0x98, 0x16, 0xb1, 0x0a, 0x85, 0x14,
+ 0xfb, 0x3e, 0x2c, 0xab, 0x2c, 0x08, 0xbe, 0xe9,
+ 0xf7, 0x3c, 0xe7, 0x62, 0x21, 0x70, 0x12, 0x46,
+ 0xa5, 0x89, 0xbb, 0xb6, 0x73, 0x02, 0xd8, 0xa9 },
+ { 0x7d, 0xa3, 0xf4, 0x41, 0xde, 0x90, 0x54, 0x31,
+ 0x7e, 0x72, 0xb5, 0xdb, 0xf9, 0x79, 0xda, 0x01,
+ 0xe6, 0xbc, 0xee, 0xbb, 0x84, 0x78, 0xea, 0xe6,
+ 0xa2, 0x28, 0x49, 0xd9, 0x02, 0x92, 0x63, 0x5c },
+ { 0x12, 0x30, 0xb1, 0xfc, 0x8a, 0x7d, 0x92, 0x15,
+ 0xed, 0xc2, 0xd4, 0xa2, 0xde, 0xcb, 0xdd, 0x0a,
+ 0x6e, 0x21, 0x6c, 0x92, 0x42, 0x78, 0xc9, 0x1f,
+ 0xc5, 0xd1, 0x0e, 0x7d, 0x60, 0x19, 0x2d, 0x94 },
+ { 0x57, 0x50, 0xd7, 0x16, 0xb4, 0x80, 0x8f, 0x75,
+ 0x1f, 0xeb, 0xc3, 0x88, 0x06, 0xba, 0x17, 0x0b,
+ 0xf6, 0xd5, 0x19, 0x9a, 0x78, 0x16, 0xbe, 0x51,
+ 0x4e, 0x3f, 0x93, 0x2f, 0xbe, 0x0c, 0xb8, 0x71 },
+ { 0x6f, 0xc5, 0x9b, 0x2f, 0x10, 0xfe, 0xba, 0x95,
+ 0x4a, 0xa6, 0x82, 0x0b, 0x3c, 0xa9, 0x87, 0xee,
+ 0x81, 0xd5, 0xcc, 0x1d, 0xa3, 0xc6, 0x3c, 0xe8,
+ 0x27, 0x30, 0x1c, 0x56, 0x9d, 0xfb, 0x39, 0xce },
+ { 0xc7, 0xc3, 0xfe, 0x1e, 0xeb, 0xdc, 0x7b, 0x5a,
+ 0x93, 0x93, 0x26, 0xe8, 0xdd, 0xb8, 0x3e, 0x8b,
+ 0xf2, 0xb7, 0x80, 0xb6, 0x56, 0x78, 0xcb, 0x62,
+ 0xf2, 0x08, 0xb0, 0x40, 0xab, 0xdd, 0x35, 0xe2 },
+ { 0x0c, 0x75, 0xc1, 0xa1, 0x5c, 0xf3, 0x4a, 0x31,
+ 0x4e, 0xe4, 0x78, 0xf4, 0xa5, 0xce, 0x0b, 0x8a,
+ 0x6b, 0x36, 0x52, 0x8e, 0xf7, 0xa8, 0x20, 0x69,
+ 0x6c, 0x3e, 0x42, 0x46, 0xc5, 0xa1, 0x58, 0x64 },
+ { 0x21, 0x6d, 0xc1, 0x2a, 0x10, 0x85, 0x69, 0xa3,
+ 0xc7, 0xcd, 0xde, 0x4a, 0xed, 0x43, 0xa6, 0xc3,
+ 0x30, 0x13, 0x9d, 0xda, 0x3c, 0xcc, 0x4a, 0x10,
+ 0x89, 0x05, 0xdb, 0x38, 0x61, 0x89, 0x90, 0x50 },
+ { 0xa5, 0x7b, 0xe6, 0xae, 0x67, 0x56, 0xf2, 0x8b,
+ 0x02, 0xf5, 0x9d, 0xad, 0xf7, 0xe0, 0xd7, 0xd8,
+ 0x80, 0x7f, 0x10, 0xfa, 0x15, 0xce, 0xd1, 0xad,
+ 0x35, 0x85, 0x52, 0x1a, 0x1d, 0x99, 0x5a, 0x89 },
+ { 0x81, 0x6a, 0xef, 0x87, 0x59, 0x53, 0x71, 0x6c,
+ 0xd7, 0xa5, 0x81, 0xf7, 0x32, 0xf5, 0x3d, 0xd4,
+ 0x35, 0xda, 0xb6, 0x6d, 0x09, 0xc3, 0x61, 0xd2,
+ 0xd6, 0x59, 0x2d, 0xe1, 0x77, 0x55, 0xd8, 0xa8 },
+ { 0x9a, 0x76, 0x89, 0x32, 0x26, 0x69, 0x3b, 0x6e,
+ 0xa9, 0x7e, 0x6a, 0x73, 0x8f, 0x9d, 0x10, 0xfb,
+ 0x3d, 0x0b, 0x43, 0xae, 0x0e, 0x8b, 0x7d, 0x81,
+ 0x23, 0xea, 0x76, 0xce, 0x97, 0x98, 0x9c, 0x7e },
+ { 0x8d, 0xae, 0xdb, 0x9a, 0x27, 0x15, 0x29, 0xdb,
+ 0xb7, 0xdc, 0x3b, 0x60, 0x7f, 0xe5, 0xeb, 0x2d,
+ 0x32, 0x11, 0x77, 0x07, 0x58, 0xdd, 0x3b, 0x0a,
+ 0x35, 0x93, 0xd2, 0xd7, 0x95, 0x4e, 0x2d, 0x5b },
+ { 0x16, 0xdb, 0xc0, 0xaa, 0x5d, 0xd2, 0xc7, 0x74,
+ 0xf5, 0x05, 0x10, 0x0f, 0x73, 0x37, 0x86, 0xd8,
+ 0xa1, 0x75, 0xfc, 0xbb, 0xb5, 0x9c, 0x43, 0xe1,
+ 0xfb, 0xff, 0x3e, 0x1e, 0xaf, 0x31, 0xcb, 0x4a },
+ { 0x86, 0x06, 0xcb, 0x89, 0x9c, 0x6a, 0xea, 0xf5,
+ 0x1b, 0x9d, 0xb0, 0xfe, 0x49, 0x24, 0xa9, 0xfd,
+ 0x5d, 0xab, 0xc1, 0x9f, 0x88, 0x26, 0xf2, 0xbc,
+ 0x1c, 0x1d, 0x7d, 0xa1, 0x4d, 0x2c, 0x2c, 0x99 },
+ { 0x84, 0x79, 0x73, 0x1a, 0xed, 0xa5, 0x7b, 0xd3,
+ 0x7e, 0xad, 0xb5, 0x1a, 0x50, 0x7e, 0x30, 0x7f,
+ 0x3b, 0xd9, 0x5e, 0x69, 0xdb, 0xca, 0x94, 0xf3,
+ 0xbc, 0x21, 0x72, 0x60, 0x66, 0xad, 0x6d, 0xfd },
+ { 0x58, 0x47, 0x3a, 0x9e, 0xa8, 0x2e, 0xfa, 0x3f,
+ 0x3b, 0x3d, 0x8f, 0xc8, 0x3e, 0xd8, 0x86, 0x31,
+ 0x27, 0xb3, 0x3a, 0xe8, 0xde, 0xae, 0x63, 0x07,
+ 0x20, 0x1e, 0xdb, 0x6d, 0xde, 0x61, 0xde, 0x29 },
+ { 0x9a, 0x92, 0x55, 0xd5, 0x3a, 0xf1, 0x16, 0xde,
+ 0x8b, 0xa2, 0x7c, 0xe3, 0x5b, 0x4c, 0x7e, 0x15,
+ 0x64, 0x06, 0x57, 0xa0, 0xfc, 0xb8, 0x88, 0xc7,
+ 0x0d, 0x95, 0x43, 0x1d, 0xac, 0xd8, 0xf8, 0x30 },
+ { 0x9e, 0xb0, 0x5f, 0xfb, 0xa3, 0x9f, 0xd8, 0x59,
+ 0x6a, 0x45, 0x49, 0x3e, 0x18, 0xd2, 0x51, 0x0b,
+ 0xf3, 0xef, 0x06, 0x5c, 0x51, 0xd6, 0xe1, 0x3a,
+ 0xbe, 0x66, 0xaa, 0x57, 0xe0, 0x5c, 0xfd, 0xb7 },
+ { 0x81, 0xdc, 0xc3, 0xa5, 0x05, 0xea, 0xce, 0x3f,
+ 0x87, 0x9d, 0x8f, 0x70, 0x27, 0x76, 0x77, 0x0f,
+ 0x9d, 0xf5, 0x0e, 0x52, 0x1d, 0x14, 0x28, 0xa8,
+ 0x5d, 0xaf, 0x04, 0xf9, 0xad, 0x21, 0x50, 0xe0 },
+ { 0xe3, 0xe3, 0xc4, 0xaa, 0x3a, 0xcb, 0xbc, 0x85,
+ 0x33, 0x2a, 0xf9, 0xd5, 0x64, 0xbc, 0x24, 0x16,
+ 0x5e, 0x16, 0x87, 0xf6, 0xb1, 0xad, 0xcb, 0xfa,
+ 0xe7, 0x7a, 0x8f, 0x03, 0xc7, 0x2a, 0xc2, 0x8c },
+ { 0x67, 0x46, 0xc8, 0x0b, 0x4e, 0xb5, 0x6a, 0xea,
+ 0x45, 0xe6, 0x4e, 0x72, 0x89, 0xbb, 0xa3, 0xed,
+ 0xbf, 0x45, 0xec, 0xf8, 0x20, 0x64, 0x81, 0xff,
+ 0x63, 0x02, 0x12, 0x29, 0x84, 0xcd, 0x52, 0x6a },
+ { 0x2b, 0x62, 0x8e, 0x52, 0x76, 0x4d, 0x7d, 0x62,
+ 0xc0, 0x86, 0x8b, 0x21, 0x23, 0x57, 0xcd, 0xd1,
+ 0x2d, 0x91, 0x49, 0x82, 0x2f, 0x4e, 0x98, 0x45,
+ 0xd9, 0x18, 0xa0, 0x8d, 0x1a, 0xe9, 0x90, 0xc0 },
+ { 0xe4, 0xbf, 0xe8, 0x0d, 0x58, 0xc9, 0x19, 0x94,
+ 0x61, 0x39, 0x09, 0xdc, 0x4b, 0x1a, 0x12, 0x49,
+ 0x68, 0x96, 0xc0, 0x04, 0xaf, 0x7b, 0x57, 0x01,
+ 0x48, 0x3d, 0xe4, 0x5d, 0x28, 0x23, 0xd7, 0x8e },
+ { 0xeb, 0xb4, 0xba, 0x15, 0x0c, 0xef, 0x27, 0x34,
+ 0x34, 0x5b, 0x5d, 0x64, 0x1b, 0xbe, 0xd0, 0x3a,
+ 0x21, 0xea, 0xfa, 0xe9, 0x33, 0xc9, 0x9e, 0x00,
+ 0x92, 0x12, 0xef, 0x04, 0x57, 0x4a, 0x85, 0x30 },
+ { 0x39, 0x66, 0xec, 0x73, 0xb1, 0x54, 0xac, 0xc6,
+ 0x97, 0xac, 0x5c, 0xf5, 0xb2, 0x4b, 0x40, 0xbd,
+ 0xb0, 0xdb, 0x9e, 0x39, 0x88, 0x36, 0xd7, 0x6d,
+ 0x4b, 0x88, 0x0e, 0x3b, 0x2a, 0xf1, 0xaa, 0x27 },
+ { 0xef, 0x7e, 0x48, 0x31, 0xb3, 0xa8, 0x46, 0x36,
+ 0x51, 0x8d, 0x6e, 0x4b, 0xfc, 0xe6, 0x4a, 0x43,
+ 0xdb, 0x2a, 0x5d, 0xda, 0x9c, 0xca, 0x2b, 0x44,
+ 0xf3, 0x90, 0x33, 0xbd, 0xc4, 0x0d, 0x62, 0x43 },
+ { 0x7a, 0xbf, 0x6a, 0xcf, 0x5c, 0x8e, 0x54, 0x9d,
+ 0xdb, 0xb1, 0x5a, 0xe8, 0xd8, 0xb3, 0x88, 0xc1,
+ 0xc1, 0x97, 0xe6, 0x98, 0x73, 0x7c, 0x97, 0x85,
+ 0x50, 0x1e, 0xd1, 0xf9, 0x49, 0x30, 0xb7, 0xd9 },
+ { 0x88, 0x01, 0x8d, 0xed, 0x66, 0x81, 0x3f, 0x0c,
+ 0xa9, 0x5d, 0xef, 0x47, 0x4c, 0x63, 0x06, 0x92,
+ 0x01, 0x99, 0x67, 0xb9, 0xe3, 0x68, 0x88, 0xda,
+ 0xdd, 0x94, 0x12, 0x47, 0x19, 0xb6, 0x82, 0xf6 },
+ { 0x39, 0x30, 0x87, 0x6b, 0x9f, 0xc7, 0x52, 0x90,
+ 0x36, 0xb0, 0x08, 0xb1, 0xb8, 0xbb, 0x99, 0x75,
+ 0x22, 0xa4, 0x41, 0x63, 0x5a, 0x0c, 0x25, 0xec,
+ 0x02, 0xfb, 0x6d, 0x90, 0x26, 0xe5, 0x5a, 0x97 },
+ { 0x0a, 0x40, 0x49, 0xd5, 0x7e, 0x83, 0x3b, 0x56,
+ 0x95, 0xfa, 0xc9, 0x3d, 0xd1, 0xfb, 0xef, 0x31,
+ 0x66, 0xb4, 0x4b, 0x12, 0xad, 0x11, 0x24, 0x86,
+ 0x62, 0x38, 0x3a, 0xe0, 0x51, 0xe1, 0x58, 0x27 },
+ { 0x81, 0xdc, 0xc0, 0x67, 0x8b, 0xb6, 0xa7, 0x65,
+ 0xe4, 0x8c, 0x32, 0x09, 0x65, 0x4f, 0xe9, 0x00,
+ 0x89, 0xce, 0x44, 0xff, 0x56, 0x18, 0x47, 0x7e,
+ 0x39, 0xab, 0x28, 0x64, 0x76, 0xdf, 0x05, 0x2b },
+ { 0xe6, 0x9b, 0x3a, 0x36, 0xa4, 0x46, 0x19, 0x12,
+ 0xdc, 0x08, 0x34, 0x6b, 0x11, 0xdd, 0xcb, 0x9d,
+ 0xb7, 0x96, 0xf8, 0x85, 0xfd, 0x01, 0x93, 0x6e,
+ 0x66, 0x2f, 0xe2, 0x92, 0x97, 0xb0, 0x99, 0xa4 },
+ { 0x5a, 0xc6, 0x50, 0x3b, 0x0d, 0x8d, 0xa6, 0x91,
+ 0x76, 0x46, 0xe6, 0xdc, 0xc8, 0x7e, 0xdc, 0x58,
+ 0xe9, 0x42, 0x45, 0x32, 0x4c, 0xc2, 0x04, 0xf4,
+ 0xdd, 0x4a, 0xf0, 0x15, 0x63, 0xac, 0xd4, 0x27 },
+ { 0xdf, 0x6d, 0xda, 0x21, 0x35, 0x9a, 0x30, 0xbc,
+ 0x27, 0x17, 0x80, 0x97, 0x1c, 0x1a, 0xbd, 0x56,
+ 0xa6, 0xef, 0x16, 0x7e, 0x48, 0x08, 0x87, 0x88,
+ 0x8e, 0x73, 0xa8, 0x6d, 0x3b, 0xf6, 0x05, 0xe9 },
+ { 0xe8, 0xe6, 0xe4, 0x70, 0x71, 0xe7, 0xb7, 0xdf,
+ 0x25, 0x80, 0xf2, 0x25, 0xcf, 0xbb, 0xed, 0xf8,
+ 0x4c, 0xe6, 0x77, 0x46, 0x62, 0x66, 0x28, 0xd3,
+ 0x30, 0x97, 0xe4, 0xb7, 0xdc, 0x57, 0x11, 0x07 },
+ { 0x53, 0xe4, 0x0e, 0xad, 0x62, 0x05, 0x1e, 0x19,
+ 0xcb, 0x9b, 0xa8, 0x13, 0x3e, 0x3e, 0x5c, 0x1c,
+ 0xe0, 0x0d, 0xdc, 0xad, 0x8a, 0xcf, 0x34, 0x2a,
+ 0x22, 0x43, 0x60, 0xb0, 0xac, 0xc1, 0x47, 0x77 },
+ { 0x9c, 0xcd, 0x53, 0xfe, 0x80, 0xbe, 0x78, 0x6a,
+ 0xa9, 0x84, 0x63, 0x84, 0x62, 0xfb, 0x28, 0xaf,
+ 0xdf, 0x12, 0x2b, 0x34, 0xd7, 0x8f, 0x46, 0x87,
+ 0xec, 0x63, 0x2b, 0xb1, 0x9d, 0xe2, 0x37, 0x1a },
+ { 0xcb, 0xd4, 0x80, 0x52, 0xc4, 0x8d, 0x78, 0x84,
+ 0x66, 0xa3, 0xe8, 0x11, 0x8c, 0x56, 0xc9, 0x7f,
+ 0xe1, 0x46, 0xe5, 0x54, 0x6f, 0xaa, 0xf9, 0x3e,
+ 0x2b, 0xc3, 0xc4, 0x7e, 0x45, 0x93, 0x97, 0x53 },
+ { 0x25, 0x68, 0x83, 0xb1, 0x4e, 0x2a, 0xf4, 0x4d,
+ 0xad, 0xb2, 0x8e, 0x1b, 0x34, 0xb2, 0xac, 0x0f,
+ 0x0f, 0x4c, 0x91, 0xc3, 0x4e, 0xc9, 0x16, 0x9e,
+ 0x29, 0x03, 0x61, 0x58, 0xac, 0xaa, 0x95, 0xb9 },
+ { 0x44, 0x71, 0xb9, 0x1a, 0xb4, 0x2d, 0xb7, 0xc4,
+ 0xdd, 0x84, 0x90, 0xab, 0x95, 0xa2, 0xee, 0x8d,
+ 0x04, 0xe3, 0xef, 0x5c, 0x3d, 0x6f, 0xc7, 0x1a,
+ 0xc7, 0x4b, 0x2b, 0x26, 0x91, 0x4d, 0x16, 0x41 },
+ { 0xa5, 0xeb, 0x08, 0x03, 0x8f, 0x8f, 0x11, 0x55,
+ 0xed, 0x86, 0xe6, 0x31, 0x90, 0x6f, 0xc1, 0x30,
+ 0x95, 0xf6, 0xbb, 0xa4, 0x1d, 0xe5, 0xd4, 0xe7,
+ 0x95, 0x75, 0x8e, 0xc8, 0xc8, 0xdf, 0x8a, 0xf1 },
+ { 0xdc, 0x1d, 0xb6, 0x4e, 0xd8, 0xb4, 0x8a, 0x91,
+ 0x0e, 0x06, 0x0a, 0x6b, 0x86, 0x63, 0x74, 0xc5,
+ 0x78, 0x78, 0x4e, 0x9a, 0xc4, 0x9a, 0xb2, 0x77,
+ 0x40, 0x92, 0xac, 0x71, 0x50, 0x19, 0x34, 0xac },
+ { 0x28, 0x54, 0x13, 0xb2, 0xf2, 0xee, 0x87, 0x3d,
+ 0x34, 0x31, 0x9e, 0xe0, 0xbb, 0xfb, 0xb9, 0x0f,
+ 0x32, 0xda, 0x43, 0x4c, 0xc8, 0x7e, 0x3d, 0xb5,
+ 0xed, 0x12, 0x1b, 0xb3, 0x98, 0xed, 0x96, 0x4b },
+ { 0x02, 0x16, 0xe0, 0xf8, 0x1f, 0x75, 0x0f, 0x26,
+ 0xf1, 0x99, 0x8b, 0xc3, 0x93, 0x4e, 0x3e, 0x12,
+ 0x4c, 0x99, 0x45, 0xe6, 0x85, 0xa6, 0x0b, 0x25,
+ 0xe8, 0xfb, 0xd9, 0x62, 0x5a, 0xb6, 0xb5, 0x99 },
+ { 0x38, 0xc4, 0x10, 0xf5, 0xb9, 0xd4, 0x07, 0x20,
+ 0x50, 0x75, 0x5b, 0x31, 0xdc, 0xa8, 0x9f, 0xd5,
+ 0x39, 0x5c, 0x67, 0x85, 0xee, 0xb3, 0xd7, 0x90,
+ 0xf3, 0x20, 0xff, 0x94, 0x1c, 0x5a, 0x93, 0xbf },
+ { 0xf1, 0x84, 0x17, 0xb3, 0x9d, 0x61, 0x7a, 0xb1,
+ 0xc1, 0x8f, 0xdf, 0x91, 0xeb, 0xd0, 0xfc, 0x6d,
+ 0x55, 0x16, 0xbb, 0x34, 0xcf, 0x39, 0x36, 0x40,
+ 0x37, 0xbc, 0xe8, 0x1f, 0xa0, 0x4c, 0xec, 0xb1 },
+ { 0x1f, 0xa8, 0x77, 0xde, 0x67, 0x25, 0x9d, 0x19,
+ 0x86, 0x3a, 0x2a, 0x34, 0xbc, 0xc6, 0x96, 0x2a,
+ 0x2b, 0x25, 0xfc, 0xbf, 0x5c, 0xbe, 0xcd, 0x7e,
+ 0xde, 0x8f, 0x1f, 0xa3, 0x66, 0x88, 0xa7, 0x96 },
+ { 0x5b, 0xd1, 0x69, 0xe6, 0x7c, 0x82, 0xc2, 0xc2,
+ 0xe9, 0x8e, 0xf7, 0x00, 0x8b, 0xdf, 0x26, 0x1f,
+ 0x2d, 0xdf, 0x30, 0xb1, 0xc0, 0x0f, 0x9e, 0x7f,
+ 0x27, 0x5b, 0xb3, 0xe8, 0xa2, 0x8d, 0xc9, 0xa2 },
+ { 0xc8, 0x0a, 0xbe, 0xeb, 0xb6, 0x69, 0xad, 0x5d,
+ 0xee, 0xb5, 0xf5, 0xec, 0x8e, 0xa6, 0xb7, 0xa0,
+ 0x5d, 0xdf, 0x7d, 0x31, 0xec, 0x4c, 0x0a, 0x2e,
+ 0xe2, 0x0b, 0x0b, 0x98, 0xca, 0xec, 0x67, 0x46 },
+ { 0xe7, 0x6d, 0x3f, 0xbd, 0xa5, 0xba, 0x37, 0x4e,
+ 0x6b, 0xf8, 0xe5, 0x0f, 0xad, 0xc3, 0xbb, 0xb9,
+ 0xba, 0x5c, 0x20, 0x6e, 0xbd, 0xec, 0x89, 0xa3,
+ 0xa5, 0x4c, 0xf3, 0xdd, 0x84, 0xa0, 0x70, 0x16 },
+ { 0x7b, 0xba, 0x9d, 0xc5, 0xb5, 0xdb, 0x20, 0x71,
+ 0xd1, 0x77, 0x52, 0xb1, 0x04, 0x4c, 0x1e, 0xce,
+ 0xd9, 0x6a, 0xaf, 0x2d, 0xd4, 0x6e, 0x9b, 0x43,
+ 0x37, 0x50, 0xe8, 0xea, 0x0d, 0xcc, 0x18, 0x70 },
+ { 0xf2, 0x9b, 0x1b, 0x1a, 0xb9, 0xba, 0xb1, 0x63,
+ 0x01, 0x8e, 0xe3, 0xda, 0x15, 0x23, 0x2c, 0xca,
+ 0x78, 0xec, 0x52, 0xdb, 0xc3, 0x4e, 0xda, 0x5b,
+ 0x82, 0x2e, 0xc1, 0xd8, 0x0f, 0xc2, 0x1b, 0xd0 },
+ { 0x9e, 0xe3, 0xe3, 0xe7, 0xe9, 0x00, 0xf1, 0xe1,
+ 0x1d, 0x30, 0x8c, 0x4b, 0x2b, 0x30, 0x76, 0xd2,
+ 0x72, 0xcf, 0x70, 0x12, 0x4f, 0x9f, 0x51, 0xe1,
+ 0xda, 0x60, 0xf3, 0x78, 0x46, 0xcd, 0xd2, 0xf4 },
+ { 0x70, 0xea, 0x3b, 0x01, 0x76, 0x92, 0x7d, 0x90,
+ 0x96, 0xa1, 0x85, 0x08, 0xcd, 0x12, 0x3a, 0x29,
+ 0x03, 0x25, 0x92, 0x0a, 0x9d, 0x00, 0xa8, 0x9b,
+ 0x5d, 0xe0, 0x42, 0x73, 0xfb, 0xc7, 0x6b, 0x85 },
+ { 0x67, 0xde, 0x25, 0xc0, 0x2a, 0x4a, 0xab, 0xa2,
+ 0x3b, 0xdc, 0x97, 0x3c, 0x8b, 0xb0, 0xb5, 0x79,
+ 0x6d, 0x47, 0xcc, 0x06, 0x59, 0xd4, 0x3d, 0xff,
+ 0x1f, 0x97, 0xde, 0x17, 0x49, 0x63, 0xb6, 0x8e },
+ { 0xb2, 0x16, 0x8e, 0x4e, 0x0f, 0x18, 0xb0, 0xe6,
+ 0x41, 0x00, 0xb5, 0x17, 0xed, 0x95, 0x25, 0x7d,
+ 0x73, 0xf0, 0x62, 0x0d, 0xf8, 0x85, 0xc1, 0x3d,
+ 0x2e, 0xcf, 0x79, 0x36, 0x7b, 0x38, 0x4c, 0xee },
+ { 0x2e, 0x7d, 0xec, 0x24, 0x28, 0x85, 0x3b, 0x2c,
+ 0x71, 0x76, 0x07, 0x45, 0x54, 0x1f, 0x7a, 0xfe,
+ 0x98, 0x25, 0xb5, 0xdd, 0x77, 0xdf, 0x06, 0x51,
+ 0x1d, 0x84, 0x41, 0xa9, 0x4b, 0xac, 0xc9, 0x27 },
+ { 0xca, 0x9f, 0xfa, 0xc4, 0xc4, 0x3f, 0x0b, 0x48,
+ 0x46, 0x1d, 0xc5, 0xc2, 0x63, 0xbe, 0xa3, 0xf6,
+ 0xf0, 0x06, 0x11, 0xce, 0xac, 0xab, 0xf6, 0xf8,
+ 0x95, 0xba, 0x2b, 0x01, 0x01, 0xdb, 0xb6, 0x8d },
+ { 0x74, 0x10, 0xd4, 0x2d, 0x8f, 0xd1, 0xd5, 0xe9,
+ 0xd2, 0xf5, 0x81, 0x5c, 0xb9, 0x34, 0x17, 0x99,
+ 0x88, 0x28, 0xef, 0x3c, 0x42, 0x30, 0xbf, 0xbd,
+ 0x41, 0x2d, 0xf0, 0xa4, 0xa7, 0xa2, 0x50, 0x7a },
+ { 0x50, 0x10, 0xf6, 0x84, 0x51, 0x6d, 0xcc, 0xd0,
+ 0xb6, 0xee, 0x08, 0x52, 0xc2, 0x51, 0x2b, 0x4d,
+ 0xc0, 0x06, 0x6c, 0xf0, 0xd5, 0x6f, 0x35, 0x30,
+ 0x29, 0x78, 0xdb, 0x8a, 0xe3, 0x2c, 0x6a, 0x81 },
+ { 0xac, 0xaa, 0xb5, 0x85, 0xf7, 0xb7, 0x9b, 0x71,
+ 0x99, 0x35, 0xce, 0xb8, 0x95, 0x23, 0xdd, 0xc5,
+ 0x48, 0x27, 0xf7, 0x5c, 0x56, 0x88, 0x38, 0x56,
+ 0x15, 0x4a, 0x56, 0xcd, 0xcd, 0x5e, 0xe9, 0x88 },
+ { 0x66, 0x6d, 0xe5, 0xd1, 0x44, 0x0f, 0xee, 0x73,
+ 0x31, 0xaa, 0xf0, 0x12, 0x3a, 0x62, 0xef, 0x2d,
+ 0x8b, 0xa5, 0x74, 0x53, 0xa0, 0x76, 0x96, 0x35,
+ 0xac, 0x6c, 0xd0, 0x1e, 0x63, 0x3f, 0x77, 0x12 },
+ { 0xa6, 0xf9, 0x86, 0x58, 0xf6, 0xea, 0xba, 0xf9,
+ 0x02, 0xd8, 0xb3, 0x87, 0x1a, 0x4b, 0x10, 0x1d,
+ 0x16, 0x19, 0x6e, 0x8a, 0x4b, 0x24, 0x1e, 0x15,
+ 0x58, 0xfe, 0x29, 0x96, 0x6e, 0x10, 0x3e, 0x8d },
+ { 0x89, 0x15, 0x46, 0xa8, 0xb2, 0x9f, 0x30, 0x47,
+ 0xdd, 0xcf, 0xe5, 0xb0, 0x0e, 0x45, 0xfd, 0x55,
+ 0x75, 0x63, 0x73, 0x10, 0x5e, 0xa8, 0x63, 0x7d,
+ 0xfc, 0xff, 0x54, 0x7b, 0x6e, 0xa9, 0x53, 0x5f },
+ { 0x18, 0xdf, 0xbc, 0x1a, 0xc5, 0xd2, 0x5b, 0x07,
+ 0x61, 0x13, 0x7d, 0xbd, 0x22, 0xc1, 0x7c, 0x82,
+ 0x9d, 0x0f, 0x0e, 0xf1, 0xd8, 0x23, 0x44, 0xe9,
+ 0xc8, 0x9c, 0x28, 0x66, 0x94, 0xda, 0x24, 0xe8 },
+ { 0xb5, 0x4b, 0x9b, 0x67, 0xf8, 0xfe, 0xd5, 0x4b,
+ 0xbf, 0x5a, 0x26, 0x66, 0xdb, 0xdf, 0x4b, 0x23,
+ 0xcf, 0xf1, 0xd1, 0xb6, 0xf4, 0xaf, 0xc9, 0x85,
+ 0xb2, 0xe6, 0xd3, 0x30, 0x5a, 0x9f, 0xf8, 0x0f },
+ { 0x7d, 0xb4, 0x42, 0xe1, 0x32, 0xba, 0x59, 0xbc,
+ 0x12, 0x89, 0xaa, 0x98, 0xb0, 0xd3, 0xe8, 0x06,
+ 0x00, 0x4f, 0x8e, 0xc1, 0x28, 0x11, 0xaf, 0x1e,
+ 0x2e, 0x33, 0xc6, 0x9b, 0xfd, 0xe7, 0x29, 0xe1 },
+ { 0x25, 0x0f, 0x37, 0xcd, 0xc1, 0x5e, 0x81, 0x7d,
+ 0x2f, 0x16, 0x0d, 0x99, 0x56, 0xc7, 0x1f, 0xe3,
+ 0xeb, 0x5d, 0xb7, 0x45, 0x56, 0xe4, 0xad, 0xf9,
+ 0xa4, 0xff, 0xaf, 0xba, 0x74, 0x01, 0x03, 0x96 },
+ { 0x4a, 0xb8, 0xa3, 0xdd, 0x1d, 0xdf, 0x8a, 0xd4,
+ 0x3d, 0xab, 0x13, 0xa2, 0x7f, 0x66, 0xa6, 0x54,
+ 0x4f, 0x29, 0x05, 0x97, 0xfa, 0x96, 0x04, 0x0e,
+ 0x0e, 0x1d, 0xb9, 0x26, 0x3a, 0xa4, 0x79, 0xf8 },
+ { 0xee, 0x61, 0x72, 0x7a, 0x07, 0x66, 0xdf, 0x93,
+ 0x9c, 0xcd, 0xc8, 0x60, 0x33, 0x40, 0x44, 0xc7,
+ 0x9a, 0x3c, 0x9b, 0x15, 0x62, 0x00, 0xbc, 0x3a,
+ 0xa3, 0x29, 0x73, 0x48, 0x3d, 0x83, 0x41, 0xae },
+ { 0x3f, 0x68, 0xc7, 0xec, 0x63, 0xac, 0x11, 0xeb,
+ 0xb9, 0x8f, 0x94, 0xb3, 0x39, 0xb0, 0x5c, 0x10,
+ 0x49, 0x84, 0xfd, 0xa5, 0x01, 0x03, 0x06, 0x01,
+ 0x44, 0xe5, 0xa2, 0xbf, 0xcc, 0xc9, 0xda, 0x95 },
+ { 0x05, 0x6f, 0x29, 0x81, 0x6b, 0x8a, 0xf8, 0xf5,
+ 0x66, 0x82, 0xbc, 0x4d, 0x7c, 0xf0, 0x94, 0x11,
+ 0x1d, 0xa7, 0x73, 0x3e, 0x72, 0x6c, 0xd1, 0x3d,
+ 0x6b, 0x3e, 0x8e, 0xa0, 0x3e, 0x92, 0xa0, 0xd5 },
+ { 0xf5, 0xec, 0x43, 0xa2, 0x8a, 0xcb, 0xef, 0xf1,
+ 0xf3, 0x31, 0x8a, 0x5b, 0xca, 0xc7, 0xc6, 0x6d,
+ 0xdb, 0x52, 0x30, 0xb7, 0x9d, 0xb2, 0xd1, 0x05,
+ 0xbc, 0xbe, 0x15, 0xf3, 0xc1, 0x14, 0x8d, 0x69 },
+ { 0x2a, 0x69, 0x60, 0xad, 0x1d, 0x8d, 0xd5, 0x47,
+ 0x55, 0x5c, 0xfb, 0xd5, 0xe4, 0x60, 0x0f, 0x1e,
+ 0xaa, 0x1c, 0x8e, 0xda, 0x34, 0xde, 0x03, 0x74,
+ 0xec, 0x4a, 0x26, 0xea, 0xaa, 0xa3, 0x3b, 0x4e },
+ { 0xdc, 0xc1, 0xea, 0x7b, 0xaa, 0xb9, 0x33, 0x84,
+ 0xf7, 0x6b, 0x79, 0x68, 0x66, 0x19, 0x97, 0x54,
+ 0x74, 0x2f, 0x7b, 0x96, 0xd6, 0xb4, 0xc1, 0x20,
+ 0x16, 0x5c, 0x04, 0xa6, 0xc4, 0xf5, 0xce, 0x10 },
+ { 0x13, 0xd5, 0xdf, 0x17, 0x92, 0x21, 0x37, 0x9c,
+ 0x6a, 0x78, 0xc0, 0x7c, 0x79, 0x3f, 0xf5, 0x34,
+ 0x87, 0xca, 0xe6, 0xbf, 0x9f, 0xe8, 0x82, 0x54,
+ 0x1a, 0xb0, 0xe7, 0x35, 0xe3, 0xea, 0xda, 0x3b },
+ { 0x8c, 0x59, 0xe4, 0x40, 0x