From 4a0e319af86c0d38304535293f6fc32fe436ef1d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 28 Aug 2018 23:50:35 -0600 Subject: crypto: import zinc --- src/crypto/zinc/poly1305/poly1305-arm-glue.h | 69 + src/crypto/zinc/poly1305/poly1305-arm.S | 1117 +++++++++ src/crypto/zinc/poly1305/poly1305-arm64.S | 822 +++++++ src/crypto/zinc/poly1305/poly1305-mips-glue.h | 40 + src/crypto/zinc/poly1305/poly1305-mips.S | 417 ++++ src/crypto/zinc/poly1305/poly1305-mips64.S | 357 +++ src/crypto/zinc/poly1305/poly1305-x86_64-glue.h | 111 + src/crypto/zinc/poly1305/poly1305-x86_64.S | 2792 +++++++++++++++++++++++ src/crypto/zinc/poly1305/poly1305.c | 289 +++ 9 files changed, 6014 insertions(+) create mode 100644 src/crypto/zinc/poly1305/poly1305-arm-glue.h create mode 100644 src/crypto/zinc/poly1305/poly1305-arm.S create mode 100644 src/crypto/zinc/poly1305/poly1305-arm64.S create mode 100644 src/crypto/zinc/poly1305/poly1305-mips-glue.h create mode 100644 src/crypto/zinc/poly1305/poly1305-mips.S create mode 100644 src/crypto/zinc/poly1305/poly1305-mips64.S create mode 100644 src/crypto/zinc/poly1305/poly1305-x86_64-glue.h create mode 100644 src/crypto/zinc/poly1305/poly1305-x86_64.S create mode 100644 src/crypto/zinc/poly1305/poly1305.c (limited to 'src/crypto/zinc/poly1305') diff --git a/src/crypto/zinc/poly1305/poly1305-arm-glue.h b/src/crypto/zinc/poly1305/poly1305-arm-glue.h new file mode 100644 index 0000000..53f8fec --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-arm-glue.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include +#include + +asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]); +asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]); +#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && \ + (defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7) +#define ARM_USE_NEON +asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]); +#endif + +static bool poly1305_use_neon __ro_after_init; + +void __init poly1305_fpu_init(void) +{ +#if defined(CONFIG_ARM64) + poly1305_use_neon = elf_hwcap & HWCAP_ASIMD; +#elif defined(CONFIG_ARM) + poly1305_use_neon = elf_hwcap & HWCAP_NEON; +#endif +} + +static inline bool poly1305_init_arch(void *ctx, + const u8 key[POLY1305_KEY_SIZE], + simd_context_t simd_context) +{ + poly1305_init_arm(ctx, key); + return true; +} + +static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, + const size_t len, const u32 padbit, + simd_context_t simd_context) +{ +#if defined(ARM_USE_NEON) + if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) { + poly1305_blocks_neon(ctx, inp, len, padbit); + return true; + } +#endif + poly1305_blocks_arm(ctx, inp, len, padbit); + return true; +} + +static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4], + simd_context_t simd_context) +{ +#if defined(ARM_USE_NEON) + if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) { + poly1305_emit_neon(ctx, mac, nonce); + return true; + } +#endif + poly1305_emit_arm(ctx, mac, nonce); + return true; +} + +#define HAVE_POLY1305_ARCH_IMPLEMENTATION diff --git a/src/crypto/zinc/poly1305/poly1305-arm.S b/src/crypto/zinc/poly1305/poly1305-arm.S new file mode 100644 index 0000000..277a11a --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-arm.S @@ -0,0 +1,1117 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * This is based in part on Andy Polyakov's implementation from OpenSSL. + */ + +#include + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.align 5 +ENTRY(poly1305_init_arm) + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp r1,#0 + str r3,[r0,#0] @ zero hash value + str r3,[r0,#4] + str r3,[r0,#8] + str r3,[r0,#12] + str r3,[r0,#16] + str r3,[r0,#36] @ is_base2_26 + add r0,r0,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + + ldrb r4,[r1,#0] + mov r10,#0x0fffffff + ldrb r5,[r1,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[r1,#2] + ldrb r7,[r1,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[r1,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[r1,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[r1,#6] + and r4,r4,r10 + + ldrb r8,[r1,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[r1,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[r1,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[r1,#10] + and r5,r5,r3 + + ldrb r9,[r1,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[r1,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[r1,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[r1,#14] + and r6,r6,r3 + + ldrb r10,[r1,#15] + orr r7,r7,r8,lsl#8 + str r4,[r0,#0] + orr r7,r7,r9,lsl#16 + str r5,[r0,#4] + orr r7,r7,r10,lsl#24 + str r6,[r0,#8] + and r7,r7,r3 + str r7,[r0,#12] +.Lno_key: + ldmia sp!,{r4-r11} +#if __LINUX_ARM_ARCH__ >= 5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +ENDPROC(poly1305_init_arm) + +.align 5 +ENTRY(poly1305_blocks_arm) +.Lpoly1305_blocks_arm: + stmdb sp!,{r3-r11,lr} + + ands r2,r2,#-16 + beq .Lno_data + + cmp r3,#0 + add r2,r2,r1 @ end pointer + sub sp,sp,#32 + + ldmia r0,{r4-r12} @ load context + + str r0,[sp,#12] @ offload stuff + mov lr,r1 + str r2,[sp,#16] + str r10,[sp,#20] + str r11,[sp,#24] + str r12,[sp,#28] + b .Loop + +.Loop: +#if __LINUX_ARM_ARCH__ < 7 + ldrb r0,[lr],#16 @ load input +#ifdef __thumb2__ + it hi +#endif + addhi r8,r8,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds r4,r4,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs r5,r5,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs r6,r6,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add r10,r10,r10,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input +#ifdef __thumb2__ + it hi +#endif + addhi r8,r8,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +#ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +#endif + adds r4,r4,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs r5,r5,r1 + add r10,r10,r10,lsr#2 + adcs r6,r6,r2 +#endif + add r11,r11,r11,lsr#2 + adcs r7,r7,r3 + add r12,r12,r12,lsr#2 + + umull r2,r3,r5,r9 + adc r8,r8,#0 + umull r0,r1,r4,r9 + umlal r2,r3,r8,r10 + umlal r0,r1,r7,r10 + ldr r10,[sp,#20] @ reload r10 + umlal r2,r3,r6,r12 + umlal r0,r1,r5,r12 + umlal r2,r3,r7,r11 + umlal r0,r1,r6,r11 + umlal r2,r3,r4,r10 + str r0,[sp,#0] @ future r4 + mul r0,r11,r8 + ldr r11,[sp,#24] @ reload r11 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future r6 + str r2,[sp,#4] @ future r5 + + mul r2,r12,r8 + eor r3,r3,r3 + umlal r0,r1,r7,r12 + ldr r12,[sp,#28] @ reload r12 + umlal r2,r3,r7,r9 + umlal r0,r1,r6,r9 + umlal r2,r3,r6,r10 + umlal r0,r1,r5,r10 + umlal r2,r3,r5,r11 + umlal r0,r1,r4,r11 + umlal r2,r3,r4,r12 + ldr r4,[sp,#0] + mul r8,r9,r8 + ldr r5,[sp,#4] + + adds r6,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds r7,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add r8,r8,r3 @ h4+=d3>>32 + + and r1,r8,#-4 + and r8,r8,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds r4,r4,r1 + adcs r5,r5,#0 + adcs r6,r6,#0 + adcs r7,r7,#0 + adc r8,r8,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr r0,[sp,#12] + add sp,sp,#32 + stmia r0,{r4-r8} @ store the result + +.Lno_data: +#if __LINUX_ARM_ARCH__ >= 5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +ENDPROC(poly1305_blocks_arm) + +.align 5 +ENTRY(poly1305_emit_arm) + stmdb sp!,{r4-r11} +.Lpoly1305_emit_enter: + ldmia r0,{r3-r7} + adds r8,r3,#5 @ compare to modulus + adcs r9,r4,#0 + adcs r10,r5,#0 + adcs r11,r6,#0 + adc r7,r7,#0 + tst r7,#4 @ did it carry/borrow? + +#ifdef __thumb2__ + it ne +#endif + movne r3,r8 + ldr r8,[r2,#0] +#ifdef __thumb2__ + it ne +#endif + movne r4,r9 + ldr r9,[r2,#4] +#ifdef __thumb2__ + it ne +#endif + movne r5,r10 + ldr r10,[r2,#8] +#ifdef __thumb2__ + it ne +#endif + movne r6,r11 + ldr r11,[r2,#12] + + adds r3,r3,r8 + adcs r4,r4,r9 + adcs r5,r5,r10 + adc r6,r6,r11 + +#if __LINUX_ARM_ARCH__ >= 7 +#ifdef __ARMEB__ + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 +#endif + str r3,[r1,#0] + str r4,[r1,#4] + str r5,[r1,#8] + str r6,[r1,#12] +#else + strb r3,[r1,#0] + mov r3,r3,lsr#8 + strb r4,[r1,#4] + mov r4,r4,lsr#8 + strb r5,[r1,#8] + mov r5,r5,lsr#8 + strb r6,[r1,#12] + mov r6,r6,lsr#8 + + strb r3,[r1,#1] + mov r3,r3,lsr#8 + strb r4,[r1,#5] + mov r4,r4,lsr#8 + strb r5,[r1,#9] + mov r5,r5,lsr#8 + strb r6,[r1,#13] + mov r6,r6,lsr#8 + + strb r3,[r1,#2] + mov r3,r3,lsr#8 + strb r4,[r1,#6] + mov r4,r4,lsr#8 + strb r5,[r1,#10] + mov r5,r5,lsr#8 + strb r6,[r1,#14] + mov r6,r6,lsr#8 + + strb r3,[r1,#3] + strb r4,[r1,#7] + strb r5,[r1,#11] + strb r6,[r1,#15] +#endif + ldmia sp!,{r4-r11} +#if __LINUX_ARM_ARCH__ >= 5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +ENDPROC(poly1305_emit_arm) + + +#if __LINUX_ARM_ARCH__ >= 7 +.fpu neon + +.align 5 +ENTRY(poly1305_init_neon) +.Lpoly1305_init_neon: + ldr r4,[r0,#20] @ load key base 2^32 + ldr r5,[r0,#24] + ldr r6,[r0,#28] + ldr r7,[r0,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 d0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 d1,r3 + add r3,r4,r4,lsl#2 + vdup.32 d2,r2 + vdup.32 d3,r4 + add r4,r5,r5,lsl#2 + vdup.32 d4,r3 + vdup.32 d5,r5 + add r5,r6,r6,lsl#2 + vdup.32 d6,r4 + vdup.32 d7,r6 + vdup.32 d8,r5 + + mov r5,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 q5,d0,d0[1] + vmull.u32 q6,d1,d0[1] + vmull.u32 q7,d3,d0[1] + vmull.u32 q8,d5,d0[1] + vmull.u32 q9,d7,d0[1] + + vmlal.u32 q5,d7,d2[1] + vmlal.u32 q6,d0,d1[1] + vmlal.u32 q7,d1,d1[1] + vmlal.u32 q8,d3,d1[1] + vmlal.u32 q9,d5,d1[1] + + vmlal.u32 q5,d5,d4[1] + vmlal.u32 q6,d7,d4[1] + vmlal.u32 q8,d1,d3[1] + vmlal.u32 q7,d0,d3[1] + vmlal.u32 q9,d3,d3[1] + + vmlal.u32 q5,d3,d6[1] + vmlal.u32 q8,d0,d5[1] + vmlal.u32 q6,d5,d6[1] + vmlal.u32 q7,d7,d6[1] + vmlal.u32 q9,d1,d5[1] + + vmlal.u32 q8,d7,d8[1] + vmlal.u32 q5,d1,d8[1] + vmlal.u32 q6,d3,d8[1] + vmlal.u32 q7,d5,d8[1] + vmlal.u32 q9,d0,d7[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 q15,q8,#26 + vmovn.i64 d16,q8 + vshr.u64 q4,q5,#26 + vmovn.i64 d10,q5 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vbic.i32 d16,#0xfc000000 @ &=0x03ffffff + vadd.i64 q6,q6,q4 @ h0 -> h1 + vbic.i32 d10,#0xfc000000 + + vshrn.u64 d30,q9,#26 + vmovn.i64 d18,q9 + vshr.u64 q4,q6,#26 + vmovn.i64 d12,q6 + vadd.i64 q7,q7,q4 @ h1 -> h2 + vbic.i32 d18,#0xfc000000 + vbic.i32 d12,#0xfc000000 + + vadd.i32 d10,d10,d30 + vshl.u32 d30,d30,#2 + vshrn.u64 d8,q7,#26 + vmovn.i64 d14,q7 + vadd.i32 d10,d10,d30 @ h4 -> h0 + vadd.i32 d16,d16,d8 @ h2 -> h3 + vbic.i32 d14,#0xfc000000 + + vshr.u32 d30,d10,#26 + vbic.i32 d10,#0xfc000000 + vshr.u32 d8,d16,#26 + vbic.i32 d16,#0xfc000000 + vadd.i32 d12,d12,d30 @ h0 -> h1 + vadd.i32 d18,d18,d8 @ h3 -> h4 + + subs r5,r5,#1 + beq .Lsquare_break_neon + + add r6,r0,#(48+0*9*4) + add r7,r0,#(48+1*9*4) + + vtrn.32 d0,d10 @ r^2:r^1 + vtrn.32 d3,d14 + vtrn.32 d5,d16 + vtrn.32 d1,d12 + vtrn.32 d7,d18 + + vshl.u32 d4,d3,#2 @ *5 + vshl.u32 d6,d5,#2 + vshl.u32 d2,d1,#2 + vshl.u32 d8,d7,#2 + vadd.i32 d4,d4,d3 + vadd.i32 d2,d2,d1 + vadd.i32 d6,d6,d5 + vadd.i32 d8,d8,d7 + + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vst1.32 {d8[0]},[r6,:32] + vst1.32 {d8[1]},[r7,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add r6,r0,#(48+2*4*9) + add r7,r0,#(48+3*4*9) + + vmov d0,d10 @ r^4:r^3 + vshl.u32 d2,d12,#2 @ *5 + vmov d1,d12 + vshl.u32 d4,d14,#2 + vmov d3,d14 + vshl.u32 d6,d16,#2 + vmov d5,d16 + vshl.u32 d8,d18,#2 + vmov d7,d18 + vadd.i32 d2,d2,d12 + vadd.i32 d4,d4,d14 + vadd.i32 d6,d6,d16 + vadd.i32 d8,d8,d18 + + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! + vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! + vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vst1.32 {d8[0]},[r6] + vst1.32 {d8[1]},[r7] + + bx lr @ bx lr +ENDPROC(poly1305_init_neon) + +.align 5 +ENTRY(poly1305_blocks_neon) + ldr ip,[r0,#36] @ is_base2_26 + ands r2,r2,#-16 + beq .Lno_data_neon + + cmp r2,#64 + bhs .Lenter_neon + tst ip,ip @ is_base2_26? + beq .Lpoly1305_blocks_arm + +.Lenter_neon: + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? + bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[r0,#0] @ load hash value base 2^32 + ldr r5,[r0,#4] + ldr r6,[r0,#8] + ldr r7,[r0,#12] + ldr ip,[r0,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor d10,d10,d10 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor d12,d12,d12 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor d14,d14,d14 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor d16,d16,d16 + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor d18,d18,d18 + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[r0,#36] @ is_base2_26 + + vmov.32 d10[0],r2 + vmov.32 d12[0],r3 + vmov.32 d14[0],r4 + vmov.32 d16[0],r5 + vmov.32 d18[0],r6 + adr r5,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lbase2_32_neon + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor d10,d10,d10 + veor d12,d12,d12 + veor d14,d14,d14 + veor d16,d16,d16 + veor d18,d18,d18 + vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! + adr r5,.Lzeros + vld1.32 {d18[0]},[r0] + sub r0,r0,#16 @ rewind + +.Lbase2_32_neon: + add r4,r1,#32 + mov r3,r3,lsl#24 + tst r2,#31 + beq .Leven + + vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]! + vmov.32 d28[0],r3 + sub r2,r2,#16 + add r4,r1,#32 + +#ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q13,q13 + vrev32.8 q11,q11 + vrev32.8 q12,q12 +#endif + vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26 + vshl.u32 d26,d26,#18 + + vsri.u32 d26,d24,#14 + vshl.u32 d24,d24,#12 + vadd.i32 d29,d28,d18 @ add hash value and move to #hi + + vbic.i32 d26,#0xfc000000 + vsri.u32 d24,d22,#20 + vshl.u32 d22,d22,#6 + + vbic.i32 d24,#0xfc000000 + vsri.u32 d22,d20,#26 + vadd.i32 d27,d26,d16 + + vbic.i32 d20,#0xfc000000 + vbic.i32 d22,#0xfc000000 + vadd.i32 d25,d24,d14 + + vadd.i32 d21,d20,d10 + vadd.i32 d23,d22,d12 + + mov r7,r5 + add r6,r0,#48 + + cmp r2,r2 + b .Long_tail + +.align 4 +.Leven: + subs r2,r2,#64 + it lo + movlo r4,r5 + + vmov.i32 q14,#1<<24 @ padbit, yes, always + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] + add r1,r1,#64 + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) + add r4,r4,#64 + itt hi + addhi r7,r0,#(48+1*9*4) + addhi r6,r0,#(48+3*9*4) + +#ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q13,q13 + vrev32.8 q11,q11 + vrev32.8 q12,q12 +#endif + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 + vshl.u32 q13,q13,#18 + + vsri.u32 q13,q12,#14 + vshl.u32 q12,q12,#12 + + vbic.i32 q13,#0xfc000000 + vsri.u32 q12,q11,#20 + vshl.u32 q11,q11,#6 + + vbic.i32 q12,#0xfc000000 + vsri.u32 q11,q10,#26 + + vbic.i32 q10,#0xfc000000 + vbic.i32 q11,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ ___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ ___________________/ ____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 d24,d24,d14 @ accumulate inp[0:1] + vmull.u32 q7,d25,d0[1] + vadd.i32 d20,d20,d10 + vmull.u32 q5,d21,d0[1] + vadd.i32 d26,d26,d16 + vmull.u32 q8,d27,d0[1] + vmlal.u32 q7,d23,d1[1] + vadd.i32 d22,d22,d12 + vmull.u32 q6,d23,d0[1] + + vadd.i32 d28,d28,d18 + vmull.u32 q9,d29,d0[1] + subs r2,r2,#64 + vmlal.u32 q5,d29,d2[1] + it lo + movlo r4,r5 + vmlal.u32 q8,d25,d1[1] + vld1.32 d8[1],[r7,:32] + vmlal.u32 q6,d21,d1[1] + vmlal.u32 q9,d27,d1[1] + + vmlal.u32 q5,d27,d4[1] + vmlal.u32 q8,d23,d3[1] + vmlal.u32 q9,d25,d3[1] + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d21,d3[1] + + vmlal.u32 q8,d21,d5[1] + vmlal.u32 q5,d25,d6[1] + vmlal.u32 q9,d23,d5[1] + vmlal.u32 q6,d27,d6[1] + vmlal.u32 q7,d29,d6[1] + + vmlal.u32 q8,d29,d8[1] + vmlal.u32 q5,d23,d8[1] + vmlal.u32 q9,d21,d7[1] + vmlal.u32 q6,d25,d8[1] + vmlal.u32 q7,d27,d8[1] + + vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0) + add r4,r4,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 q8,d26,d0[0] + vmlal.u32 q5,d20,d0[0] + vmlal.u32 q9,d28,d0[0] + vmlal.u32 q6,d22,d0[0] + vmlal.u32 q7,d24,d0[0] + vld1.32 d8[0],[r6,:32] + + vmlal.u32 q8,d24,d1[0] + vmlal.u32 q5,d28,d2[0] + vmlal.u32 q9,d26,d1[0] + vmlal.u32 q6,d20,d1[0] + vmlal.u32 q7,d22,d1[0] + + vmlal.u32 q8,d22,d3[0] + vmlal.u32 q5,d26,d4[0] + vmlal.u32 q9,d24,d3[0] + vmlal.u32 q6,d28,d4[0] + vmlal.u32 q7,d20,d3[0] + + vmlal.u32 q8,d20,d5[0] + vmlal.u32 q5,d24,d6[0] + vmlal.u32 q9,d22,d5[0] + vmlal.u32 q6,d26,d6[0] + vmlal.u32 q8,d28,d8[0] + + vmlal.u32 q7,d28,d6[0] + vmlal.u32 q5,d22,d8[0] + vmlal.u32 q9,d20,d7[0] + vmov.i32 q14,#1<<24 @ padbit, yes, always + vmlal.u32 q6,d24,d8[0] + vmlal.u32 q7,d26,d8[0] + + vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1] + add r1,r1,#64 +#ifdef __ARMEB__ + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vrev32.8 q12,q12 + vrev32.8 q13,q13 +#endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14. + + vshr.u64 q15,q8,#26 + vmovn.i64 d16,q8 + vshr.u64 q4,q5,#26 + vmovn.i64 d10,q5 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vbic.i32 d16,#0xfc000000 + vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26 + vadd.i64 q6,q6,q4 @ h0 -> h1 + vshl.u32 q13,q13,#18 + vbic.i32 d10,#0xfc000000 + + vshrn.u64 d30,q9,#26 + vmovn.i64 d18,q9 + vshr.u64 q4,q6,#26 + vmovn.i64 d12,q6 + vadd.i64 q7,q7,q4 @ h1 -> h2 + vsri.u32 q13,q12,#14 + vbic.i32 d18,#0xfc000000 + vshl.u32 q12,q12,#12 + vbic.i32 d12,#0xfc000000 + + vadd.i32 d10,d10,d30 + vshl.u32 d30,d30,#2 + vbic.i32 q13,#0xfc000000 + vshrn.u64 d8,q7,#26 + vmovn.i64 d14,q7 + vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec] + vsri.u32 q12,q11,#20 + vadd.i32 d16,d16,d8 @ h2 -> h3 + vshl.u32 q11,q11,#6 + vbic.i32 d14,#0xfc000000 + vbic.i32 q12,#0xfc000000 + + vshrn.u64 d30,q5,#26 @ re-narrow + vmovn.i64 d10,q5 + vsri.u32 q11,q10,#26 + vbic.i32 q10,#0xfc000000 + vshr.u32 d8,d16,#26 + vbic.i32 d16,#0xfc000000 + vbic.i32 d10,#0xfc000000 + vadd.i32 d12,d12,d30 @ h0 -> h1 + vadd.i32 d18,d18,d8 @ h3 -> h4 + vbic.i32 q11,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add r7,r0,#(48+0*9*4) + add r6,r0,#(48+1*9*4) + adds r2,r2,#32 + it ne + movne r2,#0 + bne .Long_tail + + vadd.i32 d25,d24,d14 @ add hash value and move to #hi + vadd.i32 d21,d20,d10 + vadd.i32 d27,d26,d16 + vadd.i32 d23,d22,d12 + vadd.i32 d29,d28,d18 + +.Long_tail: + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2 + + vadd.i32 d24,d24,d14 @ can be redundant + vmull.u32 q7,d25,d0 + vadd.i32 d20,d20,d10 + vmull.u32 q5,d21,d0 + vadd.i32 d26,d26,d16 + vmull.u32 q8,d27,d0 + vadd.i32 d22,d22,d12 + vmull.u32 q6,d23,d0 + vadd.i32 d28,d28,d18 + vmull.u32 q9,d29,d0 + + vmlal.u32 q5,d29,d2 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vmlal.u32 q8,d25,d1 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vmlal.u32 q6,d21,d1 + vmlal.u32 q9,d27,d1 + vmlal.u32 q7,d23,d1 + + vmlal.u32 q8,d23,d3 + vld1.32 d8[1],[r7,:32] + vmlal.u32 q5,d27,d4 + vld1.32 d8[0],[r6,:32] + vmlal.u32 q9,d25,d3 + vmlal.u32 q6,d29,d4 + vmlal.u32 q7,d21,d3 + + vmlal.u32 q8,d21,d5 + it ne + addne r7,r0,#(48+2*9*4) + vmlal.u32 q5,d25,d6 + it ne + addne r6,r0,#(48+3*9*4) + vmlal.u32 q9,d23,d5 + vmlal.u32 q6,d27,d6 + vmlal.u32 q7,d29,d6 + + vmlal.u32 q8,d29,d8 + vorn q0,q0,q0 @ all-ones, can be redundant + vmlal.u32 q5,d23,d8 + vshr.u64 q0,q0,#38 + vmlal.u32 q9,d21,d7 + vmlal.u32 q6,d25,d8 + vmlal.u32 q7,d27,d8 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3 + vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4 + + vmlal.u32 q7,d24,d0 + vmlal.u32 q5,d20,d0 + vmlal.u32 q8,d26,d0 + vmlal.u32 q6,d22,d0 + vmlal.u32 q9,d28,d0 + + vmlal.u32 q5,d28,d2 + vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]! + vmlal.u32 q8,d24,d1 + vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]! + vmlal.u32 q6,d20,d1 + vmlal.u32 q9,d26,d1 + vmlal.u32 q7,d22,d1 + + vmlal.u32 q8,d22,d3 + vld1.32 d8[1],[r7,:32] + vmlal.u32 q5,d26,d4 + vld1.32 d8[0],[r6,:32] + vmlal.u32 q9,d24,d3 + vmlal.u32 q6,d28,d4 + vmlal.u32 q7,d20,d3 + + vmlal.u32 q8,d20,d5 + vmlal.u32 q5,d24,d6 + vmlal.u32 q9,d22,d5 + vmlal.u32 q6,d26,d6 + vmlal.u32 q7,d28,d6 + + vmlal.u32 q8,d28,d8 + vorn q0,q0,q0 @ all-ones + vmlal.u32 q5,d22,d8 + vshr.u64 q0,q0,#38 + vmlal.u32 q9,d20,d7 + vmlal.u32 q6,d24,d8 + vmlal.u32 q7,d26,d8 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 d16,d16,d17 + vadd.i64 d10,d10,d11 + vadd.i64 d18,d18,d19 + vadd.i64 d12,d12,d13 + vadd.i64 d14,d14,d15 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 q15,q8,#26 + vand.i64 q8,q8,q0 + vshr.u64 q4,q5,#26 + vand.i64 q5,q5,q0 + vadd.i64 q9,q9,q15 @ h3 -> h4 + vadd.i64 q6,q6,q4 @ h0 -> h1 + + vshr.u64 q15,q9,#26 + vand.i64 q9,q9,q0 + vshr.u64 q4,q6,#26 + vand.i64 q6,q6,q0 + vadd.i64 q7,q7,q4 @ h1 -> h2 + + vadd.i64 q5,q5,q15 + vshl.u64 q15,q15,#2 + vshr.u64 q4,q7,#26 + vand.i64 q7,q7,q0 + vadd.i64 q5,q5,q15 @ h4 -> h0 + vadd.i64 q8,q8,q4 @ h2 -> h3 + + vshr.u64 q15,q5,#26 + vand.i64 q5,q5,q0 + vshr.u64 q4,q8,#26 + vand.i64 q8,q8,q0 + vadd.i64 q6,q6,q15 @ h0 -> h1 + vadd.i64 q9,q9,q4 @ h3 -> h4 + + cmp r2,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]! + vst1.32 {d18[0]},[r0] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} +.Lno_data_neon: + bx lr @ bx lr +ENDPROC(poly1305_blocks_neon) + +.align 5 +ENTRY(poly1305_emit_neon) + ldr ip,[r0,#36] @ is_base2_26 + + stmdb sp!,{r4-r11} + + tst ip,ip + beq .Lpoly1305_emit_enter + + ldmia r0,{r3-r7} + eor r8,r8,r8 + + adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32 + mov r4,r4,lsr#6 + adcs r4,r4,r5,lsl#20 + mov r5,r5,lsr#12 + adcs r5,r5,r6,lsl#14 + mov r6,r6,lsr#18 + adcs r6,r6,r7,lsl#8 + adc r7,r8,r7,lsr#24 @ can be partially reduced ... + + and r8,r7,#-4 @ ... so reduce + and r7,r6,#3 + add r8,r8,r8,lsr#2 @ *= 5 + adds r3,r3,r8 + adcs r4,r4,#0 + adcs r5,r5,#0 + adcs r6,r6,#0 + adc r7,r7,#0 + + adds r8,r3,#5 @ compare to modulus + adcs r9,r4,#0 + adcs r10,r5,#0 + adcs r11,r6,#0 + adc r7,r7,#0 + tst r7,#4 @ did it carry/borrow? + + it ne + movne r3,r8 + ldr r8,[r2,#0] + it ne + movne r4,r9 + ldr r9,[r2,#4] + it ne + movne r5,r10 + ldr r10,[r2,#8] + it ne + movne r6,r11 + ldr r11,[r2,#12] + + adds r3,r3,r8 @ accumulate nonce + adcs r4,r4,r9 + adcs r5,r5,r10 + adc r6,r6,r11 + +#ifdef __ARMEB__ + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 +#endif + str r3,[r1,#0] @ store the result + str r4,[r1,#4] + str r5,[r1,#8] + str r6,[r1,#12] + + ldmia sp!,{r4-r11} + bx lr @ bx lr +ENDPROC(poly1305_emit_neon) + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#endif diff --git a/src/crypto/zinc/poly1305/poly1305-arm64.S b/src/crypto/zinc/poly1305/poly1305-arm64.S new file mode 100644 index 0000000..706e664 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-arm64.S @@ -0,0 +1,822 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * This is based in part on Andy Polyakov's implementation from OpenSSL. + */ + +#include +.text + +.align 5 +ENTRY(poly1305_init_arm) + cmp x1,xzr + stp xzr,xzr,[x0] // zero hash value + stp xzr,xzr,[x0,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + + ldp x7,x8,[x1] // load key + mov x9,#0xfffffffc0fffffff + movk x9,#0x0fff,lsl#48 +#ifdef __ARMEB__ + rev x7,x7 // flip bytes + rev x8,x8 +#endif + and x7,x7,x9 // &=0ffffffc0fffffff + and x9,x9,#-4 + and x8,x8,x9 // &=0ffffffc0ffffffc + stp x7,x8,[x0,#32] // save key value + +.Lno_key: + ret +ENDPROC(poly1305_init_arm) + +.align 5 +ENTRY(poly1305_blocks_arm) + ands x2,x2,#-16 + b.eq .Lno_data + + ldp x4,x5,[x0] // load hash value + ldp x7,x8,[x0,#32] // load key value + ldr x6,[x0,#16] + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + b .Loop + +.align 5 +.Loop: + ldp x10,x11,[x1],#16 // load input + sub x2,x2,#16 +#ifdef __ARMEB__ + rev x10,x10 + rev x11,x11 +#endif + adds x4,x4,x10 // accumulate input + adcs x5,x5,x11 + + mul x12,x4,x7 // h0*r0 + adc x6,x6,x3 + umulh x13,x4,x7 + + mul x10,x5,x9 // h1*5*r1 + umulh x11,x5,x9 + + adds x12,x12,x10 + mul x10,x4,x8 // h0*r1 + adc x13,x13,x11 + umulh x14,x4,x8 + + adds x13,x13,x10 + mul x10,x5,x7 // h1*r0 + adc x14,x14,xzr + umulh x11,x5,x7 + + adds x13,x13,x10 + mul x10,x6,x9 // h2*5*r1 + adc x14,x14,x11 + mul x11,x6,x7 // h2*r0 + + adds x13,x13,x10 + adc x14,x14,x11 + + and x10,x14,#-4 // final reduction + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x12,x10 + adcs x5,x13,xzr + adc x6,x6,xzr + + cbnz x2,.Loop + + stp x4,x5,[x0] // store hash value + str x6,[x0,#16] + +.Lno_data: + ret +ENDPROC(poly1305_blocks_arm) + +.align 5 +ENTRY(poly1305_emit_arm) + ldp x4,x5,[x0] // load hash base 2^64 + ldr x6,[x0,#16] + ldp x10,x11,[x2] // load nonce + + adds x12,x4,#5 // compare to modulus + adcs x13,x5,xzr + adc x14,x6,xzr + + tst x14,#-4 // see if it's carried/borrowed + + csel x4,x4,x12,eq + csel x5,x5,x13,eq + +#ifdef __ARMEB__ + ror x10,x10,#32 // flip nonce words + ror x11,x11,#32 +#endif + adds x4,x4,x10 // accumulate nonce + adc x5,x5,x11 +#ifdef __ARMEB__ + rev x4,x4 // flip output bytes + rev x5,x5 +#endif + stp x4,x5,[x1] // write result + + ret +ENDPROC(poly1305_emit_arm) + +.align 5 +__poly1305_mult: + mul x12,x4,x7 // h0*r0 + umulh x13,x4,x7 + + mul x10,x5,x9 // h1*5*r1 + umulh x11,x5,x9 + + adds x12,x12,x10 + mul x10,x4,x8 // h0*r1 + adc x13,x13,x11 + umulh x14,x4,x8 + + adds x13,x13,x10 + mul x10,x5,x7 // h1*r0 + adc x14,x14,xzr + umulh x11,x5,x7 + + adds x13,x13,x10 + mul x10,x6,x9 // h2*5*r1 + adc x14,x14,x11 + mul x11,x6,x7 // h2*r0 + + adds x13,x13,x10 + adc x14,x14,x11 + + and x10,x14,#-4 // final reduction + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x12,x10 + adcs x5,x13,xzr + adc x6,x6,xzr + + ret + +__poly1305_splat: + and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,x4,#26,#26 + extr x14,x5,x4,#52 + and x14,x14,#0x03ffffff + ubfx x15,x5,#14,#26 + extr x16,x6,x5,#40 + + str w12,[x0,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[x0,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[x0,#16*2] // s1 + str w14,[x0,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[x0,#16*4] // s2 + str w15,[x0,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[x0,#16*6] // s3 + str w16,[x0,#16*7] // r4 + str w15,[x0,#16*8] // s4 + + ret + +.align 5 +ENTRY(poly1305_blocks_neon) + ldr x17,[x0,#24] + cmp x2,#128 + b.hs .Lblocks_neon + cbz x17,poly1305_blocks_arm + +.Lblocks_neon: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + ands x2,x2,#-16 + b.eq .Lno_data_neon + + cbz x17,.Lbase2_64_neon + + ldp w10,w11,[x0] // load hash value base 2^26 + ldp w12,w13,[x0,#8] + ldr w14,[x0,#16] + + tst x2,#31 + b.eq .Leven_neon + + ldp x7,x8,[x0,#32] // load key value + + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr x5,x12,#12 + adds x4,x4,x12,lsl#52 + add x5,x5,x13,lsl#14 + adc x5,x5,xzr + lsr x6,x14,#24 + adds x5,x5,x14,lsl#40 + adc x14,x6,xzr // can be partially reduced... + + ldp x12,x13,[x1],#16 // load input + sub x2,x2,#16 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + + and x10,x14,#-4 // ... so reduce + and x6,x14,#3 + add x10,x10,x14,lsr#2 + adds x4,x4,x10 + adcs x5,x5,xzr + adc x6,x6,xzr + +#ifdef __ARMEB__ + rev x12,x12 + rev x13,x13 +#endif + adds x4,x4,x12 // accumulate input + adcs x5,x5,x13 + adc x6,x6,x3 + + bl __poly1305_mult + ldr x30,[sp,#8] + + cbz x3,.Lstore_base2_64_neon + + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,x4,#26,#26 + extr x12,x5,x4,#52 + and x12,x12,#0x03ffffff + ubfx x13,x5,#14,#26 + extr x14,x6,x5,#40 + + cbnz x2,.Leven_neon + + stp w10,w11,[x0] // store hash value base 2^26 + stp w12,w13,[x0,#8] + str w14,[x0,#16] + b .Lno_data_neon + +.align 4 +.Lstore_base2_64_neon: + stp x4,x5,[x0] // store hash value base 2^64 + stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed + b .Lno_data_neon + +.align 4 +.Lbase2_64_neon: + ldp x7,x8,[x0,#32] // load key value + + ldp x4,x5,[x0] // load hash value base 2^64 + ldr x6,[x0,#16] + + tst x2,#31 + b.eq .Linit_neon + + ldp x12,x13,[x1],#16 // load input + sub x2,x2,#16 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __ARMEB__ + rev x12,x12 + rev x13,x13 +#endif + adds x4,x4,x12 // accumulate input + adcs x5,x5,x13 + adc x6,x6,x3 + + bl __poly1305_mult + +.Linit_neon: + and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,x4,#26,#26 + extr x12,x5,x4,#52 + and x12,x12,#0x03ffffff + ubfx x13,x5,#14,#26 + extr x14,x6,x5,#40 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov d24,x10 + fmov d25,x11 + fmov d26,x12 + fmov d27,x13 + fmov d28,x14 + + ////////////////////////////////// initialize r^n table + mov x4,x7 // r^1 + add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) + mov x5,x8 + mov x6,xzr + add x0,x0,#48+12 + bl __poly1305_splat + + bl __poly1305_mult // r^2 + sub x0,x0,#4 + bl __poly1305_splat + + bl __poly1305_mult // r^3 + sub x0,x0,#4 + bl __poly1305_splat + + bl __poly1305_mult // r^4 + sub x0,x0,#4 + bl __poly1305_splat + ldr x30,[sp,#8] + + add x16,x1,#32 + adr x17,.Lzeros + subs x2,x2,#64 + csel x16,x17,x16,lo + + mov x4,#1 + str x4,[x0,#-24] // set is_base2_26 + sub x0,x0,#48 // restore original x0 + b .Ldo_neon + +.align 4 +.Leven_neon: + add x16,x1,#32 + adr x17,.Lzeros + subs x2,x2,#64 + csel x16,x17,x16,lo + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + fmov d24,x10 + fmov d25,x11 + fmov d26,x12 + fmov d27,x13 + fmov d28,x14 + +.Ldo_neon: + ldp x8,x12,[x16],#16 // inp[2:3] (or zero) + ldp x9,x13,[x16],#48 + + lsl x3,x3,#24 + add x15,x0,#48 + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov d14,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,x3,x12,lsr#40 + add x13,x3,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov d15,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov d16,x8 + fmov d17,x10 + fmov d18,x12 + + ldp x8,x12,[x1],#16 // inp[0:1] + ldp x9,x13,[x1],#48 + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 + ld1 {v8.4s},[x15] + +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov d9,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,x3,x12,lsr#40 + add x13,x3,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov d10,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi v31.2d,#-1 + fmov d11,x8 + fmov d12,x10 + fmov d13,x12 + ushr v31.2d,v31.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // ___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // ___________________/ ____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs x2,x2,#64 + umull v23.2d,v14.2s,v7.s[2] + csel x16,x17,x16,lo + umull v22.2d,v14.2s,v5.s[2] + umull v21.2d,v14.2s,v3.s[2] + ldp x8,x12,[x16],#16 // inp[2:3] (or zero) + umull v20.2d,v14.2s,v1.s[2] + ldp x9,x13,[x16],#48 + umull v19.2d,v14.2s,v0.s[2] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal v23.2d,v15.2s,v5.s[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal v22.2d,v15.2s,v3.s[2] + and x5,x9,#0x03ffffff + umlal v21.2d,v15.2s,v1.s[2] + ubfx x6,x8,#26,#26 + umlal v20.2d,v15.2s,v0.s[2] + ubfx x7,x9,#26,#26 + umlal v19.2d,v15.2s,v8.s[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal v23.2d,v16.2s,v3.s[2] + extr x8,x12,x8,#52 + umlal v22.2d,v16.2s,v1.s[2] + extr x9,x13,x9,#52 + umlal v21.2d,v16.2s,v0.s[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal v20.2d,v16.2s,v8.s[2] + fmov d14,x4 + umlal v19.2d,v16.2s,v6.s[2] + and x8,x8,#0x03ffffff + + umlal v23.2d,v17.2s,v1.s[2] + and x9,x9,#0x03ffffff + umlal v22.2d,v17.2s,v0.s[2] + ubfx x10,x12,#14,#26 + umlal v21.2d,v17.2s,v8.s[2] + ubfx x11,x13,#14,#26 + umlal v20.2d,v17.2s,v6.s[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal v19.2d,v17.2s,v4.s[2] + fmov d15,x6 + + add v11.2s,v11.2s,v26.2s + add x12,x3,x12,lsr#40 + umlal v23.2d,v18.2s,v0.s[2] + add x13,x3,x13,lsr#40 + umlal v22.2d,v18.2s,v8.s[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal v21.2d,v18.2s,v6.s[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal v20.2d,v18.2s,v4.s[2] + fmov d16,x8 + umlal v19.2d,v18.2s,v2.s[2] + fmov d17,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add v9.2s,v9.2s,v24.2s + fmov d18,x12 + umlal v22.2d,v11.2s,v1.s[0] + ldp x8,x12,[x1],#16 // inp[0:1] + umlal v19.2d,v11.2s,v6.s[0] + ldp x9,x13,[x1],#48 + umlal v23.2d,v11.2s,v3.s[0] + umlal v20.2d,v11.2s,v8.s[0] + umlal v21.2d,v11.2s,v0.s[0] +#ifdef __ARMEB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add v10.2s,v10.2s,v25.2s + umlal v22.2d,v9.2s,v5.s[0] + umlal v23.2d,v9.2s,v7.s[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal v21.2d,v9.2s,v3.s[0] + and x5,x9,#0x03ffffff + umlal v19.2d,v9.2s,v0.s[0] + ubfx x6,x8,#26,#26 + umlal v20.2d,v9.2s,v1.s[0] + ubfx x7,x9,#26,#26 + + add v12.2s,v12.2s,v27.2s + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal v22.2d,v10.2s,v3.s[0] + extr x8,x12,x8,#52 + umlal v23.2d,v10.2s,v5.s[0] + extr x9,x13,x9,#52 + umlal v19.2d,v10.2s,v8.s[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal v21.2d,v10.2s,v1.s[0] + fmov d9,x4 + umlal v20.2d,v10.2s,v0.s[0] + and x8,x8,#0x03ffffff + + add v13.2s,v13.2s,v28.2s + and x9,x9,#0x03ffffff + umlal v22.2d,v12.2s,v0.s[0] + ubfx x10,x12,#14,#26 + umlal v19.2d,v12.2s,v4.s[0] + ubfx x11,x13,#14,#26 + umlal v23.2d,v12.2s,v1.s[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal v20.2d,v12.2s,v6.s[0] + fmov d10,x6 + umlal v21.2d,v12.2s,v8.s[0] + add x12,x3,x12,lsr#40 + + umlal v22.2d,v13.2s,v8.s[0] + add x13,x3,x13,lsr#40 + umlal v19.2d,v13.2s,v2.s[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal v23.2d,v13.2s,v0.s[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal v20.2d,v13.2s,v4.s[0] + fmov d11,x8 + umlal v21.2d,v13.2s,v6.s[0] + fmov d12,x10 + fmov d13,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr v29.2d,v22.2d,#26 + xtn v27.2s,v22.2d + ushr v30.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + add v23.2d,v23.2d,v29.2d // h3 -> h4 + bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff + add v20.2d,v20.2d,v30.2d // h0 -> h1 + + ushr v29.2d,v23.2d,#26 + xtn v28.2s,v23.2d + ushr v30.2d,v20.2d,#26 + xtn v25.2s,v20.2d + bic v28.2s,#0xfc,lsl#24 + add v21.2d,v21.2d,v30.2d // h1 -> h2 + + add v19.2d,v19.2d,v29.2d + shl v29.2d,v29.2d,#2 + shrn v30.2s,v21.2d,#26 + xtn v26.2s,v21.2d + add v19.2d,v19.2d,v29.2d // h4 -> h0 + bic v25.2s,#0xfc,lsl#24 + add v27.2s,v27.2s,v30.2s // h2 -> h3 + bic v26.2s,#0xfc,lsl#24 + + shrn v29.2s,v19.2d,#26 + xtn v24.2s,v19.2d + ushr v30.2s,v27.2s,#26 + bic v27.2s,#0xfc,lsl#24 + bic v24.2s,#0xfc,lsl#24 + add v25.2s,v25.2s,v29.2s // h0 -> h1 + add v28.2s,v28.2s,v30.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup v16.2d,v16.d[0] + add v11.2s,v11.2s,v26.2s + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds x2,x2,#32 + b.ne .Long_tail + + dup v16.2d,v11.d[0] + add v14.2s,v9.2s,v24.2s + add v17.2s,v12.2s,v27.2s + add v15.2s,v10.2s,v25.2s + add v18.2s,v13.2s,v28.2s + +.Long_tail: + dup v14.2d,v14.d[0] + umull2 v19.2d,v16.4s,v6.4s + umull2 v22.2d,v16.4s,v1.4s + umull2 v23.2d,v16.4s,v3.4s + umull2 v21.2d,v16.4s,v0.4s + umull2 v20.2d,v16.4s,v8.4s + + dup v15.2d,v15.d[0] + umlal2 v19.2d,v14.4s,v0.4s + umlal2 v21.2d,v14.4s,v3.4s + umlal2 v22.2d,v14.4s,v5.4s + umlal2 v23.2d,v14.4s,v7.4s + umlal2 v20.2d,v14.4s,v1.4s + + dup v17.2d,v17.d[0] + umlal2 v19.2d,v15.4s,v8.4s + umlal2 v22.2d,v15.4s,v3.4s + umlal2 v21.2d,v15.4s,v1.4s + umlal2 v23.2d,v15.4s,v5.4s + umlal2 v20.2d,v15.4s,v0.4s + + dup v18.2d,v18.d[0] + umlal2 v22.2d,v17.4s,v0.4s + umlal2 v23.2d,v17.4s,v1.4s + umlal2 v19.2d,v17.4s,v4.4s + umlal2 v20.2d,v17.4s,v6.4s + umlal2 v21.2d,v17.4s,v8.4s + + umlal2 v22.2d,v18.4s,v8.4s + umlal2 v19.2d,v18.4s,v2.4s + umlal2 v23.2d,v18.4s,v0.4s + umlal2 v20.2d,v18.4s,v4.4s + umlal2 v21.2d,v18.4s,v6.4s + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add v9.2s,v9.2s,v24.2s + umlal v22.2d,v11.2s,v1.2s + umlal v19.2d,v11.2s,v6.2s + umlal v23.2d,v11.2s,v3.2s + umlal v20.2d,v11.2s,v8.2s + umlal v21.2d,v11.2s,v0.2s + + add v10.2s,v10.2s,v25.2s + umlal v22.2d,v9.2s,v5.2s + umlal v19.2d,v9.2s,v0.2s + umlal v23.2d,v9.2s,v7.2s + umlal v20.2d,v9.2s,v1.2s + umlal v21.2d,v9.2s,v3.2s + + add v12.2s,v12.2s,v27.2s + umlal v22.2d,v10.2s,v3.2s + umlal v19.2d,v10.2s,v8.2s + umlal v23.2d,v10.2s,v5.2s + umlal v20.2d,v10.2s,v0.2s + umlal v21.2d,v10.2s,v1.2s + + add v13.2s,v13.2s,v28.2s + umlal v22.2d,v12.2s,v0.2s + umlal v19.2d,v12.2s,v4.2s + umlal v23.2d,v12.2s,v1.2s + umlal v20.2d,v12.2s,v6.2s + umlal v21.2d,v12.2s,v8.2s + + umlal v22.2d,v13.2s,v8.2s + umlal v19.2d,v13.2s,v2.2s + umlal v23.2d,v13.2s,v0.2s + umlal v20.2d,v13.2s,v4.2s + umlal v21.2d,v13.2s,v6.2s + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp v22.2d,v22.2d,v22.2d + ldp d8,d9,[sp,#16] // meet ABI requirements + addp v19.2d,v19.2d,v19.2d + ldp d10,d11,[sp,#32] + addp v23.2d,v23.2d,v23.2d + ldp d12,d13,[sp,#48] + addp v20.2d,v20.2d,v20.2d + ldp d14,d15,[sp,#64] + addp v21.2d,v21.2d,v21.2d + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr v29.2d,v22.2d,#26 + and v22.16b,v22.16b,v31.16b + ushr v30.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + + add v23.2d,v23.2d,v29.2d // h3 -> h4 + add v20.2d,v20.2d,v30.2d // h0 -> h1 + + ushr v29.2d,v23.2d,#26 + and v23.16b,v23.16b,v31.16b + ushr v30.2d,v20.2d,#26 + and v20.16b,v20.16b,v31.16b + add v21.2d,v21.2d,v30.2d // h1 -> h2 + + add v19.2d,v19.2d,v29.2d + shl v29.2d,v29.2d,#2 + ushr v30.2d,v21.2d,#26 + and v21.16b,v21.16b,v31.16b + add v19.2d,v19.2d,v29.2d // h4 -> h0 + add v22.2d,v22.2d,v30.2d // h2 -> h3 + + ushr v29.2d,v19.2d,#26 + and v19.16b,v19.16b,v31.16b + ushr v30.2d,v22.2d,#26 + and v22.16b,v22.16b,v31.16b + add v20.2d,v20.2d,v29.2d // h0 -> h1 + add v23.2d,v23.2d,v30.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 + st1 {v23.s}[0],[x0] + +.Lno_data_neon: + ldr x29,[sp],#80 + ret +ENDPROC(poly1305_blocks_neon) + +.align 5 +ENTRY(poly1305_emit_neon) + ldr x17,[x0,#24] + cbz x17,poly1305_emit_arm + + ldp w10,w11,[x0] // load hash value base 2^26 + ldp w12,w13,[x0,#8] + ldr w14,[x0,#16] + + add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr x5,x12,#12 + adds x4,x4,x12,lsl#52 + add x5,x5,x13,lsl#14 + adc x5,x5,xzr + lsr x6,x14,#24 + adds x5,x5,x14,lsl#40 + adc x6,x6,xzr // can be partially reduced... + + ldp x10,x11,[x2] // load nonce + + and x12,x6,#-4 // ... so reduce + add x12,x12,x6,lsr#2 + and x6,x6,#3 + adds x4,x4,x12 + adcs x5,x5,xzr + adc x6,x6,xzr + + adds x12,x4,#5 // compare to modulus + adcs x13,x5,xzr + adc x14,x6,xzr + + tst x14,#-4 // see if it's carried/borrowed + + csel x4,x4,x12,eq + csel x5,x5,x13,eq + +#ifdef __ARMEB__ + ror x10,x10,#32 // flip nonce words + ror x11,x11,#32 +#endif + adds x4,x4,x10 // accumulate nonce + adc x5,x5,x11 +#ifdef __ARMEB__ + rev x4,x4 // flip output bytes + rev x5,x5 +#endif + stp x4,x5,[x1] // write result + + ret +ENDPROC(poly1305_emit_neon) + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 diff --git a/src/crypto/zinc/poly1305/poly1305-mips-glue.h b/src/crypto/zinc/poly1305/poly1305-mips-glue.h new file mode 100644 index 0000000..e29f859 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-mips-glue.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + */ + +#include + +asmlinkage void poly1305_init_mips(void *ctx, const u8 key[16]); +asmlinkage void poly1305_blocks_mips(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_emit_mips(void *ctx, u8 mac[16], const u32 nonce[4]); +void __init poly1305_fpu_init(void) +{ +} + +static inline bool poly1305_init_arch(void *ctx, + const u8 key[POLY1305_KEY_SIZE], + simd_context_t simd_context) +{ + poly1305_init_mips(ctx, key); + return true; +} + +static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, + const size_t len, const u32 padbit, + simd_context_t simd_context) +{ + poly1305_blocks_mips(ctx, inp, len, padbit); + return true; +} + +static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4], + simd_context_t simd_context) +{ + poly1305_emit_mips(ctx, mac, nonce); + return true; +} + +#define HAVE_POLY1305_ARCH_IMPLEMENTATION diff --git a/src/crypto/zinc/poly1305/poly1305-mips.S b/src/crypto/zinc/poly1305/poly1305-mips.S new file mode 100644 index 0000000..32d8558 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-mips.S @@ -0,0 +1,417 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2016-2018 René van Dorst All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + */ + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#else +#define MSB 3 +#define LSB 0 +#endif + +#define POLY1305_BLOCK_SIZE 16 +.text +#define H0 $t0 +#define H1 $t1 +#define H2 $t2 +#define H3 $t3 +#define H4 $t4 + +#define R0 $t5 +#define R1 $t6 +#define R2 $t7 +#define R3 $t8 + +#define O0 $s0 +#define O1 $s4 +#define O2 $v1 +#define O3 $t9 +#define O4 $s5 + +#define S1 $s1 +#define S2 $s2 +#define S3 $s3 + +#define SC $at +#define CA $v0 + +/* Input arguments */ +#define poly $a0 +#define src $a1 +#define srclen $a2 +#define hibit $a3 + +/* Location in the opaque buffer + * R[0..3], CA, H[0..4] + */ +#define PTR_POLY1305_R(n) ( 0 + (n*4)) ## ($a0) +#define PTR_POLY1305_CA (16 ) ## ($a0) +#define PTR_POLY1305_H(n) (20 + (n*4)) ## ($a0) + +#define POLY1305_BLOCK_SIZE 16 +#define POLY1305_STACK_SIZE 8 * 4 + +.set reorder +.set noat +.align 4 +.globl poly1305_blocks_mips +.ent poly1305_blocks_mips +poly1305_blocks_mips: + .frame $sp,POLY1305_STACK_SIZE,$31 + /* srclen &= 0xFFFFFFF0 */ + ins srclen, $zero, 0, 4 + + .set noreorder + /* check srclen >= 16 bytes */ + beqz srclen, .Lpoly1305_blocks_mips_end + addiu $sp, -(POLY1305_STACK_SIZE) + .set reorder + + /* Calculate last round based on src address pointer. + * last round src ptr (srclen) = src + (srclen & 0xFFFFFFF0) + */ + addu srclen, src + + lw R0, PTR_POLY1305_R(0) + lw R1, PTR_POLY1305_R(1) + lw R2, PTR_POLY1305_R(2) + lw R3, PTR_POLY1305_R(3) + + /* store the used save registers. */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + + /* load Hx and Carry */ + lw CA, PTR_POLY1305_CA + lw H0, PTR_POLY1305_H(0) + lw H1, PTR_POLY1305_H(1) + lw H2, PTR_POLY1305_H(2) + lw H3, PTR_POLY1305_H(3) + lw H4, PTR_POLY1305_H(4) + + /* Sx = Rx + (Rx >> 2) */ + srl S1, R1, 2 + srl S2, R2, 2 + srl S3, R3, 2 + addu S1, R1 + addu S2, R2 + addu S3, R3 + + addiu SC, $zero, 1 + +.Lpoly1305_loop: + lwl O0, 0+MSB(src) + lwl O1, 4+MSB(src) + lwl O2, 8+MSB(src) + lwl O3,12+MSB(src) + lwr O0, 0+LSB(src) + lwr O1, 4+LSB(src) + lwr O2, 8+LSB(src) + lwr O3,12+LSB(src) + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + wsbh O0 + wsbh O1 + wsbh O2 + wsbh O3 + rotr O0, 16 + rotr O1, 16 + rotr O2, 16 + rotr O3, 16 +#endif + + /* h0 = (u32)(d0 = (u64)h0 + inp[0] + c 'Carry_previous cycle'); */ + addu H0, CA + sltu CA, H0, CA + addu O0, H0 + sltu H0, O0, H0 + addu CA, H0 + + /* h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + inp[4]); */ + addu H1, CA + sltu CA, H1, CA + addu O1, H1 + sltu H1, O1, H1 + addu CA, H1 + + /* h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + inp[8]); */ + addu H2, CA + sltu CA, H2, CA + addu O2, H2 + sltu H2, O2, H2 + addu CA, H2 + + /* h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + inp[12]); */ + addu H3, CA + sltu CA, H3, CA + addu O3, H3 + sltu H3, O3, H3 + addu CA, H3 + + /* h4 += (u32)(d3 >> 32) + padbit; */ + addu H4, hibit + addu O4, H4, CA + + /* D0 */ + multu O0, R0 + maddu O1, S3 + maddu O2, S2 + maddu O3, S1 + mfhi CA + mflo H0 + + /* D1 */ + multu O0, R1 + maddu O1, R0 + maddu O2, S3 + maddu O3, S2 + maddu O4, S1 + maddu CA, SC + mfhi CA + mflo H1 + + /* D2 */ + multu O0, R2 + maddu O1, R1 + maddu O2, R0 + maddu O3, S3 + maddu O4, S2 + maddu CA, SC + mfhi CA + mflo H2 + + /* D4 */ + mul H4, O4, R0 + + /* D3 */ + multu O0, R3 + maddu O1, R2 + maddu O2, R1 + maddu O3, R0 + maddu O4, S3 + maddu CA, SC + mfhi CA + mflo H3 + + addiu src, POLY1305_BLOCK_SIZE + + /* h4 += (u32)(d3 >> 32); */ + addu O4, H4, CA + /* h4 &= 3 */ + andi H4, O4, 3 + /* c = (h4 >> 2) + (h4 & ~3U); */ + srl CA, O4, 2 + ins O4, $zero, 0, 2 + + /* able to do a 16 byte block. */ + .set noreorder + bne src, srclen, .Lpoly1305_loop + /* Delay slot is always executed. */ + addu CA, O4 + .set reorder + + /* restore the used save registers. */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + + /* store Hx and Carry */ + sw CA, PTR_POLY1305_CA + sw H0, PTR_POLY1305_H(0) + sw H1, PTR_POLY1305_H(1) + sw H2, PTR_POLY1305_H(2) + sw H3, PTR_POLY1305_H(3) + sw H4, PTR_POLY1305_H(4) + +.Lpoly1305_blocks_mips_end: + /* Jump Back */ + .set noreorder + jr $ra + addiu $sp, POLY1305_STACK_SIZE + .set reorder +.end poly1305_blocks_mips +.set at +.set reorder + +/* Input arguments CTX=$a0, MAC=$a1, NONCE=$a2 */ +#define MAC $a1 +#define NONCE $a2 + +#define G0 $t5 +#define G1 $t6 +#define G2 $t7 +#define G3 $t8 +#define G4 $t9 + +.set reorder +.set noat +.align 4 +.globl poly1305_emit_mips +.ent poly1305_emit_mips +poly1305_emit_mips: + /* load Hx and Carry */ + lw CA, PTR_POLY1305_CA + lw H0, PTR_POLY1305_H(0) + lw H1, PTR_POLY1305_H(1) + lw H2, PTR_POLY1305_H(2) + lw H3, PTR_POLY1305_H(3) + lw H4, PTR_POLY1305_H(4) + + /* Add left over carry */ + addu H0, CA + sltu CA, H0, CA + addu H1, CA + sltu CA, H1, CA + addu H2, CA + sltu CA, H2, CA + addu H3, CA + sltu CA, H3, CA + addu H4, CA + + /* compare to modulus by computing h + -p */ + addiu G0, H0, 5 + sltu CA, G0, H0 + addu G1, H1, CA + sltu CA, G1, H1 + addu G2, H2, CA + sltu CA, G2, H2 + addu G3, H3, CA + sltu CA, G3, H3 + addu G4, H4, CA + + srl SC, G4, 2 + + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ + movn H0, G0, SC + movn H1, G1, SC + movn H2, G2, SC + movn H3, G3, SC + + lwl G0, 0+MSB(NONCE) + lwl G1, 4+MSB(NONCE) + lwl G2, 8+MSB(NONCE) + lwl G3,12+MSB(NONCE) + lwr G0, 0+LSB(NONCE) + lwr G1, 4+LSB(NONCE) + lwr G2, 8+LSB(NONCE) + lwr G3,12+LSB(NONCE) + + /* mac = (h + nonce) % (2^128) */ + addu H0, G0 + sltu CA, H0, G0 + + /* H1 */ + addu H1, CA + sltu CA, H1, CA + addu H1, G1 + sltu G1, H1, G1 + addu CA, G1 + + /* H2 */ + addu H2, CA + sltu CA, H2, CA + addu H2, G2 + sltu G2, H2, G2 + addu CA, G2 + + /* H3 */ + addu H3, CA + addu H3, G3 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + wsbh H0 + wsbh H1 + wsbh H2 + wsbh H3 + rotr H0, 16 + rotr H1, 16 + rotr H2, 16 + rotr H3, 16 +#endif + + /* store MAC */ + swl H0, 0+MSB(MAC) + swl H1, 4+MSB(MAC) + swl H2, 8+MSB(MAC) + swl H3,12+MSB(MAC) + swr H0, 0+LSB(MAC) + swr H1, 4+LSB(MAC) + swr H2, 8+LSB(MAC) + .set noreorder + jr $ra + swr H3,12+LSB(MAC) + .set reorder +.end poly1305_emit_mips + +#define PR0 $t0 +#define PR1 $t1 +#define PR2 $t2 +#define PR3 $t3 +#define PT0 $t4 + +/* Input arguments CTX=$a0, KEY=$a1 */ + +.align 4 +.globl poly1305_init_mips +.ent poly1305_init_mips +poly1305_init_mips: + lwl PR0, 0+MSB($a1) + lwl PR1, 4+MSB($a1) + lwl PR2, 8+MSB($a1) + lwl PR3,12+MSB($a1) + lwr PR0, 0+LSB($a1) + lwr PR1, 4+LSB($a1) + lwr PR2, 8+LSB($a1) + lwr PR3,12+LSB($a1) + + /* store Hx and Carry */ + sw $zero, PTR_POLY1305_CA + sw $zero, PTR_POLY1305_H(0) + sw $zero, PTR_POLY1305_H(1) + sw $zero, PTR_POLY1305_H(2) + sw $zero, PTR_POLY1305_H(3) + sw $zero, PTR_POLY1305_H(4) + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + wsbh PR0 + wsbh PR1 + wsbh PR2 + wsbh PR3 + rotr PR0, 16 + rotr PR1, 16 + rotr PR2, 16 + rotr PR3, 16 +#endif + + lui PT0, 0x0FFF + ori PT0, 0xFFFC + + /* AND 0x0fffffff; */ + ext PR0, PR0, 0, (32-4) + + /* AND 0x0ffffffc; */ + and PR1, PT0 + and PR2, PT0 + and PR3, PT0 + + /* store Rx */ + sw PR0, PTR_POLY1305_R(0) + sw PR1, PTR_POLY1305_R(1) + sw PR2, PTR_POLY1305_R(2) + + .set noreorder + /* Jump Back */ + jr $ra + sw PR3, PTR_POLY1305_R(3) + .set reorder +.end poly1305_init_mips diff --git a/src/crypto/zinc/poly1305/poly1305-mips64.S b/src/crypto/zinc/poly1305/poly1305-mips64.S new file mode 100644 index 0000000..1a45fbe --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-mips64.S @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + */ + +#if !defined(CONFIG_64BIT) +#error "This is only for 64-bit kernels." +#endif + +#ifdef __MIPSEB__ +#define MSB 0 +#define LSB 7 +#else +#define MSB 7 +#define LSB 0 +#endif + +#if defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6) +#define dmultu(rs,rt) +#define mflo(rd,rs,rt) dmulu rd,rs,rt +#define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +#define dmultu(rs,rt) dmultu rs,rt +#define multu(rs,rt) multu rs,rt +#define mflo(rd,rs,rt) mflo rd +#define mfhi(rd,rs,rt) mfhi rd +#endif + +.text +.set noat +.set noreorder + +/* While most of the assembly in the kernel prefers ENTRY() and ENDPROC(), + * there is no existing MIPS assembly that uses it, and MIPS assembler seems + * to like its own .ent/.end notation, which the MIPS include files don't + * provide in a MIPS-specific ENTRY/ENDPROC definition. So, we skip these + * for now, until somebody complains. */ + +.align 5 +.globl poly1305_init_mips +.ent poly1305_init_mips +poly1305_init_mips: + .frame $29,0,$31 + .set reorder + + sd $0,0($4) + sd $0,8($4) + sd $0,16($4) + + beqz $5,.Lno_key + +#if defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6) + ld $8,0($5) + ld $9,8($5) +#else + ldl $8,0+MSB($5) + ldl $9,8+MSB($5) + ldr $8,0+LSB($5) + ldr $9,8+LSB($5) +#endif +#ifdef __MIPSEB__ +#if defined(CONFIG_CPU_MIPS64_R2) || defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6) + dsbh $8,$8 # byte swap + dsbh $9,$9 + dshd $8,$8 + dshd $9,$9 +#else + ori $10,$0,0xFF + dsll $1,$10,32 + or $10,$1 # 0x000000FF000000FF + + and $11,$8,$10 # byte swap + and $2,$9,$10 + dsrl $1,$8,24 + dsrl $24,$9,24 + dsll $11,24 + dsll $2,24 + and $1,$10 + and $24,$10 + dsll $10,8 # 0x0000FF000000FF00 + or $11,$1 + or $2,$24 + and $1,$8,$10 + and $24,$9,$10 + dsrl $8,8 + dsrl $9,8 + dsll $1,8 + dsll $24,8 + and $8,$10 + and $9,$10 + or $11,$1 + or $2,$24 + or $8,$11 + or $9,$2 + dsrl $11,$8,32 + dsrl $2,$9,32 + dsll $8,32 + dsll $9,32 + or $8,$11 + or $9,$2 +#endif +#endif + li $10,1 + dsll $10,32 + daddiu $10,-63 + dsll $10,28 + daddiu $10,-1 # 0ffffffc0fffffff + + and $8,$10 + daddiu $10,-3 # 0ffffffc0ffffffc + and $9,$10 + + sd $8,24($4) + dsrl $10,$9,2 + sd $9,32($4) + daddu $10,$9 # s1 = r1 + (r1 >> 2) + sd $10,40($4) + +.Lno_key: + li $2,0 # return 0 + jr $31 +.end poly1305_init_mips + +.align 5 +.globl poly1305_blocks_mips +.ent poly1305_blocks_mips +poly1305_blocks_mips: + .set noreorder + dsrl $6,4 # number of complete blocks + bnez $6,poly1305_blocks_internal + nop + jr $31 + nop +.end poly1305_blocks_mips + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .frame $29,6*8,$31 + .mask 0x00030000,-8 + .set noreorder + dsubu $29,6*8 + sd $17,40($29) + sd $16,32($29) + .set reorder + + ld $12,0($4) # load hash value + ld $13,8($4) + ld $14,16($4) + + ld $15,24($4) # load key + ld $16,32($4) + ld $17,40($4) + +.Loop: +#if defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6) + ld $8,0($5) # load input + ld $9,8($5) +#else + ldl $8,0+MSB($5) # load input + ldl $9,8+MSB($5) + ldr $8,0+LSB($5) + ldr $9,8+LSB($5) +#endif + daddiu $6,-1 + daddiu $5,16 +#ifdef __MIPSEB__ +#if defined(CONFIG_CPU_MIPS64_R2) || defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPS64_R6) || defined(CONFIG_CPU_MIPSR6) + dsbh $8,$8 # byte swap + dsbh $9,$9 + dshd $8,$8 + dshd $9,$9 +#else + ori $10,$0,0xFF + dsll $1,$10,32 + or $10,$1 # 0x000000FF000000FF + + and $11,$8,$10 # byte swap + and $2,$9,$10 + dsrl $1,$8,24 + dsrl $24,$9,24 + dsll $11,24 + dsll $2,24 + and $1,$10 + and $24,$10 + dsll $10,8 # 0x0000FF000000FF00 + or $11,$1 + or $2,$24 + and $1,$8,$10 + and $24,$9,$10 + dsrl $8,8 + dsrl $9,8 + dsll $1,8 + dsll $24,8 + and $8,$10 + and $9,$10 + or $11,$1 + or $2,$24 + or $8,$11 + or $9,$2 + dsrl $11,$8,32 + dsrl $2,$9,32 + dsll $8,32 + dsll $9,32 + or $8,$11 + or $9,$2 +#endif +#endif + daddu $12,$8 # accumulate input + daddu $13,$9 + sltu $10,$12,$8 + sltu $11,$13,$9 + daddu $13,$10 + + dmultu ($15,$12) # h0*r0 + daddu $14,$7 + sltu $10,$13,$10 + mflo ($8,$15,$12) + mfhi ($9,$15,$12) + + dmultu ($17,$13) # h1*5*r1 + daddu $10,$11 + daddu $14,$10 + mflo ($10,$17,$13) + mfhi ($11,$17,$13) + + dmultu ($16,$12) # h0*r1 + daddu $8,$10 + daddu $9,$11 + mflo ($1,$16,$12) + mfhi ($25,$16,$12) + sltu $10,$8,$10 + daddu $9,$10 + + dmultu ($15,$13) # h1*r0 + daddu $9,$1 + sltu $1,$9,$1 + mflo ($10,$15,$13) + mfhi ($11,$15,$13) + daddu $25,$1 + + dmultu ($17,$14) # h2*5*r1 + daddu $9,$10 + daddu $25,$11 + mflo ($1,$17,$14) + + dmultu ($15,$14) # h2*r0 + sltu $10,$9,$10 + daddu $25,$10 + mflo ($2,$15,$14) + + daddu $9,$1 + daddu $25,$2 + sltu $1,$9,$1 + daddu $25,$1 + + li $10,-4 # final reduction + and $10,$25 + dsrl $11,$25,2 + andi $14,$25,3 + daddu $10,$11 + daddu $12,$8,$10 + sltu $10,$12,$10 + daddu $13,$9,$10 + sltu $10,$13,$10 + daddu $14,$14,$10 + + bnez $6,.Loop + + sd $12,0($4) # store hash value + sd $13,8($4) + sd $14,16($4) + + .set noreorder + ld $17,40($29) # epilogue + ld $16,32($29) + jr $31 + daddu $29,6*8 +.end poly1305_blocks_internal + +.align 5 +.globl poly1305_emit_mips +.ent poly1305_emit_mips +poly1305_emit_mips: + .frame $29,0,$31 + .set reorder + + ld $10,0($4) + ld $11,8($4) + ld $1,16($4) + + daddiu $8,$10,5 # compare to modulus + sltiu $2,$8,5 + daddu $9,$11,$2 + sltu $2,$9,$2 + daddu $1,$1,$2 + + dsrl $1,2 # see if it carried/borrowed + dsubu $1,$0,$1 + nor $2,$0,$1 + + and $8,$1 + and $10,$2 + and $9,$1 + and $11,$2 + or $8,$10 + or $9,$11 + + lwu $10,0($6) # load nonce + lwu $11,4($6) + lwu $1,8($6) + lwu $2,12($6) + dsll $11,32 + dsll $2,32 + or $10,$11 + or $1,$2 + + daddu $8,$10 # accumulate nonce + daddu $9,$1 + sltu $10,$8,$10 + daddu $9,$10 + + dsrl $10,$8,8 # write mac value + dsrl $11,$8,16 + dsrl $1,$8,24 + sb $8,0($5) + dsrl $2,$8,32 + sb $10,1($5) + dsrl $10,$8,40 + sb $11,2($5) + dsrl $11,$8,48 + sb $1,3($5) + dsrl $1,$8,56 + sb $2,4($5) + dsrl $2,$9,8 + sb $10,5($5) + dsrl $10,$9,16 + sb $11,6($5) + dsrl $11,$9,24 + sb $1,7($5) + + sb $9,8($5) + dsrl $1,$9,32 + sb $2,9($5) + dsrl $2,$9,40 + sb $10,10($5) + dsrl $10,$9,48 + sb $11,11($5) + dsrl $11,$9,56 + sb $1,12($5) + sb $2,13($5) + sb $10,14($5) + sb $11,15($5) + + jr $31 +.end poly1305_emit_mips diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64-glue.h b/src/crypto/zinc/poly1305/poly1305-x86_64-glue.h new file mode 100644 index 0000000..e24bee7 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-x86_64-glue.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include +#include +#include + +asmlinkage void poly1305_init_x86_64(void *ctx, + const u8 key[POLY1305_KEY_SIZE]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4]); +#ifdef CONFIG_AS_AVX +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +#endif +#ifdef CONFIG_AS_AVX2 +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, + const u32 padbit); +#endif +#ifdef CONFIG_AS_AVX512 +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, + const size_t len, const u32 padbit); +#endif + +static bool poly1305_use_avx __ro_after_init; +static bool poly1305_use_avx2 __ro_after_init; +static bool poly1305_use_avx512 __ro_after_init; + +void __init poly1305_fpu_init(void) +{ + poly1305_use_avx = + boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); + poly1305_use_avx2 = + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifndef COMPAT_CANNOT_USE_AVX512 + poly1305_use_avx512 = + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm. */ + boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X; +#endif +} + +static inline bool poly1305_init_arch(void *ctx, + const u8 key[POLY1305_KEY_SIZE], + simd_context_t simd_context) +{ + poly1305_init_x86_64(ctx, key); + return true; +} + +static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, + const size_t len, const u32 padbit, + simd_context_t simd_context) +{ +#ifdef CONFIG_AS_AVX512 + if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD) + poly1305_blocks_avx512(ctx, inp, len, padbit); + else +#endif +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD) + poly1305_blocks_avx2(ctx, inp, len, padbit); + else +#endif +#ifdef CONFIG_AS_AVX + if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD) + poly1305_blocks_avx(ctx, inp, len, padbit); + else +#endif + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return true; +} + +static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4], + simd_context_t simd_context) +{ +#ifdef CONFIG_AS_AVX512 + if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD) + poly1305_emit_avx(ctx, mac, nonce); + else +#endif +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD) + poly1305_emit_avx(ctx, mac, nonce); + else +#endif +#ifdef CONFIG_AS_AVX + if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD) + poly1305_emit_avx(ctx, mac, nonce); + else +#endif + poly1305_emit_x86_64(ctx, mac, nonce); + return true; +} + +#define HAVE_POLY1305_ARCH_IMPLEMENTATION diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64.S b/src/crypto/zinc/poly1305/poly1305-x86_64.S new file mode 100644 index 0000000..4d51f40 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-x86_64.S @@ -0,0 +1,2792 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2017 Samuel Neves . All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * This is based in part on Andy Polyakov's implementation from OpenSSL. + */ + +#include + +.section .rodata.cst192.Lconst, "aM", @progbits, 192 +.align 64 +.Lconst: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.long 16777216,0,16777216,0,16777216,0,16777216,0 +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.long 2,2,2,3,2,0,2,1 +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.text + +.align 32 +ENTRY(poly1305_init_x86_64) + xorq %rax,%rax + movq %rax,0(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + + cmpq $0,%rsi + je .Lno_key + + movq $0x0ffffffc0fffffff,%rax + movq $0x0ffffffc0ffffffc,%rcx + andq 0(%rsi),%rax + andq 8(%rsi),%rcx + movq %rax,24(%rdi) + movq %rcx,32(%rdi) + movl $1,%eax +.Lno_key: + ret +ENDPROC(poly1305_init_x86_64) + +.align 32 +ENTRY(poly1305_blocks_x86_64) +.Lblocks: + shrq $4,%rdx + jz .Lno_data + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movq 16(%rdi),%r10 + + movq %r13,%r12 + shrq $2,%r13 + movq %r12,%rax + addq %r12,%r13 + jmp .Loop + +.align 32 +.Loop: + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + mulq %r14 + movq %rax,%r9 + movq %r11,%rax + movq %rdx,%rdi + + mulq %r14 + movq %rax,%r14 + movq %r11,%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r9 + movq %r13,%rax + adcq %rdx,%rdi + + mulq %rbx + movq %r10,%rbx + addq %rax,%r14 + adcq %rdx,%r8 + + imulq %r13,%rbx + addq %rbx,%r9 + movq %r8,%rbx + adcq $0,%rdi + + imulq %r11,%r10 + addq %r9,%rbx + movq $-4,%rax + adcq %r10,%rdi + + andq %rdi,%rax + movq %rdi,%r10 + shrq $2,%rdi + andq $3,%r10 + addq %rdi,%rax + addq %rax,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r12,%rax + decq %r15 + jnz .Loop + + movq 0(%rsp),%rdi + + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp +.Lno_data: +.Lblocks_epilogue: + ret +ENDPROC(poly1305_blocks_x86_64) + +.align 32 +ENTRY(poly1305_emit_x86_64) +.Lemit: + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + ret +ENDPROC(poly1305_emit_x86_64) + +.macro __poly1305_block + mulq %r14 + movq %rax,%r9 + movq %r11,%rax + movq %rdx,%rdi + + mulq %r14 + movq %rax,%r14 + movq %r11,%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r9 + movq %r13,%rax + adcq %rdx,%rdi + + mulq %rbx + movq %r10,%rbx + addq %rax,%r14 + adcq %rdx,%r8 + + imulq %r13,%rbx + addq %rbx,%r9 + movq %r8,%rbx + adcq $0,%rdi + + imulq %r11,%r10 + addq %r9,%rbx + movq $-4,%rax + adcq %r10,%rdi + + andq %rdi,%rax + movq %rdi,%r10 + shrq $2,%rdi + andq $3,%r10 + addq %rdi,%rax + addq %rax,%r14 + adcq $0,%rbx + adcq $0,%r10 +.endm + +.macro __poly1305_init_avx + movq %r11,%r14 + movq %r12,%rbx + xorq %r10,%r10 + + leaq 48+64(%rdi),%rdi + + movq %r12,%rax + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + movq %r14,%r8 + andl %r14d,%eax + movq %r11,%r9 + andl %r11d,%edx + movl %eax,-64(%rdi) + shrq $26,%r8 + movl %edx,-60(%rdi) + shrq $26,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + andl %r8d,%eax + andl %r9d,%edx + movl %eax,-48(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-44(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,-32(%rdi) + shrq $26,%r8 + movl %edx,-28(%rdi) + shrq $26,%r9 + + movq %rbx,%rax + movq %r12,%rdx + shlq $12,%rax + shlq $12,%rdx + orq %r8,%rax + orq %r9,%rdx + andl $0x3ffffff,%eax + andl $0x3ffffff,%edx + movl %eax,-16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-12(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,0(%rdi) + movq %rbx,%r8 + movl %edx,4(%rdi) + movq %r12,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + shrq $14,%r8 + shrq $14,%r9 + andl %r8d,%eax + andl %r9d,%edx + movl %eax,16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,20(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,32(%rdi) + shrq $26,%r8 + movl %edx,36(%rdi) + shrq $26,%r9 + + movq %r10,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,48(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r9d,52(%rdi) + leaq (%r9,%r9,4),%r9 + movl %r8d,64(%rdi) + movl %r9d,68(%rdi) + + movq %r12,%rax + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-52(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-36(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-20(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-4(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,12(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,28(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,44(%rdi) + + movq %r10,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,60(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,76(%rdi) + + movq %r12,%rax + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-56(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-40(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-24(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-8(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,8(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,24(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,40(%rdi) + + movq %r10,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,56(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,72(%rdi) + + leaq -48-64(%rdi),%rdi +.endm + +#ifdef CONFIG_AS_AVX +.align 32 +ENTRY(poly1305_blocks_avx) + + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + andq $-16,%rdx + jz .Lno_data_avx + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx + + testq $31,%rdx + jz .Leven_avx + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_avx_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%r10d + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %r10,%r8 + shlq $40,%r8 + shrq $24,%r10 + addq %r8,%rbx + adcq $0,%r10 + + movq $-4,%r9 + movq %r10,%r8 + andq %r10,%r9 + shrq $2,%r8 + andq $3,%r10 + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + + testq %rcx,%rcx + jz .Lstore_base2_64_avx + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%r10 + + subq $16,%r15 + jz .Lstore_base2_26_avx + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %r10d,16(%rdi) +.align 16 +.Ldone_avx: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + +.Lno_data_avx: +.Lblocks_avx_epilogue: + ret + +.align 32 +.Lbase2_64_avx: + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lbase2_64_avx_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%r10d + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $31,%rdx + jz .Linit_avx + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + +.Linit_avx: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%r10 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + movl $1,20(%rdi) + + __poly1305_init_avx + +.Lproceed_avx: + movq %r15,%rdx + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp + +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx + + +.align 32 +.Leven_avx: + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx: + leaq 8(%rsp),%r10 + andq $-32,%rsp + subq $8,%rsp + leaq -88(%rsp),%r11 + subq $0x178,%rsp + subq $64,%rdx + leaq -32(%rsi),%rax + cmovcq %rax,%rsi + + vmovdqu 48(%rdi),%xmm14 + leaq 112(%rdi),%rdi + leaq .Lconst(%rip),%rcx + + vmovdqu 32(%rsi),%xmm5 + vmovdqu 48(%rsi),%xmm6 + vmovdqa 64(%rcx),%xmm15 + + vpsrldq $6,%xmm5,%xmm7 + vpsrldq $6,%xmm6,%xmm8 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + vpsrlq $40,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + jbe .Lskip_loop_avx + + + vmovdqu -48(%rdi),%xmm11 + vmovdqu -32(%rdi),%xmm12 + vpshufd $0xEE,%xmm14,%xmm13 + vpshufd $0x44,%xmm14,%xmm10 + vmovdqa %xmm13,-144(%r11) + vmovdqa %xmm10,0(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vmovdqu -16(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-128(%r11) + vmovdqa %xmm11,16(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqu 0(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-112(%r11) + vmovdqa %xmm12,32(%rsp) + vpshufd $0xEE,%xmm10,%xmm14 + vmovdqu 16(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm14,-96(%r11) + vmovdqa %xmm10,48(%rsp) + vpshufd $0xEE,%xmm11,%xmm13 + vmovdqu 32(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm13,-80(%r11) + vmovdqa %xmm11,64(%rsp) + vpshufd $0xEE,%xmm12,%xmm14 + vmovdqu 48(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm14,-64(%r11) + vmovdqa %xmm12,80(%rsp) + vpshufd $0xEE,%xmm10,%xmm13 + vmovdqu 64(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm13,-48(%r11) + vmovdqa %xmm10,96(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-32(%r11) + vmovdqa %xmm11,112(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqa 0(%rsp),%xmm14 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-16(%r11) + vmovdqa %xmm12,128(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + + vpmuludq %xmm5,%xmm14,%xmm10 + vpmuludq %xmm6,%xmm14,%xmm11 + vmovdqa %xmm2,32(%r11) + vpmuludq %xmm7,%xmm14,%xmm12 + vmovdqa 16(%rsp),%xmm2 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vmovdqa %xmm0,0(%r11) + vpmuludq 32(%rsp),%xmm9,%xmm0 + vmovdqa %xmm1,16(%r11) + vpmuludq %xmm8,%xmm2,%xmm1 + vpaddq %xmm0,%xmm10,%xmm10 + vpaddq %xmm1,%xmm14,%xmm14 + vmovdqa %xmm3,48(%r11) + vpmuludq %xmm7,%xmm2,%xmm0 + vpmuludq %xmm6,%xmm2,%xmm1 + vpaddq %xmm0,%xmm13,%xmm13 + vmovdqa 48(%rsp),%xmm3 + vpaddq %xmm1,%xmm12,%xmm12 + vmovdqa %xmm4,64(%r11) + vpmuludq %xmm5,%xmm2,%xmm2 + vpmuludq %xmm7,%xmm3,%xmm0 + vpaddq %xmm2,%xmm11,%xmm11 + + vmovdqa 64(%rsp),%xmm4 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm3,%xmm1 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm1,%xmm13,%xmm13 + vmovdqa 80(%rsp),%xmm2 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm4,%xmm0 + vpmuludq %xmm8,%xmm4,%xmm4 + vpaddq %xmm0,%xmm11,%xmm11 + vmovdqa 96(%rsp),%xmm3 + vpaddq %xmm4,%xmm10,%xmm10 + + vmovdqa 128(%rsp),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm1 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm1,%xmm14,%xmm14 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq %xmm9,%xmm3,%xmm0 + vpmuludq %xmm8,%xmm3,%xmm1 + vpaddq %xmm0,%xmm12,%xmm12 + vmovdqu 0(%rsi),%xmm0 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm3,%xmm3 + vpmuludq %xmm7,%xmm4,%xmm7 + vpaddq %xmm3,%xmm10,%xmm10 + + vmovdqu 16(%rsi),%xmm1 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm8,%xmm4,%xmm8 + vpmuludq %xmm9,%xmm4,%xmm9 + vpsrldq $6,%xmm0,%xmm2 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm9,%xmm13,%xmm13 + vpsrldq $6,%xmm1,%xmm3 + vpmuludq 112(%rsp),%xmm5,%xmm9 + vpmuludq %xmm6,%xmm4,%xmm5 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpaddq %xmm9,%xmm14,%xmm14 + vmovdqa -144(%r11),%xmm9 + vpaddq %xmm5,%xmm10,%xmm10 + + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + + vpsrldq $5,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpand 0(%rcx),%xmm4,%xmm4 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + leaq 32(%rsi),%rax + leaq 64(%rsi),%rsi + subq $64,%rdx + cmovcq %rax,%rsi + + vpmuludq %xmm0,%xmm9,%xmm5 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vmovdqa -128(%r11),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm5 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm5,%xmm12,%xmm12 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpmuludq -112(%r11),%xmm4,%xmm5 + vpaddq %xmm9,%xmm14,%xmm14 + + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm2,%xmm7,%xmm6 + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -96(%r11),%xmm8 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm7,%xmm6 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm6,%xmm12,%xmm12 + vpaddq %xmm7,%xmm11,%xmm11 + + vmovdqa -80(%r11),%xmm9 + vpmuludq %xmm2,%xmm8,%xmm5 + vpmuludq %xmm1,%xmm8,%xmm6 + vpaddq %xmm5,%xmm14,%xmm14 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -64(%r11),%xmm7 + vpmuludq %xmm0,%xmm8,%xmm8 + vpmuludq %xmm4,%xmm9,%xmm5 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm5,%xmm11,%xmm11 + vmovdqa -48(%r11),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm9 + vpmuludq %xmm1,%xmm7,%xmm6 + vpaddq %xmm9,%xmm10,%xmm10 + + vmovdqa -16(%r11),%xmm9 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm7,%xmm7 + vpmuludq %xmm4,%xmm8,%xmm5 + vpaddq %xmm7,%xmm13,%xmm13 + vpaddq %xmm5,%xmm12,%xmm12 + vmovdqu 32(%rsi),%xmm5 + vpmuludq %xmm3,%xmm8,%xmm7 + vpmuludq %xmm2,%xmm8,%xmm8 + vpaddq %xmm7,%xmm11,%xmm11 + vmovdqu 48(%rsi),%xmm6 + vpaddq %xmm8,%xmm10,%xmm10 + + vpmuludq %xmm2,%xmm9,%xmm2 + vpmuludq %xmm3,%xmm9,%xmm3 + vpsrldq $6,%xmm5,%xmm7 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm9,%xmm4 + vpsrldq $6,%xmm6,%xmm8 + vpaddq %xmm3,%xmm12,%xmm2 + vpaddq %xmm4,%xmm13,%xmm3 + vpmuludq -32(%r11),%xmm0,%xmm4 + vpmuludq %xmm1,%xmm9,%xmm0 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpaddq %xmm4,%xmm14,%xmm4 + vpaddq %xmm0,%xmm10,%xmm0 + + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + + vpsrldq $5,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vmovdqa 0(%rsp),%xmm14 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpand 0(%rcx),%xmm9,%xmm9 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm11,%xmm1 + + vpsrlq $26,%xmm4,%xmm10 + vpand %xmm15,%xmm4,%xmm4 + + vpsrlq $26,%xmm1,%xmm11 + vpand %xmm15,%xmm1,%xmm1 + vpaddq %xmm11,%xmm2,%xmm2 + + vpaddq %xmm10,%xmm0,%xmm0 + vpsllq $2,%xmm10,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + + vpsrlq $26,%xmm2,%xmm12 + vpand %xmm15,%xmm2,%xmm2 + vpaddq %xmm12,%xmm3,%xmm3 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm1,%xmm1 + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + ja .Loop_avx + +.Lskip_loop_avx: + vpshufd $0x10,%xmm14,%xmm14 + addq $32,%rdx + jnz .Long_tail_avx + + vpaddq %xmm2,%xmm7,%xmm7 + vpaddq %xmm0,%xmm5,%xmm5 + vpaddq %xmm1,%xmm6,%xmm6 + vpaddq %xmm3,%xmm8,%xmm8 + vpaddq %xmm4,%xmm9,%xmm9 + +.Long_tail_avx: + vmovdqa %xmm2,32(%r11) + vmovdqa %xmm0,0(%r11) + vmovdqa %xmm1,16(%r11) + vmovdqa %xmm3,48(%r11) + vmovdqa %xmm4,64(%r11) + + vpmuludq %xmm7,%xmm14,%xmm12 + vpmuludq %xmm5,%xmm14,%xmm10 + vpshufd $0x10,-48(%rdi),%xmm2 + vpmuludq %xmm6,%xmm14,%xmm11 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm8,%xmm2,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpshufd $0x10,-32(%rdi),%xmm3 + vpmuludq %xmm7,%xmm2,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpshufd $0x10,-16(%rdi),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm9,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + vpshufd $0x10,0(%rdi),%xmm2 + vpmuludq %xmm7,%xmm4,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm4,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpshufd $0x10,16(%rdi),%xmm3 + vpmuludq %xmm5,%xmm4,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm2,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpshufd $0x10,32(%rdi),%xmm4 + vpmuludq %xmm8,%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + + vpmuludq %xmm6,%xmm3,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpshufd $0x10,48(%rdi),%xmm2 + vpmuludq %xmm9,%xmm4,%xmm1 + vpaddq %xmm1,%xmm12,%xmm12 + vpshufd $0x10,64(%rdi),%xmm3 + vpmuludq %xmm8,%xmm4,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq %xmm9,%xmm3,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpmuludq %xmm8,%xmm3,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm7,%xmm3,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm6,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + jz .Lshort_tail_avx + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + + vpsrldq $6,%xmm0,%xmm2 + vpsrldq $6,%xmm1,%xmm3 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + vpsrlq $40,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpshufd $0x32,-64(%rdi),%xmm9 + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + vpmuludq %xmm0,%xmm9,%xmm5 + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpshufd $0x32,-48(%rdi),%xmm7 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpaddq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpshufd $0x32,-32(%rdi),%xmm8 + vpmuludq %xmm2,%xmm7,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpshufd $0x32,-16(%rdi),%xmm9 + vpmuludq %xmm1,%xmm7,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + + vpshufd $0x32,0(%rdi),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm6 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm9,%xmm5 + vpaddq %xmm5,%xmm13,%xmm13 + vpshufd $0x32,16(%rdi),%xmm8 + vpmuludq %xmm0,%xmm9,%xmm9 + vpaddq %xmm9,%xmm12,%xmm12 + vpmuludq %xmm4,%xmm7,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpshufd $0x32,32(%rdi),%xmm9 + vpmuludq %xmm3,%xmm7,%xmm7 + vpaddq %xmm7,%xmm10,%xmm10 + + vpmuludq %xmm1,%xmm8,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm8,%xmm8 + vpaddq %xmm8,%xmm13,%xmm13 + vpshufd $0x32,48(%rdi),%xmm7 + vpmuludq %xmm4,%xmm9,%xmm6 + vpaddq %xmm6,%xmm12,%xmm12 + vpshufd $0x32,64(%rdi),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm5 + vpaddq %xmm5,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm9 + vpaddq %xmm9,%xmm10,%xmm10 + + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm14,%xmm14 + vpmuludq %xmm4,%xmm8,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm3,%xmm8,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm2,%xmm8,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm1,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + +.Lshort_tail_avx: + + vpsrldq $8,%xmm14,%xmm9 + vpsrldq $8,%xmm13,%xmm8 + vpsrldq $8,%xmm11,%xmm6 + vpsrldq $8,%xmm10,%xmm5 + vpsrldq $8,%xmm12,%xmm7 + vpaddq %xmm8,%xmm13,%xmm13 + vpaddq %xmm9,%xmm14,%xmm14 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vpaddq %xmm7,%xmm12,%xmm12 + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm14,%xmm4 + vpand %xmm15,%xmm14,%xmm14 + + vpsrlq $26,%xmm11,%xmm1 + vpand %xmm15,%xmm11,%xmm11 + vpaddq %xmm1,%xmm12,%xmm12 + + vpaddq %xmm4,%xmm10,%xmm10 + vpsllq $2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpsrlq $26,%xmm12,%xmm2 + vpand %xmm15,%xmm12,%xmm12 + vpaddq %xmm2,%xmm13,%xmm13 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vmovd %xmm10,-112(%rdi) + vmovd %xmm11,-108(%rdi) + vmovd %xmm12,-104(%rdi) + vmovd %xmm13,-100(%rdi) + vmovd %xmm14,-96(%rdi) + leaq -8(%r10),%rsp + + vzeroupper + ret +ENDPROC(poly1305_blocks_avx) + +.align 32 +ENTRY(poly1305_emit_avx) + cmpl $0,20(%rdi) + je .Lemit + + movl 0(%rdi),%eax + movl 4(%rdi),%ecx + movl 8(%rdi),%r8d + movl 12(%rdi),%r11d + movl 16(%rdi),%r10d + + shlq $26,%rcx + movq %r8,%r9 + shlq $52,%r8 + addq %rcx,%rax + shrq $12,%r9 + addq %rax,%r8 + adcq $0,%r9 + + shlq $14,%r11 + movq %r10,%rax + shrq $24,%r10 + addq %r11,%r9 + shlq $40,%rax + addq %rax,%r9 + adcq $0,%r10 + + movq %r10,%rax + movq %r10,%rcx + andq $3,%r10 + shrq $2,%rax + andq $-4,%rcx + addq %rcx,%rax + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + ret +ENDPROC(poly1305_emit_avx) +#endif /* CONFIG_AS_AVX */ + +#ifdef CONFIG_AS_AVX2 +.align 32 +ENTRY(poly1305_blocks_avx2) + + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx2 + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + andq $-16,%rdx + jz .Lno_data_avx2 + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx2 + + testq $63,%rdx + jz .Leven_avx2 + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_avx2_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%r10d + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %r10,%r8 + shlq $40,%r8 + shrq $24,%r10 + addq %r8,%rbx + adcq $0,%r10 + + movq $-4,%r9 + movq %r10,%r8 + andq %r10,%r9 + shrq $2,%r8 + andq $3,%r10 + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + +.Lbase2_26_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_26_pre_avx2 + + testq %rcx,%rcx + jz .Lstore_base2_64_avx2 + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%r10 + + testq %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %r10d,16(%rdi) +.align 16 +.Ldone_avx2: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + ret + + +.align 32 +.Lbase2_64_avx2: + + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lbase2_64_avx2_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%r10d + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $63,%rdx + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%r10 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + movl $1,20(%rdi) + + __poly1305_init_avx + +.Lproceed_avx2: + movq %r15,%rdx + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp + +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 + + +.align 32 +.Leven_avx2: + + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx2: + leaq 8(%rsp),%r10 + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 + + + vmovdqu -64(%rdi),%xmm9 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm10 + vmovdqu -32(%rdi),%xmm6 + vmovdqu -16(%rdi),%xmm11 + vmovdqu 0(%rdi),%xmm12 + vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax + vmovdqu 32(%rdi),%xmm14 + vpermd %ymm9,%ymm7,%ymm9 + vmovdqu 48(%rdi),%xmm15 + vpermd %ymm10,%ymm7,%ymm10 + vmovdqu 64(%rdi),%xmm5 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqa %ymm9,0(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) + vmovdqa 64(%rcx),%ymm5 + + + + vmovdqu 0(%rsi),%xmm7 + vmovdqu 16(%rsi),%xmm8 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + vpaddq %ymm2,%ymm9,%ymm2 + subq $64,%rdx + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqa 0(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqa 32(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqa 96(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqa 48(%rax),%ymm10 + vmovdqa 112(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 64(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + vmovdqa -16(%rax),%ymm8 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vmovdqu 0(%rsi),%xmm7 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vmovdqu 16(%rsi),%xmm8 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqa 16(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpsrldq $6,%ymm7,%ymm9 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpsrldq $6,%ymm8,%ymm10 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpunpcklqdq %ymm10,%ymm9,%ymm10 + vpmuludq 80(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $4,%ymm10,%ymm9 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpand %ymm5,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpaddq %ymm9,%ymm2,%ymm2 + vpsrlq $30,%ymm10,%ymm10 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $40,%ymm6,%ymm6 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + subq $64,%rdx + jnz .Loop_avx2 + +.byte 0x66,0x90 +.Ltail_avx2: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqu 4(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqu 36(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqu 100(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqu 52(%rax),%ymm10 + vmovdqu 116(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 68(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vmovdqu -12(%rax),%ymm8 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqu 20(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpmuludq 84(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrldq $8,%ymm12,%ymm8 + vpsrldq $8,%ymm2,%ymm9 + vpsrldq $8,%ymm3,%ymm10 + vpsrldq $8,%ymm4,%ymm6 + vpsrldq $8,%ymm0,%ymm7 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + + vpermq $0x2,%ymm3,%ymm10 + vpermq $0x2,%ymm4,%ymm6 + vpermq $0x2,%ymm0,%ymm7 + vpermq $0x2,%ymm12,%ymm8 + vpermq $0x2,%ymm2,%ymm9 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + leaq -8(%r10),%rsp + + vzeroupper + ret + +ENDPROC(poly1305_blocks_avx2) +#endif /* CONFIG_AS_AVX2 */ + +#ifdef CONFIG_AS_AVX512 +.align 32 +ENTRY(poly1305_blocks_avx512) + + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx2_512 + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2_512: + andq $-16,%rdx + jz .Lno_data_avx2_512 + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx2_512 + + testq $63,%rdx + jz .Leven_avx2_512 + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lblocks_avx2_body_512: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%r10d + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %r10,%r8 + shlq $40,%r8 + shrq $24,%r10 + addq %r8,%rbx + adcq $0,%r10 + + movq $-4,%r9 + movq %r10,%r8 + andq %r10,%r9 + shrq $2,%r8 + andq $3,%r10 + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%r10 + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + +.Lbase2_26_pre_avx2_512: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_26_pre_avx2_512 + + testq %rcx,%rcx + jz .Lstore_base2_64_avx2_512 + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%r10 + + testq %r15,%r15 + jz .Lstore_base2_26_avx2_512 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + jmp .Lproceed_avx2_512 + +.align 32 +.Lstore_base2_64_avx2_512: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %r10,16(%rdi) + jmp .Ldone_avx2_512 + +.align 16 +.Lstore_base2_26_avx2_512: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %r10d,16(%rdi) +.align 16 +.Ldone_avx2_512: + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rsp + +.Lno_data_avx2_512: +.Lblocks_avx2_epilogue_512: + ret + + +.align 32 +.Lbase2_64_avx2_512: + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rdi + +.Lbase2_64_avx2_body_512: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%r10d + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $63,%rdx + jz .Linit_avx2_512 + +.Lbase2_64_pre_avx2_512: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%r10 + subq $16,%r15 + + movq %rdi,0(%rsp) + __poly1305_block + movq 0(%rsp),%rdi + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_64_pre_avx2_512 + +.Linit_avx2_512: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%r10 + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%r10 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %r10d,%xmm4 + movl $1,20(%rdi) + + __poly1305_init_avx + +.Lproceed_avx2_512: + movq %r15,%rdx + + movq 8(%rsp),%r15 + movq 16(%rsp),%r14 + movq 24(%rsp),%r13 + movq 32(%rsp),%r12 + movq 40(%rsp),%rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp + +.Lbase2_64_avx2_epilogue_512: + jmp .Ldo_avx2_512 + + +.align 32 +.Leven_avx2_512: + + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx2_512: + cmpq $512,%rdx + jae .Lblocks_avx512 +.Lskip_avx512: + leaq 8(%rsp),%r10 + + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 + + + vmovdqu -64(%rdi),%xmm9 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm10 + vmovdqu -32(%rdi),%xmm6 + vmovdqu -16(%rdi),%xmm11 + vmovdqu 0(%rdi),%xmm12 + vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax + vmovdqu 32(%rdi),%xmm14 + vpermd %ymm9,%ymm7,%ymm9 + vmovdqu 48(%rdi),%xmm15 + vpermd %ymm10,%ymm7,%ymm10 + vmovdqu 64(%rdi),%xmm5 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqa %ymm9,0(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) + vmovdqa 64(%rcx),%ymm5 + + + + vmovdqu 0(%rsi),%xmm7 + vmovdqu 16(%rsi),%xmm8 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + vpaddq %ymm2,%ymm9,%ymm2 + subq $64,%rdx + jz .Ltail_avx2_512 + jmp .Loop_avx2_512 + +.align 32 +.Loop_avx2_512: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqa 0(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqa 32(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqa 96(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqa 48(%rax),%ymm10 + vmovdqa 112(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 64(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + vmovdqa -16(%rax),%ymm8 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vmovdqu 0(%rsi),%xmm7 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vmovdqu 16(%rsi),%xmm8 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqa 16(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpsrldq $6,%ymm7,%ymm9 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpsrldq $6,%ymm8,%ymm10 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpunpcklqdq %ymm10,%ymm9,%ymm10 + vpmuludq 80(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $4,%ymm10,%ymm9 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpand %ymm5,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpaddq %ymm9,%ymm2,%ymm2 + vpsrlq $30,%ymm10,%ymm10 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $40,%ymm6,%ymm6 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + subq $64,%rdx + jnz .Loop_avx2_512 + +.byte 0x66,0x90 +.Ltail_avx2_512: + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqu 4(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqu 36(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqu 100(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqu 52(%rax),%ymm10 + vmovdqu 116(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 68(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vmovdqu -12(%rax),%ymm8 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqu 20(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpmuludq 84(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + vpsrldq $8,%ymm12,%ymm8 + vpsrldq $8,%ymm2,%ymm9 + vpsrldq $8,%ymm3,%ymm10 + vpsrldq $8,%ymm4,%ymm6 + vpsrldq $8,%ymm0,%ymm7 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + + vpermq $0x2,%ymm3,%ymm10 + vpermq $0x2,%ymm4,%ymm6 + vpermq $0x2,%ymm0,%ymm7 + vpermq $0x2,%ymm12,%ymm8 + vpermq $0x2,%ymm2,%ymm9 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + leaq -8(%r10),%rsp + + vzeroupper + ret + +.Lblocks_avx512: + + movl $15,%eax + kmovw %eax,%k2 + leaq 8(%rsp),%r10 + + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm9 + + vmovdqu32 -64(%rdi),%zmm16{%k2}{z} + andq $-512,%rsp + vmovdqu32 -48(%rdi),%zmm17{%k2}{z} + movq $0x20,%rax + vmovdqu32 -32(%rdi),%zmm21{%k2}{z} + vmovdqu32 -16(%rdi),%zmm18{%k2}{z} + vmovdqu32 0(%rdi),%zmm22{%k2}{z} + vmovdqu32 16(%rdi),%zmm19{%k2}{z} + vmovdqu32 32(%rdi),%zmm23{%k2}{z} + vmovdqu32 48(%rdi),%zmm20{%k2}{z} + vmovdqu32 64(%rdi),%zmm24{%k2}{z} + vpermd %zmm16,%zmm9,%zmm16 + vpbroadcastq 64(%rcx),%zmm5 + vpermd %zmm17,%zmm9,%zmm17 + vpermd %zmm21,%zmm9,%zmm21 + vpermd %zmm18,%zmm9,%zmm18 + vmovdqa64 %zmm16,0(%rsp){%k2} + vpsrlq $32,%zmm16,%zmm7 + vpermd %zmm22,%zmm9,%zmm22 + vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2} + vpsrlq $32,%zmm17,%zmm8 + vpermd %zmm19,%zmm9,%zmm19 + vmovdqa64 %zmm21,64(%rsp){%k2} + vpermd %zmm23,%zmm9,%zmm23 + vpermd %zmm20,%zmm9,%zmm20 + vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2} + vpermd %zmm24,%zmm9,%zmm24 + vmovdqa64 %zmm22,128(%rsp){%k2} + vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2} + vmovdqa64 %zmm23,192(%rsp){%k2} + vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2} + vmovdqa64 %zmm24,256(%rsp){%k2} + + vpmuludq %zmm7,%zmm16,%zmm11 + vpmuludq %zmm7,%zmm17,%zmm12 + vpmuludq %zmm7,%zmm18,%zmm13 + vpmuludq %zmm7,%zmm19,%zmm14 + vpmuludq %zmm7,%zmm20,%zmm15 + vpsrlq $32,%zmm18,%zmm9 + + vpmuludq %zmm8,%zmm24,%zmm25 + vpmuludq %zmm8,%zmm16,%zmm26 + vpmuludq %zmm8,%zmm17,%zmm27 + vpmuludq %zmm8,%zmm18,%zmm28 + vpmuludq %zmm8,%zmm19,%zmm29 + vpsrlq $32,%zmm19,%zmm10 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + + vpmuludq %zmm9,%zmm23,%zmm25 + vpmuludq %zmm9,%zmm24,%zmm26 + vpmuludq %zmm9,%zmm17,%zmm28 + vpmuludq %zmm9,%zmm18,%zmm29 + vpmuludq %zmm9,%zmm16,%zmm27 + vpsrlq $32,%zmm20,%zmm6 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm10,%zmm22,%zmm25 + vpmuludq %zmm10,%zmm16,%zmm28 + vpmuludq %zmm10,%zmm17,%zmm29 + vpmuludq %zmm10,%zmm23,%zmm26 + vpmuludq %zmm10,%zmm24,%zmm27 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm6,%zmm24,%zmm28 + vpmuludq %zmm6,%zmm16,%zmm29 + vpmuludq %zmm6,%zmm21,%zmm25 + vpmuludq %zmm6,%zmm22,%zmm26 + vpmuludq %zmm6,%zmm23,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm15,%zmm29 + vpandq %zmm5,%zmm15,%zmm15 + + vpsrlq $26,%zmm12,%zmm26 + vpandq %zmm5,%zmm12,%zmm12 + vpaddq %zmm26,%zmm13,%zmm13 + + vpaddq %zmm29,%zmm11,%zmm11 + vpsllq $2,%zmm29,%zmm29 + vpaddq %zmm29,%zmm11,%zmm11 + + vpsrlq $26,%zmm13,%zmm27 + vpandq %zmm5,%zmm13,%zmm13 + vpaddq %zmm27,%zmm14,%zmm14 + + vpsrlq $26,%zmm11,%zmm25 + vpandq %zmm5,%zmm11,%zmm11 + vpaddq %zmm25,%zmm12,%zmm12 + + vpsrlq $26,%zmm14,%zmm28 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm28,%zmm15,%zmm15 + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + vmovdqa32 128(%rcx),%zmm25 + movl $0x7777,%eax + kmovw %eax,%k1 + + vpermd %zmm16,%zmm25,%zmm16 + vpermd %zmm17,%zmm25,%zmm17 + vpermd %zmm18,%zmm25,%zmm18 + vpermd %zmm19,%zmm25,%zmm19 + vpermd %zmm20,%zmm25,%zmm20 + + vpermd %zmm11,%zmm25,%zmm16{%k1} + vpermd %zmm12,%zmm25,%zmm17{%k1} + vpermd %zmm13,%zmm25,%zmm18{%k1} + vpermd %zmm14,%zmm25,%zmm19{%k1} + vpermd %zmm15,%zmm25,%zmm20{%k1} + + vpslld $2,%zmm17,%zmm21 + vpslld $2,%zmm18,%zmm22 + vpslld $2,%zmm19,%zmm23 + vpslld $2,%zmm20,%zmm24 + vpaddd %zmm17,%zmm21,%zmm21 + vpaddd %zmm18,%zmm22,%zmm22 + vpaddd %zmm19,%zmm23,%zmm23 + vpaddd %zmm20,%zmm24,%zmm24 + + vpbroadcastq 32(%rcx),%zmm30 + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + vporq %zmm10,%zmm9,%zmm9 + vpsrlq $26,%zmm7,%zmm8 + vpsrlq $14,%zmm6,%zmm10 + vpsrlq $40,%zmm6,%zmm6 + vpandq %zmm5,%zmm9,%zmm9 + vpandq %zmm5,%zmm7,%zmm7 + + vpaddq %zmm2,%zmm9,%zmm2 + subq $192,%rdx + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + + vpmuludq %zmm2,%zmm17,%zmm14 + vpaddq %zmm0,%zmm7,%zmm0 + vpmuludq %zmm2,%zmm18,%zmm15 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm24,%zmm12 + vporq %zmm30,%zmm6,%zmm6 + vpmuludq %zmm2,%zmm16,%zmm13 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu64 0(%rsi),%zmm10 + vmovdqu64 64(%rsi),%zmm6 + leaq 128(%rsi),%rsi + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vpunpcklqdq %zmm6,%zmm10,%zmm7 + vpunpckhqdq %zmm6,%zmm10,%zmm6 + + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + vpsrlq $52,%zmm7,%zmm9 + vpsllq $12,%zmm6,%zmm10 + + vpsrlq $26,%zmm14,%zmm3 + vpandq %zmm5,%zmm14,%zmm14 + vpaddq %zmm3,%zmm15,%zmm4 + + vporq %zmm10,%zmm9,%zmm9 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpandq %zmm5,%zmm9,%zmm9 + + vpsrlq $26,%zmm4,%zmm15 + vpandq %zmm5,%zmm4,%zmm4 + + vpsrlq $26,%zmm1,%zmm12 + vpandq %zmm5,%zmm1,%zmm1 + vpaddq %zmm12,%zmm2,%zmm2 + + vpaddq %zmm15,%zmm0,%zmm0 + vpsllq $2,%zmm15,%zmm15 + vpaddq %zmm15,%zmm0,%zmm0 + + vpaddq %zmm9,%zmm2,%zmm2 + vpsrlq $26,%zmm7,%zmm8 + + vpsrlq $26,%zmm2,%zmm13 + vpandq %zmm5,%zmm2,%zmm2 + vpaddq %zmm13,%zmm14,%zmm3 + + vpsrlq $14,%zmm6,%zmm10 + + vpsrlq $26,%zmm0,%zmm11 + vpandq %zmm5,%zmm0,%zmm0 + vpaddq %zmm11,%zmm1,%zmm1 + + vpsrlq $40,%zmm6,%zmm6 + + vpsrlq $26,%zmm3,%zmm14 + vpandq %zmm5,%zmm3,%zmm3 + vpaddq %zmm14,%zmm4,%zmm4 + + vpandq %zmm5,%zmm7,%zmm7 + + subq $128,%rdx + ja .Loop_avx512 + +.Ltail_avx512: + + vpsrlq $32,%zmm16,%zmm16 + vpsrlq $32,%zmm17,%zmm17 + vpsrlq $32,%zmm18,%zmm18 + vpsrlq $32,%zmm23,%zmm23 + vpsrlq $32,%zmm24,%zmm24 + vpsrlq $32,%zmm19,%zmm19 + vpsrlq $32,%zmm20,%zmm20 + vpsrlq $32,%zmm21,%zmm21 + vpsrlq $32,%zmm22,%zmm22 + + leaq (%rsi,%rdx,1),%rsi + + vpaddq %zmm0,%zmm7,%zmm0 + + vpmuludq %zmm2,%zmm17,%zmm14 + vpmuludq %zmm2,%zmm18,%zmm15 + vpmuludq %zmm2,%zmm23,%zmm11 + vpandq %zmm5,%zmm8,%zmm8 + vpmuludq %zmm2,%zmm24,%zmm12 + vpandq %zmm5,%zmm10,%zmm10 + vpmuludq %zmm2,%zmm16,%zmm13 + vporq %zmm30,%zmm6,%zmm6 + vpaddq %zmm1,%zmm8,%zmm1 + vpaddq %zmm3,%zmm10,%zmm3 + vpaddq %zmm4,%zmm6,%zmm4 + + vmovdqu 0(%rsi),%xmm7 + vpmuludq %zmm0,%zmm19,%zmm28 + vpmuludq %zmm0,%zmm20,%zmm29 + vpmuludq %zmm0,%zmm16,%zmm25 + vpmuludq %zmm0,%zmm17,%zmm26 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + + vmovdqu 16(%rsi),%xmm8 + vpmuludq %zmm1,%zmm18,%zmm28 + vpmuludq %zmm1,%zmm19,%zmm29 + vpmuludq %zmm1,%zmm24,%zmm25 + vpmuludq %zmm0,%zmm18,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vpmuludq %zmm3,%zmm16,%zmm28 + vpmuludq %zmm3,%zmm17,%zmm29 + vpmuludq %zmm1,%zmm16,%zmm26 + vpmuludq %zmm1,%zmm17,%zmm27 + vpaddq %zmm28,%zmm14,%zmm14 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + vpmuludq %zmm4,%zmm24,%zmm28 + vpmuludq %zmm4,%zmm16,%zmm29 + vpmuludq %zmm3,%zmm22,%zmm25 + vpmuludq %zmm3,%zmm23,%zmm26 + vpmuludq %zmm3,%zmm24,%zmm27 + vpaddq %zmm28,%zmm14,%zmm3 + vpaddq %zmm29,%zmm15,%zmm15 + vpaddq %zmm25,%zmm11,%zmm11 + vpaddq %zmm26,%zmm12,%zmm12 + vpaddq %zmm27,%zmm13,%zmm13 + + vpmuludq %zmm4,%zmm21,%zmm25 + vpmuludq %zmm4,%zmm22,%zmm26 + vpmuludq %zmm4,%zmm23,%zmm27 + vpaddq %zmm25,%zmm11,%zmm0 + vpaddq %zmm26,%zmm12,%zmm1 + vpaddq %zmm27,%zmm13,%zmm2 + + movl $1,%eax + vpermq $0xb1,%zmm3,%zmm14 + vpermq $0xb1,%zmm15,%zmm4 + vpermq $0xb1,%zmm0,%zmm11 + vpermq $0xb1,%zmm1,%zmm12 + vpermq $0xb1,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + kmovw %eax,%k3 + vpermq $0x2,%zmm3,%zmm14 + vpermq $0x2,%zmm4,%zmm15 + vpermq $0x2,%zmm0,%zmm11 + vpermq $0x2,%zmm1,%zmm12 + vpermq $0x2,%zmm2,%zmm13 + vpaddq %zmm14,%zmm3,%zmm3 + vpaddq %zmm15,%zmm4,%zmm4 + vpaddq %zmm11,%zmm0,%zmm0 + vpaddq %zmm12,%zmm1,%zmm1 + vpaddq %zmm13,%zmm2,%zmm2 + + vextracti64x4 $0x1,%zmm3,%ymm14 + vextracti64x4 $0x1,%zmm4,%ymm15 + vextracti64x4 $0x1,%zmm0,%ymm11 + vextracti64x4 $0x1,%zmm1,%ymm12 + vextracti64x4 $0x1,%zmm2,%ymm13 + vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} + vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} + vpaddq %zmm11,%zmm0,%zmm0{%k3}{z} + vpaddq %zmm12,%zmm1,%zmm1{%k3}{z} + vpaddq %zmm13,%zmm2,%zmm2{%k3}{z} + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm2,%ymm9,%ymm2 + vpand %ymm5,%ymm8,%ymm8 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + vpaddq %ymm14,%ymm4,%ymm4 + + leaq 144(%rsp),%rax + addq $64,%rdx + jnz .Ltail_avx2_512 + + vpsubq %ymm9,%ymm2,%ymm2 + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + vzeroall + leaq -8(%r10),%rsp + + ret + +ENDPROC(poly1305_blocks_avx512) +#endif /* CONFIG_AS_AVX512 */ diff --git a/src/crypto/zinc/poly1305/poly1305.c b/src/crypto/zinc/poly1305/poly1305.c new file mode 100644 index 0000000..4b90523 --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305.c @@ -0,0 +1,289 @@ +/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0) + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Implementation of the Poly1305 message authenticator. + * + * Information: https://cr.yp.to/mac.html + */ + +#include + +#include +#include +#include + +#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION +static inline bool poly1305_init_arch(void *ctx, + const u8 key[POLY1305_KEY_SIZE], + simd_context_t simd_context) +{ + return false; +} +static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, + const size_t len, const u32 padbit, + simd_context_t simd_context) +{ + return false; +} +static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], + const u32 nonce[4], + simd_context_t simd_context) +{ + return false; +} +void __init poly1305_fpu_init(void) +{ +} +#endif + +struct poly1305_internal { + u32 h[5]; + u32 r[4]; +}; + +static void poly1305_init_generic(void *ctx, const u8 key[16]) +{ + struct poly1305_internal *st = (struct poly1305_internal *)ctx; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + st->r[0] = get_unaligned_le32(&key[ 0]) & 0x0fffffff; + st->r[1] = get_unaligned_le32(&key[ 4]) & 0x0ffffffc; + st->r[2] = get_unaligned_le32(&key[ 8]) & 0x0ffffffc; + st->r[3] = get_unaligned_le32(&key[12]) & 0x0ffffffc; +} + +static void poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len, + const u32 padbit) +{ +#define CONSTANT_TIME_CARRY(a, b) \ + ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + struct poly1305_internal *st = (struct poly1305_internal *)ctx; + u32 r0, r1, r2, r3; + u32 s1, s2, s3; + u32 h0, h1, h2, h3, h4, c; + u64 d0, d1, d2, d3; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + + s1 = r1 + (r1 >> 2); + s2 = r2 + (r2 >> 2); + s3 = r3 + (r3 >> 2); + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (len >= POLY1305_BLOCK_SIZE) { + /* h += m[i] */ + h0 = (u32)(d0 = (u64)h0 + (0 ) + get_unaligned_le32(&inp[ 0])); + h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + get_unaligned_le32(&inp[ 4])); + h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + get_unaligned_le32(&inp[ 8])); + h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + get_unaligned_le32(&inp[12])); + h4 += (u32)(d3 >> 32) + padbit; + + /* h *= r "%" p, where "%" stands for "partial remainder" */ + d0 = ((u64)h0 * r0) + + ((u64)h1 * s3) + + ((u64)h2 * s2) + + ((u64)h3 * s1); + d1 = ((u64)h0 * r1) + + ((u64)h1 * r0) + + ((u64)h2 * s3) + + ((u64)h3 * s2) + + (h4 * s1); + d2 = ((u64)h0 * r2) + + ((u64)h1 * r1) + + ((u64)h2 * r0) + + ((u64)h3 * s3) + + (h4 * s2); + d3 = ((u64)h0 * r3) + + ((u64)h1 * r2) + + ((u64)h2 * r1) + + ((u64)h3 * r0) + + (h4 * s3); + h4 = (h4 * r0); + + /* last reduction step: */ + /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */ + h0 = (u32)d0; + h1 = (u32)(d1 += d0 >> 32); + h2 = (u32)(d2 += d1 >> 32); + h3 = (u32)(d3 += d2 >> 32); + h4 += (u32)(d3 >> 32); + /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */ + c = (h4 >> 2) + (h4 & ~3U); + h4 &= 3; + h0 += c; + h1 += (c = CONSTANT_TIME_CARRY(h0, c)); + h2 += (c = CONSTANT_TIME_CARRY(h1, c)); + h3 += (c = CONSTANT_TIME_CARRY(h2, c)); + h4 += CONSTANT_TIME_CARRY(h3, c); + /* + * Occasional overflows to 3rd bit of h4 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. + */ + + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +#undef CONSTANT_TIME_CARRY +} + +static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4]) +{ + struct poly1305_internal *st = (struct poly1305_internal *)ctx; + u32 h0, h1, h2, h3, h4; + u32 g0, g1, g2, g3, g4; + u64 t; + u32 mask; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + /* compare to modulus by computing h + -p */ + g0 = (u32)(t = (u64)h0 + 5); + g1 = (u32)(t = (u64)h1 + (t >> 32)); + g2 = (u32)(t = (u64)h2 + (t >> 32)); + g3 = (u32)(t = (u64)h3 + (t >> 32)); + g4 = h4 + (u32)(t >> 32); + + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ + mask = 0 - (g4 >> 2); + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + + /* mac = (h + nonce) % (2^128) */ + h0 = (u32)(t = (u64)h0 + nonce[0]); + h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]); + h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]); + h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]); + + put_unaligned_le32(h0, &mac[ 0]); + put_unaligned_le32(h1, &mac[ 4]); + put_unaligned_le32(h2, &mac[ 8]); + put_unaligned_le32(h3, &mac[12]); +} + +void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], + simd_context_t simd_context) +{ + ctx->nonce[0] = get_unaligned_le32(&key[16]); + ctx->nonce[1] = get_unaligned_le32(&key[20]); + ctx->nonce[2] = get_unaligned_le32(&key[24]); + ctx->nonce[3] = get_unaligned_le32(&key[28]); + + if (!poly1305_init_arch(ctx->opaque, key, simd_context)) + poly1305_init_generic(ctx->opaque, key); + ctx->num = 0; +} +EXPORT_SYMBOL(poly1305_init); + +static inline void poly1305_blocks(void *ctx, const u8 *inp, const size_t len, + const u32 padbit, + simd_context_t simd_context) +{ + if (!poly1305_blocks_arch(ctx, inp, len, padbit, simd_context)) + poly1305_blocks_generic(ctx, inp, len, padbit); +} + +static inline void poly1305_emit(void *ctx, u8 mac[POLY1305_KEY_SIZE], + const u32 nonce[4], + simd_context_t simd_context) +{ + if (!poly1305_emit_arch(ctx, mac, nonce, simd_context)) + poly1305_emit_generic(ctx, mac, nonce); +} + +void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, size_t len, + simd_context_t simd_context) +{ + const size_t num = ctx->num % POLY1305_BLOCK_SIZE; + size_t rem; + + if (num) { + rem = POLY1305_BLOCK_SIZE - num; + if (len >= rem) { + memcpy(ctx->data + num, inp, rem); + poly1305_blocks(ctx->opaque, ctx->data, + POLY1305_BLOCK_SIZE, 1, simd_context); + inp += rem; + len -= rem; + } else { + /* Still not enough data to process a block. */ + memcpy(ctx->data + num, inp, len); + ctx->num = num + len; + return; + } + } + + rem = len % POLY1305_BLOCK_SIZE; + len -= rem; + + if (len >= POLY1305_BLOCK_SIZE) { + poly1305_blocks(ctx->opaque, inp, len, 1, simd_context); + inp += len; + } + + if (rem) + memcpy(ctx->data, inp, rem); + + ctx->num = rem; +} +EXPORT_SYMBOL(poly1305_update); + +void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE], + simd_context_t simd_context) +{ + size_t num = ctx->num % POLY1305_BLOCK_SIZE; + + if (num) { + ctx->data[num++] = 1; /* pad bit */ + while (num < POLY1305_BLOCK_SIZE) + ctx->data[num++] = 0; + poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0, + simd_context); + } + + poly1305_emit(ctx->opaque, mac, ctx->nonce, simd_context); + + /* zero out the state */ + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL(poly1305_finish); + +#include "../selftest/poly1305.h" -- cgit v1.2.3-59-g8ed1b