Diffstat (limited to 'poly1305-donna_avx2.c')
-rw-r--r--	poly1305-donna_avx2.c	535
1 file changed, 535 insertions, 0 deletions
diff --git a/poly1305-donna_avx2.c b/poly1305-donna_avx2.c
new file mode 100644
index 0000000..b5be379
--- /dev/null
+++ b/poly1305-donna_avx2.c
@@ -0,0 +1,535 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <immintrin.h> /* SSE/AVX2 intrinsics and the __m128i/__m256i types */
+
+typedef __m128i xmmi;
+typedef __m256i ymmi;
+
+typedef unsigned __int128 uint128_t;
+
+enum poly1305_state_flags_t {
+ poly1305_started = 1,
+ poly1305_final_shift8 = 4,
+ poly1305_final_shift16 = 8,
+ poly1305_final_shift24 = 16,
+ poly1305_final_shift32 = 32,
+ poly1305_finalize = 64,
+
+ poly1305_final_r4_r4_r4_r3 = 128, /* use [r^4,r^4,r^4,r^3] */
+ poly1305_final_r4_r4_r3_r2 = 256, /* use [r^4,r^4,r^3,r^2] */
+ poly1305_final_r4_r3_r2_r = 512, /* use [r^4,r^3,r^2,r] */
+ poly1305_final_r3_r2_r_1 = 1024, /* use [r^3,r^2,r,1] */
+ poly1305_final_r2_r_1_1 = 2048, /* use [r^2,r,1,1] */
+ poly1305_final_r_1_1_1 = 4096 /* use [r,1,1,1] */
+};
+
+#define poly1305_shift_flags (poly1305_final_shift8|poly1305_final_shift16|poly1305_final_shift24|poly1305_final_shift32)
+#define poly1305_mult_flags (poly1305_final_r4_r4_r4_r3|poly1305_final_r4_r4_r3_r2|poly1305_final_r4_r3_r2_r|poly1305_final_r3_r2_r_1|poly1305_final_r2_r_1_1|poly1305_final_r_1_1_1)
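+
+/*
+ * The shift flags tell poly1305_blocks how many of the four 16-byte lanes of
+ * the final 64-byte chunk hold real message data, clearing the implicit 2^128
+ * padding bit for the rest. The mult flags select which power of r multiplies
+ * each lane, so that every block ends up scaled by the correct power of r when
+ * the lanes are summed during finalization.
+ */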
+
+typedef struct poly1305_state_internal_t {
+ union {
+ u64 h[3];
+ uint32_t hh[20];
+ }; /* 80 bytes */
+ uint32_t R[5]; /* 20 bytes */
+ uint32_t R2[5]; /* 20 bytes */
+ uint32_t R3[5]; /* 20 bytes */
+ uint32_t R4[5]; /* 20 bytes */
+ u64 pad[2]; /* 16 bytes */
+ u64 flags; /* 8 bytes */
+} poly1305_state_internal; /* 184 bytes total */
+
+typedef u8 poly1305_state[192];
+
+#if defined(__AVX2__)
+#define FN(name) name##_avx2
+#else
+#error "poly1305-donna_avx2.c must be compiled with AVX2 enabled"
+#endif
+
+
+/* copy 0-63 bytes */
+static inline __attribute__((always_inline)) void
+poly1305_block_copy63(u8 *dst, const u8 *src, u32 bytes) {
+ u64 offset = (u64)(src - dst); /* dst + offset == src, even when src precedes dst */
+ if (bytes & 32) { _mm256_store_si256((ymmi *)dst, _mm256_loadu_si256((ymmi *)(dst + offset))); dst += 32; }
+ if (bytes & 16) { _mm_store_si128((xmmi *)dst, _mm_loadu_si128((xmmi *)(dst + offset))); dst += 16; }
+ if (bytes & 8) { *(u64 *)dst = *(u64 *)(dst + offset); dst += 8; }
+ if (bytes & 4) { *(uint32_t *)dst = *(uint32_t *)(dst + offset); dst += 4; }
+ if (bytes & 2) { *(uint16_t *)dst = *(uint16_t *)(dst + offset); dst += 2; }
+ if (bytes & 1) { *( u8 *)dst = *( u8 *)(dst + offset); }
+}
+
+
+u32
+FN(poly1305_block_size)(void) {
+ return 64;
+}
+
+
+__attribute__((noinline)) void
+FN(poly1305_init_ext)(poly1305_state_internal *st, const unsigned char key[32], u32 bytes) {
+ uint32_t *R;
+ uint128_t d[3];
+ u64 r0,r1,r2,c;
+ u64 r20,r21,r22,s21,s22;
+ u64 t0,t1;
+ u32 i;
+
+ if (!bytes) bytes = ~(u32)0;
+
+ /* H = 0 */
+ _mm256_storeu_si256((ymmi *)&st->hh[0], _mm256_setzero_si256());
+ _mm256_storeu_si256((ymmi *)&st->hh[8], _mm256_setzero_si256());
+ _mm_storeu_si128((xmmi *)&st->hh[16], _mm_setzero_si128());
+
+
+ /* clamp key */
+ t0 = *(u64 *)(key + 0);
+ t1 = *(u64 *)(key + 8);
+ r0 = t0 & 0xffc0fffffff; t0 >>= 44; t0 |= t1 << 20;
+ r1 = t0 & 0xfffffc0ffff; t1 >>= 24;
+ r2 = t1 & 0x00ffffffc0f;
+
+ st->pad[0] = *(u64 *)(key + 16);
+ st->pad[1] = *(u64 *)(key + 24);
+
+ R = st->R;
+ R[0] = (uint32_t)( r0 ) & 0x3ffffff;
+ R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((r1 >> 8) ) & 0x3ffffff;
+ R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((r2 >> 16) );
+
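+ /*
+  * Precompute the powers of r needed for 4-way parallel processing using
+  * 44/44/42-bit limb arithmetic, storing each result as five 26-bit limbs.
+  * Only the powers that can actually be used for `bytes` bytes of input are
+  * computed: r^2 needs more than one 16-byte block, r^3 more than two, r^4
+  * more than three.
+  */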
+ if (bytes > 16) {
+ r20 = r0;
+ r21 = r1;
+ r22 = r2;
+ s22 = r22 * (5 << 2);
+ d[0] = ((uint128_t)r20 * r20) + ((uint128_t)(r21 * 2) * s22);
+ d[1] = ((uint128_t)r22 * s22) + ((uint128_t)(r20 * 2) * r21);
+ d[2] = ((uint128_t)r21 * r21) + ((uint128_t)(r22 * 2) * r20);
+ r20 = (u64)d[0] & 0xfffffffffff; c = (u64)(d[0] >> 44);
+ d[1] += c ; r21 = (u64)d[1] & 0xfffffffffff; c = (u64)(d[1] >> 44);
+ d[2] += c ; r22 = (u64)d[2] & 0x3ffffffffff; c = (u64)(d[2] >> 42);
+ r20 += c * 5; c = (r20 >> 44); r20 = r20 & 0xfffffffffff;
+ r21 += c ; c = (r21 >> 44); r21 = r21 & 0xfffffffffff;
+ r22 += c ; /* even if r22 overflows, it will still fit in r4 safely, and is safe to multiply with */
+
+ R = st->R2;
+ R[0] = (uint32_t)( r20 ) & 0x3ffffff;
+ R[1] = (uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((r21 >> 8) ) & 0x3ffffff;
+ R[3] = (uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((r22 >> 16) );
+ }
+
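+ /*
+  * r^4 = (r^2)^2 is computed before r^3 so that r20..r22 still hold r^2 when
+  * it is multiplied by r (r0..r2) for the r^3 block below.
+  */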
+ if (bytes > 48) {
+ u64 r40,r41,r42,s42;
+ r40 = r20;
+ r41 = r21;
+ r42 = r22;
+ s42 = r42 * (5 << 2);
+ d[0] = ((uint128_t)r40 * r40) + ((uint128_t)(r41 * 2) * s42);
+ d[1] = ((uint128_t)r42 * s42) + ((uint128_t)(r40 * 2) * r41);
+ d[2] = ((uint128_t)r41 * r41) + ((uint128_t)(r42 * 2) * r40);
+ r40 = (u64)d[0] & 0xfffffffffff; c = (u64)(d[0] >> 44);
+ d[1] += c ; r41 = (u64)d[1] & 0xfffffffffff; c = (u64)(d[1] >> 44);
+ d[2] += c ; r42 = (u64)d[2] & 0x3ffffffffff; c = (u64)(d[2] >> 42);
+ r40 += c * 5; c = (r40 >> 44); r40 = r40 & 0xfffffffffff;
+ r41 += c ; c = (r41 >> 44); r41 = r41 & 0xfffffffffff;
+ r42 += c ; /* even if r42 overflows, it will still fit in r4 safely, and is safe to multiply with */
+
+ R = st->R4;
+ R[0] = (uint32_t)( r40 ) & 0x3ffffff;
+ R[1] = (uint32_t)((r40 >> 26) | (r41 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((r41 >> 8) ) & 0x3ffffff;
+ R[3] = (uint32_t)((r41 >> 34) | (r42 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((r42 >> 16) );
+ }
+
+ /* r^3 */
+ if (bytes > 32) {
+ s21 = r21 * (5 << 2);
+ s22 = r22 * (5 << 2);
+ d[0] = ((uint128_t)r0 * r20) + ((uint128_t)r1 * s22) + ((uint128_t)r2 * s21);
+ d[1] = ((uint128_t)r0 * r21) + ((uint128_t)r1 * r20) + ((uint128_t)r2 * s22);
+ d[2] = ((uint128_t)r0 * r22) + ((uint128_t)r1 * r21) + ((uint128_t)r2 * r20);
+ r0 = (u64)d[0] & 0xfffffffffff; c = (u64)(d[0] >> 44);
+ d[1] += c ; r1 = (u64)d[1] & 0xfffffffffff; c = (u64)(d[1] >> 44);
+ d[2] += c ; r2 = (u64)d[2] & 0x3ffffffffff; c = (u64)(d[2] >> 42);
+ r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff;
+ r1 += c ; c = (r1 >> 44); r1 = r1 & 0xfffffffffff;
+ r2 += c ; /* even if r2 overflows, it will still fit in r4 safely, and is safe to multiply with */
+
+ R = st->R3;
+ R[0] = (uint32_t)( r0 ) & 0x3ffffff;
+ R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((r1 >> 8) ) & 0x3ffffff;
+ R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((r2 >> 16) );
+ }
+
+ st->flags = 0;
+}
+
+
+__attribute__((noinline)) void
+FN(poly1305_blocks)(poly1305_state_internal *st, const u8 *m, u32 bytes) {
+ __attribute__((aligned(64))) ymmi HIBIT = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(1 << 24));
+ const ymmi MMASK = _mm256_broadcastq_epi64(_mm_cvtsi32_si128((1 << 26) - 1));
+ //const ymmi FIVE = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(5));
+
+ ymmi H0,H1,H2,H3,H4;
+ ymmi T0,T1,T2,T3,T4,T5,T6,T7,T8,T9;
+ ymmi M0,M1,M2,M3,M4;
+ ymmi M5,M6,M7,M8,M9;
+ ymmi C1,C2;
+ ymmi R40,R41,R42,R43,R44,S41,S42,S43,S44;
+
+ if (st->flags & poly1305_shift_flags) {
+ T0 = _mm256_srli_si256(HIBIT, 8);
+ if (st->flags & poly1305_final_shift8) T0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,0,0,0));
+ if (st->flags & poly1305_final_shift16) T0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,3,0,0));
+ if (st->flags & poly1305_final_shift24) T0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,3,3,0));
+ if (st->flags & poly1305_final_shift32) T0 = _mm256_setzero_si256();
+ HIBIT = T0;
+ }
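+
+ /*
+  * HIBIT carries the implicit 2^128 padding bit for each 16-byte lane (bit 24
+  * of limb 4 in the 26-bit radix). The shift flags above zero it for the lanes
+  * of the final chunk that lie past the end of the message, where the explicit
+  * 0x01 byte padding (or nothing at all) applies instead.
+  */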
+
+ if (!(st->flags & poly1305_started)) {
+ /* H = [Mx,My] */
+ T7 = _mm256_loadu_si256((ymmi *)(m + 0));
+ T8 = _mm256_loadu_si256((ymmi *)(m + 32));
+ T5 = _mm256_unpacklo_epi64(T7, T8);
+ T6 = _mm256_unpackhi_epi64(T7, T8);
+ T5 = _mm256_permute4x64_epi64(T5, _MM_SHUFFLE(3,1,2,0));
+ T6 = _mm256_permute4x64_epi64(T6, _MM_SHUFFLE(3,1,2,0));
+ H0 = _mm256_and_si256(MMASK, T5);
+ H1 = _mm256_and_si256(MMASK, _mm256_srli_epi64(T5, 26));
+ T5 = _mm256_or_si256(_mm256_srli_epi64(T5, 52), _mm256_slli_epi64(T6, 12));
+ H2 = _mm256_and_si256(MMASK, T5);
+ H3 = _mm256_and_si256(MMASK, _mm256_srli_epi64(T5, 26));
+ H4 = _mm256_srli_epi64(T6, 40);
+ H4 = _mm256_or_si256(H4, HIBIT);
+ m += 64;
+ bytes -= 64;
+ st->flags |= poly1305_started;
+ } else {
+ T0 = _mm256_loadu_si256((ymmi *)&st->hh[0]);
+ T1 = _mm256_loadu_si256((ymmi *)&st->hh[8]);
+ T2 = _mm256_loadu_si256((ymmi *)&st->hh[16]);
+ T0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
+ T1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
+ T2 = _mm256_permute4x64_epi64(T2, _MM_SHUFFLE(3,1,2,0));
+ H0 = _mm256_unpacklo_epi32(T0, _mm256_setzero_si256());
+ H1 = _mm256_unpackhi_epi32(T0, _mm256_setzero_si256());
+ H2 = _mm256_unpacklo_epi32(T1, _mm256_setzero_si256());
+ H3 = _mm256_unpackhi_epi32(T1, _mm256_setzero_si256());
+ H4 = _mm256_unpacklo_epi32(T2, _mm256_setzero_si256());
+ }
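+
+ /*
+  * H0..H4 now hold four independent 130-bit accumulators, one per 16-byte
+  * lane, each split into five 26-bit limbs (one limb per 64-bit element).
+  */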
+
+ if (bytes >= 64) {
+ if (st->flags & poly1305_mult_flags) {
+ ymmi R0 = _mm256_castsi128_si256(_mm_cvtsi32_si128(1));
+ ymmi R1 = _mm256_loadu_si256((ymmi *)&st->R[0]);
+ ymmi R2 = _mm256_loadu_si256((ymmi *)&st->R2[0]);
+ ymmi R3 = _mm256_loadu_si256((ymmi *)&st->R3[0]);
+ ymmi R4 = _mm256_loadu_si256((ymmi *)&st->R4[0]);
+
+ R1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(3,1,2,0));
+ R2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(3,1,2,0));
+ R3 = _mm256_permute4x64_epi64(R3, _MM_SHUFFLE(3,1,2,0));
+ R4 = _mm256_permute4x64_epi64(R4, _MM_SHUFFLE(3,1,2,0));
+
+ if (st->flags & poly1305_final_r4_r4_r4_r3) {
+ T0 = R4;
+ T1 = R4;
+ T2 = R4;
+ T3 = R3;
+ } else if (st->flags & poly1305_final_r4_r4_r3_r2) {
+ T0 = R4;
+ T1 = R4;
+ T2 = R3;
+ T3 = R2;
+ } else if (st->flags & poly1305_final_r4_r3_r2_r) {
+ T0 = R4;
+ T1 = R3;
+ T2 = R2;
+ T3 = R1;
+ } else if (st->flags & poly1305_final_r3_r2_r_1) {
+ T0 = R3;
+ T1 = R2;
+ T2 = R1;
+ T3 = R0;
+ } else if (st->flags & poly1305_final_r2_r_1_1) {
+ T0 = R2;
+ T1 = R1;
+ T2 = R0;
+ T3 = R0;
+ } else if (st->flags & poly1305_final_r_1_1_1) {
+ T0 = R1;
+ T1 = R0;
+ T2 = R0;
+ T3 = R0;
+ }
+
+ T5 = _mm256_unpacklo_epi64(T0, T1);
+ T6 = _mm256_unpackhi_epi64(T0, T1);
+ T7 = _mm256_unpacklo_epi64(T2, T3);
+ T8 = _mm256_unpackhi_epi64(T2, T3);
+ T0 = _mm256_permute2x128_si256(T5, T7, 0x20);
+ T1 = _mm256_permute2x128_si256(T5, T7, 0x31);
+ T2 = _mm256_permute2x128_si256(T6, T8, 0x20);
+ R40 = T0;
+ R41 = _mm256_srli_epi64(T0, 32);
+ R42 = T1;
+ R43 = _mm256_srli_epi64(T1, 32);
+ R44 = T2;
+ } else {
+ T0 = _mm256_loadu_si256((ymmi *)&st->R4[0]);
+ T1 = _mm256_srli_epi64(T0, 32);
+ R40 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(0,0,0,0));
+ R41 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(0,0,0,0));
+ R42 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(1,1,1,1));
+ R43 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(1,1,1,1));
+ R44 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(2,2,2,2));
+ }
+ S41 = _mm256_add_epi32(R41, _mm256_slli_epi32(R41, 2));
+ S42 = _mm256_add_epi32(R42, _mm256_slli_epi32(R42, 2));
+ S43 = _mm256_add_epi32(R43, _mm256_slli_epi32(R43, 2));
+ S44 = _mm256_add_epi32(R44, _mm256_slli_epi32(R44, 2));
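+
+ /* S4x = 5*R4x: precomputed multiples used to fold the limb products that
+    wrap past 2^130 back in, since 2^130 reduces to 5 (mod 2^130 - 5) */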
+
+ do {
+ ymmi v01,v02,v03,v04;
+ ymmi v11,v12,v13,v14;
+ ymmi v21,v22,v23,v24;
+ ymmi v31,v32,v33,v34;
+ ymmi v41,v42,v43,v44;
+ ymmi T14,T15;
+
+ /* H *= [r^4,r^4,r^4,r^4] */
+ T15 = S42;
+ T0 = H4; T0 = _mm256_mul_epu32(T0, S41);
+ v01 = H3; v01 = _mm256_mul_epu32(v01, T15);
+ T14 = S43;
+ T1 = H4; T1 = _mm256_mul_epu32(T1 , T15);
+ v11 = H3; v11 = _mm256_mul_epu32(v11, T14);
+ T2 = H4; T2 = _mm256_mul_epu32(T2 , T14); T0 = _mm256_add_epi64(T0, v01);
+ T15 = S44;
+ v02 = H2; v02 = _mm256_mul_epu32(v02, T14);
+ T3 = H4; T3 = _mm256_mul_epu32(T3 , T15); T1 = _mm256_add_epi64(T1, v11);
+ v03 = H1; v03 = _mm256_mul_epu32(v03, T15);
+ v12 = H2; v12 = _mm256_mul_epu32(v12, T15); T0 = _mm256_add_epi64(T0, v02);
+ T14 = R40;
+ v21 = H3; v21 = _mm256_mul_epu32(v21, T15);
+ v31 = H3; v31 = _mm256_mul_epu32(v31, T14); T0 = _mm256_add_epi64(T0, v03);
+ T4 = H4; T4 = _mm256_mul_epu32(T4 , T14); T1 = _mm256_add_epi64(T1, v12);
+ v04 = H0; v04 = _mm256_mul_epu32(v04, T14); T2 = _mm256_add_epi64(T2, v21);
+ v13 = H1; v13 = _mm256_mul_epu32(v13, T14); T3 = _mm256_add_epi64(T3, v31);
+ T15 = R41;
+ v22 = H2; v22 = _mm256_mul_epu32(v22, T14);
+ v32 = H2; v32 = _mm256_mul_epu32(v32, T15); T0 = _mm256_add_epi64(T0, v04);
+ v41 = H3; v41 = _mm256_mul_epu32(v41, T15); T1 = _mm256_add_epi64(T1, v13);
+ v14 = H0; v14 = _mm256_mul_epu32(v14, T15); T2 = _mm256_add_epi64(T2, v22);
+ T14 = R42;
+ v23 = H1; v23 = _mm256_mul_epu32(v23, T15); T3 = _mm256_add_epi64(T3, v32);
+ v33 = H1; v33 = _mm256_mul_epu32(v33, T14); T4 = _mm256_add_epi64(T4, v41);
+ v42 = H2; v42 = _mm256_mul_epu32(v42, T14); T1 = _mm256_add_epi64(T1, v14);
+ T15 = R43;
+ v24 = H0; v24 = _mm256_mul_epu32(v24, T14); T2 = _mm256_add_epi64(T2, v23);
+ v34 = H0; v34 = _mm256_mul_epu32(v34, T15); T3 = _mm256_add_epi64(T3, v33);
+ v43 = H1; v43 = _mm256_mul_epu32(v43, T15); T4 = _mm256_add_epi64(T4, v42);
+ v44 = H0; v44 = _mm256_mul_epu32(v44, R44); T2 = _mm256_add_epi64(T2, v24);
+ T3 = _mm256_add_epi64(T3, v34);
+ T4 = _mm256_add_epi64(T4, v43);
+ T4 = _mm256_add_epi64(T4, v44);
+
+ /* H += [Mx,My] */
+ T5 = _mm256_loadu_si256((ymmi *)(m + 0));
+ T6 = _mm256_loadu_si256((ymmi *)(m + 32));
+ T7 = _mm256_permute2x128_si256(T5, T6, 0x20);
+ T8 = _mm256_permute2x128_si256(T5, T6, 0x31);
+ T5 = _mm256_unpacklo_epi32(T7, T8);
+ T6 = _mm256_unpackhi_epi32(T7, T8);
+ M0 = _mm256_unpacklo_epi32(T5, _mm256_setzero_si256());
+ M1 = _mm256_unpackhi_epi32(T5, _mm256_setzero_si256());
+ M2 = _mm256_unpacklo_epi32(T6, _mm256_setzero_si256());
+ M3 = _mm256_unpackhi_epi32(T6, _mm256_setzero_si256());
+ M1 = _mm256_slli_epi64(M1, 6);
+ M2 = _mm256_slli_epi64(M2, 12);
+ M3 = _mm256_slli_epi64(M3, 18);
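+ /* dword k of each 16-byte block sits at bit 32*k, while limb k starts at
+    bit 26*k, so dword k is added into limb k pre-shifted left by 6*k; the
+    excess above 26 bits is swept up by the reduction below */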
+ T0 = _mm256_add_epi64(T0, M0);
+ T1 = _mm256_add_epi64(T1, M1);
+ T2 = _mm256_add_epi64(T2, M2);
+ T3 = _mm256_add_epi64(T3, M3);
+ T4 = _mm256_add_epi64(T4, HIBIT);
+
+ /* reduce */
+ C1 = _mm256_srli_epi64(T0, 26); C2 = _mm256_srli_epi64(T3, 26); T0 = _mm256_and_si256(T0, MMASK); T3 = _mm256_and_si256(T3, MMASK); T1 = _mm256_add_epi64(T1, C1); T4 = _mm256_add_epi64(T4, C2);
+ C1 = _mm256_srli_epi64(T1, 26); C2 = _mm256_srli_epi64(T4, 26); T1 = _mm256_and_si256(T1, MMASK); T4 = _mm256_and_si256(T4, MMASK); T2 = _mm256_add_epi64(T2, C1); T0 = _mm256_add_epi64(T0, _mm256_add_epi32(C2, _mm256_slli_epi32(C2, 2)));
+ C1 = _mm256_srli_epi64(T2, 26); C2 = _mm256_srli_epi64(T0, 26); T2 = _mm256_and_si256(T2, MMASK); T0 = _mm256_and_si256(T0, MMASK); T3 = _mm256_add_epi64(T3, C1); T1 = _mm256_add_epi64(T1, C2);
+ C1 = _mm256_srli_epi64(T3, 26); T3 = _mm256_and_si256(T3, MMASK); T4 = _mm256_add_epi64(T4, C1);
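+ /* carry propagation: each limb is brought back under 26 bits, and the carry
+    out of limb 4 re-enters limb 0 multiplied by 5 (mod 2^130 - 5) */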
+
+ /* H = (H*[r^4,r^4,r^4,r^4] + [Mx,My]) */
+ H0 = T0;
+ H1 = T1;
+ H2 = T2;
+ H3 = T3;
+ H4 = T4;
+
+ bytes -= 64;
+ m += 64;
+ } while (bytes >= 64);
+ }
+
+ if (!(st->flags & poly1305_finalize)) {
+ T0 = _mm256_shuffle_epi32(H0, _MM_SHUFFLE(0,0,2,0));
+ T1 = _mm256_shuffle_epi32(H1, _MM_SHUFFLE(0,0,2,0));
+ T2 = _mm256_shuffle_epi32(H2, _MM_SHUFFLE(0,0,2,0));
+ T3 = _mm256_shuffle_epi32(H3, _MM_SHUFFLE(0,0,2,0));
+ T4 = _mm256_shuffle_epi32(H4, _MM_SHUFFLE(0,0,2,0));
+ T0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(0,0,2,0));
+ T1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(0,0,2,0));
+ T2 = _mm256_permute4x64_epi64(T2, _MM_SHUFFLE(0,0,2,0));
+ T3 = _mm256_permute4x64_epi64(T3, _MM_SHUFFLE(0,0,2,0));
+ T4 = _mm256_permute4x64_epi64(T4, _MM_SHUFFLE(0,0,2,0));
+ T0 = _mm256_permute2x128_si256(T0, T1, 0x20);
+ T2 = _mm256_permute2x128_si256(T2, T3, 0x20);
+ _mm256_storeu_si256((ymmi *)&st->hh[0], T0);
+ _mm256_storeu_si256((ymmi *)&st->hh[8], T2);
+ _mm_storeu_si128((xmmi *)&st->hh[16], _mm256_castsi256_si128(T4));
+ } else {
+ uint32_t t0,t1,t2,t3,t4,b;
+ u64 h0,h1,h2,g0,g1,g2,c,nc;
+
+ /* H = H[0]+H[1] */
+ T0 = H0;
+ T1 = H1;
+ T2 = H2;
+ T3 = H3;
+ T4 = H4;
+ T0 = _mm256_add_epi64(T0, _mm256_permute4x64_epi64(T0, 0xf5));
+ T1 = _mm256_add_epi64(T1, _mm256_permute4x64_epi64(T1, 0xf5));
+ T2 = _mm256_add_epi64(T2, _mm256_permute4x64_epi64(T2, 0xf5));
+ T3 = _mm256_add_epi64(T3, _mm256_permute4x64_epi64(T3, 0xf5));
+ T4 = _mm256_add_epi64(T4, _mm256_permute4x64_epi64(T4, 0xf5));
+ T0 = _mm256_add_epi64(T0, _mm256_permute4x64_epi64(T0, 0xaa));
+ T1 = _mm256_add_epi64(T1, _mm256_permute4x64_epi64(T1, 0xaa));
+ T2 = _mm256_add_epi64(T2, _mm256_permute4x64_epi64(T2, 0xaa));
+ T3 = _mm256_add_epi64(T3, _mm256_permute4x64_epi64(T3, 0xaa));
+ T4 = _mm256_add_epi64(T4, _mm256_permute4x64_epi64(T4, 0xaa));
+ t0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(T0)) ; b = (t0 >> 26); t0 &= 0x3ffffff;
+ t1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(T1)) + b; b = (t1 >> 26); t1 &= 0x3ffffff;
+ t2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(T2)) + b; b = (t2 >> 26); t2 &= 0x3ffffff;
+ t3 = _mm_cvtsi128_si32(_mm256_castsi256_si128(T3)) + b; b = (t3 >> 26); t3 &= 0x3ffffff;
+ t4 = _mm_cvtsi128_si32(_mm256_castsi256_si128(T4)) + b;
+
+ /* everything except t4 is in range, so this is all safe */
+ h0 = (((u64)t0 ) | ((u64)t1 << 26) ) & 0xfffffffffffull;
+ h1 = (((u64)t1 >> 18) | ((u64)t2 << 8) | ((u64)t3 << 34)) & 0xfffffffffffull;
+ h2 = (((u64)t3 >> 10) | ((u64)t4 << 16) );
+
+ c = (h2 >> 42); h2 &= 0x3ffffffffff;
+ h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+ h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
+ h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
+ h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
+ h1 += c;
+
+ g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
+ g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
+ g2 = h2 + c - ((u64)1 << 42);
+
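+ /* if h >= 2^130-5, computing g = h - (2^130-5) leaves the top bit of g2
+    clear, so c becomes all ones and g is selected; otherwise h is kept */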
+ c = (g2 >> 63) - 1;
+ nc = ~c;
+ h0 = (h0 & nc) | (g0 & c);
+ h1 = (h1 & nc) | (g1 & c);
+ h2 = (h2 & nc) | (g2 & c);
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ }
+}
+
+__attribute__((noinline)) void
+FN(poly1305_finish_ext)(poly1305_state_internal *st, const u8 *m, u32 leftover, unsigned char mac[16]) {
+ u64 h0,h1,h2;
+ __attribute__((aligned(64))) unsigned char final[64];
+
+ if (leftover) {
+ _mm256_store_si256((ymmi *)(final + 0), _mm256_setzero_si256());
+ _mm256_store_si256((ymmi *)(final + 32), _mm256_setzero_si256());
+ poly1305_block_copy63(final, m, leftover);
+ if ((leftover % 16) != 0) final[leftover] = 1;
+ st->flags &= ~(poly1305_shift_flags | poly1305_mult_flags);
+ if (leftover >= 48) st->flags |= poly1305_final_shift8;
+ else if (leftover >= 32) st->flags |= poly1305_final_shift16;
+ else if (leftover >= 16) st->flags |= poly1305_final_shift24;
+ else st->flags |= poly1305_final_shift32;
+ if (st->flags & poly1305_started) {
+ if (leftover <= 16)
+ st->flags |= poly1305_final_r4_r4_r3_r2;
+ else if (leftover <= 32)
+ st->flags |= poly1305_final_r4_r4_r4_r3;
+ }
+ FN(poly1305_blocks)(st, final, 64);
+ }
+
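+ /*
+  * Final call: multiply each lane by its remaining power of r (selected via
+  * the mult flags) with poly1305_finalize set, so poly1305_blocks sums the
+  * four lanes and fully reduces h mod 2^130 - 5.
+  */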
+ if (st->flags & poly1305_started) {
+ st->flags &= ~(poly1305_shift_flags | poly1305_mult_flags);
+ if (!leftover || (leftover > 48))
+ st->flags |= poly1305_final_r4_r3_r2_r;
+ else if (leftover > 32)
+ st->flags |= poly1305_final_r3_r2_r_1;
+ else if (leftover > 16)
+ st->flags |= poly1305_final_r2_r_1_1;
+ else
+ st->flags |= poly1305_final_r_1_1_1;
+ st->flags |= (poly1305_finalize|poly1305_final_shift32);
+ _mm256_store_si256((ymmi *)(final + 0), _mm256_setzero_si256());
+ _mm256_store_si256((ymmi *)(final + 32), _mm256_setzero_si256());
+ FN(poly1305_blocks)(st, final, 64);
+ }
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ /* pad */
+ h0 = ((h0 ) | (h1 << 44));
+ h1 = ((h1 >> 20) | (h2 << 24));
+
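+ /* tag = (h + pad) mod 2^128; pad holds key bytes 16..31, added here with a
+    64-bit add/adc pair */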
+ __asm__ __volatile__(
+ "addq %2, %0 ;\n"
+ "adcq %3, %1 ;\n"
+ : "+r"(h0), "+r"(h1)
+ : "r"(st->pad[0]), "r"(st->pad[1])
+ : "flags", "cc"
+ );
+
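+ /* wipe the 192-byte state (accumulator, powers of r, pad) before returning */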
+ _mm256_storeu_si256((ymmi *)st + 0, _mm256_setzero_si256());
+ _mm256_storeu_si256((ymmi *)st + 1, _mm256_setzero_si256());
+ _mm256_storeu_si256((ymmi *)st + 2, _mm256_setzero_si256());
+ _mm256_storeu_si256((ymmi *)st + 3, _mm256_setzero_si256());
+ _mm256_storeu_si256((ymmi *)st + 4, _mm256_setzero_si256());
+ _mm256_storeu_si256((ymmi *)st + 5, _mm256_setzero_si256());
+
+ *(u64 *)(mac + 0) = h0;
+ *(u64 *)(mac + 8) = h1;
+}
+
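+/*
+ * One-shot entry point: computes the 16-byte Poly1305 tag of m[0..inlen)
+ * under the 32-byte one-time key. Full 64-byte chunks go through
+ * poly1305_blocks; the tail and the final lane combine are handled by
+ * poly1305_finish_ext.
+ */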
+void
+poly1305_donna_avx2(unsigned char out[16], const unsigned char *m, u32 inlen, const unsigned char key[32]) {
+ __attribute__((aligned(64))) poly1305_state S;
+ poly1305_state_internal *st = (poly1305_state_internal *)S;
+ u32 blocks;
+ FN(poly1305_init_ext)(st, key, inlen);
+ blocks = inlen & ~63;
+ if (blocks) {
+ FN(poly1305_blocks)(st, m, blocks);
+ m += blocks;
+ inlen -= blocks;
+ }
+ FN(poly1305_finish_ext)(st, m, inlen, out);
+}
+
+