path: root/src/crypto/chacha20poly1305.c
author     Samuel Neves <sneves@dei.uc.pt>        2017-11-17 12:07:52 +0000
committer  Jason A. Donenfeld <Jason@zx2c4.com>   2017-11-22 18:32:48 +0100
commit     abad6eebd9a94f50645062882755fbeaac354873 (patch)
tree       5e3d066bc3784eb1e7423a24191741e008a129e8 /src/crypto/chacha20poly1305.c
parent     chacha20poly1305: add more test vectors, some of which are weird (diff)
download   wireguard-monolithic-historical-abad6eebd9a94f50645062882755fbeaac354873.tar.xz
           wireguard-monolithic-historical-abad6eebd9a94f50645062882755fbeaac354873.zip
chacha20poly1305: import x86_64 primitives from OpenSSL
x86_64 only at the moment. SSSE3, AVX, AVX2, AVX512.

Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
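For readers skimming the diff below: the imported OpenSSL kernels are selected once at runtime. chacha20poly1305_fpu_init() probes CPU features, and poly1305_init() then stores per-context function pointers (ctx->func.blocks / ctx->func.emit) to the widest routine the machine supports, falling back to the plain x86_64 integer code. A minimal, self-contained sketch of that dispatch pattern follows; the stub functions are hypothetical stand-ins for the real assembly entry points and are not part of this commit.

/* Sketch of the runtime-dispatch pattern introduced below. The stubs stand in
 * for poly1305_blocks_x86_64 / poly1305_blocks_avx2 from the diff. */
#include <stddef.h>
#include <stdio.h>

typedef void (*blocks_fn)(void *ctx, const unsigned char *inp, size_t len, unsigned int padbit);

static void blocks_scalar(void *c, const unsigned char *i, size_t l, unsigned int p) { (void)c; (void)i; (void)l; (void)p; puts("scalar"); }
static void blocks_avx2(void *c, const unsigned char *i, size_t l, unsigned int p)   { (void)c; (void)i; (void)l; (void)p; puts("avx2"); }

struct poly1305_funcs {
	blocks_fn blocks;
};

/* Mirrors the idea of poly1305_init(): pick the best routine the CPU supports. */
static void pick_impl(struct poly1305_funcs *f, int cpu_has_avx2)
{
	f->blocks = blocks_scalar;        /* always-present integer fallback */
	if (cpu_has_avx2)
		f->blocks = blocks_avx2;  /* vectorized path when available */
}

int main(void)
{
	struct poly1305_funcs f;
	pick_impl(&f, 1);
	f.blocks(NULL, NULL, 0, 0);       /* invokes whichever routine was chosen */
	return 0;
}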
Diffstat (limited to 'src/crypto/chacha20poly1305.c')
-rw-r--r--   src/crypto/chacha20poly1305.c   639
1 file changed, 334 insertions(+), 305 deletions(-)
diff --git a/src/crypto/chacha20poly1305.c b/src/crypto/chacha20poly1305.c
index a9c3bf8..ac033f0 100644
--- a/src/crypto/chacha20poly1305.c
+++ b/src/crypto/chacha20poly1305.c
@@ -1,6 +1,34 @@
-/*
- * Copyright (C) 2015-2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+/* Copyright (C) 2015-2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright 2015 Martin Willi.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * - Redistributions of source code must retain copyright notices,
+ * this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * - Neither the name of the CRYPTOGAMS nor the names of its
+ * copyright holder and contributors may be used to endorse or
+ * promote products derived from this software without specific
+ * prior written permission.
+ * ALTERNATIVELY, provided that this notice is retained in full, this
+ * product may be distributed under the terms of the GNU General Public
+ * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+ * those given above.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "chacha20poly1305.h"
@@ -15,27 +43,39 @@
#if defined(CONFIG_X86_64)
#include <asm/cpufeature.h>
#include <asm/processor.h>
+asmlinkage void poly1305_init_x86_64(void *ctx, const unsigned char key[16]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const unsigned char *inp, size_t len, u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, unsigned char mac[16], const u32 nonce[4]);
#ifdef CONFIG_AS_SSSE3
-asmlinkage void hchacha20_asm_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key);
-asmlinkage void chacha20_asm_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_asm_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key);
+asmlinkage void chacha20_ssse3(unsigned char *out, const unsigned char *in, size_t len, const unsigned int key[8], const unsigned int counter[4]);
#endif
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_asm_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
+#ifdef CONFIG_AS_AVX
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[16], const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, size_t len, u32 padbit);
#endif
-asmlinkage void poly1305_asm_block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks);
-asmlinkage void poly1305_asm_2block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u);
#ifdef CONFIG_AS_AVX2
-asmlinkage void poly1305_asm_4block_avx2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u);
+asmlinkage void chacha20_avx2(unsigned char *out, const unsigned char *in, size_t len, const unsigned int key[8], const unsigned int counter[4]);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const unsigned char *inp, size_t len, u32 padbit);
#endif
-static bool chacha20poly1305_use_avx2 __read_mostly;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(unsigned char *out, const unsigned char *in, size_t len, const unsigned int key[8], const unsigned int counter[4]);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const unsigned char *inp, size_t len, u32 padbit);
+#endif
+
static bool chacha20poly1305_use_ssse3 __read_mostly;
-static bool chacha20poly1305_use_sse2 __read_mostly;
+static bool chacha20poly1305_use_avx __read_mostly;
+static bool chacha20poly1305_use_avx2 __read_mostly;
+static bool chacha20poly1305_use_avx512 __read_mostly;
+
void chacha20poly1305_fpu_init(void)
{
- chacha20poly1305_use_sse2 = boot_cpu_has(X86_FEATURE_XMM2);
chacha20poly1305_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+ chacha20poly1305_use_avx = boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
chacha20poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifndef COMPAT_CANNOT_USE_AVX512
+ chacha20poly1305_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_ZMM_Hi256, NULL);
+#endif
}
#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
#include <asm/hwcap.h>
@@ -77,21 +117,6 @@ static inline u32 rotl32(u32 v, u8 n)
return (v << n) | (v >> (sizeof(v) * 8 - n));
}
-static inline u64 mlt(u64 a, u64 b)
-{
- return a * b;
-}
-
-static inline u32 sr(u64 v, u_char n)
-{
- return v >> n;
-}
-
-static inline u32 and(u32 v, u32 mask)
-{
- return v & mask;
-}
-
struct chacha20_ctx {
u32 state[CHACHA20_BLOCK_SIZE / sizeof(u32)];
} __aligned(32);
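As a reference for what the imported SSSE3/AVX kernels below are vectorizing, this is the standard RFC 7539 ChaCha20 quarter round, written here as a self-contained sketch (rotl32_ is a local stand-in for the rotl32() helper kept above, not a name from this file):

/* RFC 7539 ChaCha20 quarter round: four add-xor-rotate steps on the state. */
static inline unsigned int rotl32_(unsigned int v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static void chacha20_quarter_round(unsigned int x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32_(x[d], 16);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32_(x[b], 12);
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32_(x[d], 8);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32_(x[b], 7);
}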
@@ -232,17 +257,13 @@ static void hchacha20_generic(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8
static inline void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN], bool have_simd)
{
- if (!have_simd)
- goto no_simd;
-
#if defined(CONFIG_X86_64) && defined(CONFIG_AS_SSSE3)
- if (chacha20poly1305_use_ssse3) {
- hchacha20_asm_ssse3(derived_key, nonce, key);
+ if (have_simd && chacha20poly1305_use_ssse3) {
+ hchacha20_ssse3(derived_key, nonce, key);
return;
}
#endif
-no_simd:
hchacha20_generic(derived_key, nonce, key);
}
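Why hchacha20 exists: the xchacha20poly1305 wrappers elsewhere in this file feed the key and the first 16 bytes of a 24-byte nonce through HChaCha20 to derive a fresh 32-byte subkey, then run ordinary ChaCha20-Poly1305 with the remaining 8 nonce bytes. A small sketch of that split; the helper names here are hypothetical:

/* Splitting a 24-byte extended nonce for an HChaCha20-based construction. */
#include <stdint.h>
#include <string.h>

struct xnonce_split {
	uint8_t hchacha_input[16]; /* fed to hchacha20() together with the key */
	uint8_t chacha_nonce[8];   /* used with the derived subkey afterwards */
};

static struct xnonce_split split_xnonce(const uint8_t nonce24[24])
{
	struct xnonce_split s;
	memcpy(s.hchacha_input, nonce24, 16);
	memcpy(s.chacha_nonce, nonce24 + 16, 8);
	return s;
}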
@@ -266,7 +287,7 @@ static void chacha20_keysetup(struct chacha20_ctx *ctx, const u8 key[CHACHA20_KE
ctx->state[15] = le32_to_cpuvp(nonce + 4);
}
-static void chacha20_crypt(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, unsigned int bytes, bool have_simd)
+static void chacha20_crypt(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 bytes, bool have_simd)
{
u8 buf[CHACHA20_BLOCK_SIZE];
@@ -281,37 +302,23 @@ static void chacha20_crypt(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, uns
goto no_simd;
#if defined(CONFIG_X86_64)
+#ifdef CONFIG_AS_AVX512
+ if (chacha20poly1305_use_avx512) {
+ chacha20_avx512(dst, src, bytes, &ctx->state[4], &ctx->state[12]);
+ ctx->state[12] += (bytes + 63) / 64;
+ return;
+ }
+#endif
#ifdef CONFIG_AS_AVX2
if (chacha20poly1305_use_avx2) {
- while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
- chacha20_asm_8block_xor_avx2(ctx->state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 8;
- src += CHACHA20_BLOCK_SIZE * 8;
- dst += CHACHA20_BLOCK_SIZE * 8;
- ctx->state[12] += 8;
- }
+ chacha20_avx2(dst, src, bytes, &ctx->state[4], &ctx->state[12]);
+ ctx->state[12] += (bytes + 63) / 64;
+ return;
}
#endif
#ifdef CONFIG_AS_SSSE3
- while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
- chacha20_asm_4block_xor_ssse3(ctx->state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 4;
- src += CHACHA20_BLOCK_SIZE * 4;
- dst += CHACHA20_BLOCK_SIZE * 4;
- ctx->state[12] += 4;
- }
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_asm_block_xor_ssse3(ctx->state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- ctx->state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha20_asm_block_xor_ssse3(ctx->state, buf, buf);
- memcpy(dst, buf, bytes);
- }
+ chacha20_ssse3(dst, src, bytes, &ctx->state[4], &ctx->state[12]);
+ ctx->state[12] += (bytes + 63) / 64;
return;
#endif
#elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
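Note on the counter bookkeeping in the hunk above: the OpenSSL kernels consume the whole buffer in one call, so the caller advances the 32-bit block counter in state[12] by the number of 64-byte blocks touched, which is what (bytes + 63) / 64 (a ceiling division) computes. A tiny, purely illustrative self-check of that arithmetic:

/* Confirms (bytes + 63) / 64 == ceil(bytes / 64) for a few cases. */
#include <assert.h>

static unsigned int blocks_touched(unsigned int bytes)
{
	return (bytes + 63) / 64;
}

int main(void)
{
	assert(blocks_touched(0) == 0);
	assert(blocks_touched(1) == 1);
	assert(blocks_touched(64) == 1);
	assert(blocks_touched(65) == 2);
	assert(blocks_touched(128) == 2);
	return 0;
}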
@@ -352,261 +359,283 @@ no_simd:
crypto_xor(dst, buf, bytes);
}
}
+typedef void (*poly1305_blocks_f)(void *ctx, const u8 *inp, size_t len, u32 padbit);
+typedef void (*poly1305_emit_f)(void *ctx, u8 mac[16], const u32 nonce[4]);
struct poly1305_ctx {
- /* key */
- u32 r[5];
- /* finalize key */
- u32 s[4];
- /* accumulator */
+ u8 opaque[24 * sizeof(u64)];
+ u32 nonce[4];
+ u8 data[POLY1305_BLOCK_SIZE];
+ size_t num;
+ struct {
+ poly1305_blocks_f blocks;
+ poly1305_emit_f emit;
+ } func;
+} __aligned(8);
+
+#ifndef CONFIG_X86_64
+struct poly1305_internal {
u32 h[5];
- /* partial buffer */
- u8 buf[POLY1305_BLOCK_SIZE];
- /* bytes used in partial buffer */
- unsigned int buflen;
- /* derived key u set? */
- bool uset;
- /* derived keys r^3, r^4 set? */
- bool wset;
- /* derived Poly1305 key r^2 */
- u32 u[5];
- /* derived Poly1305 key r^3 */
- u32 r3[5];
- /* derived Poly1305 key r^4 */
- u32 r4[5];
+ u32 r[4];
};
-static void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE])
+static void poly1305_init_generic(void *ctx, const u8 key[16])
{
- memset(ctx, 0, sizeof(struct poly1305_ctx));
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- ctx->r[0] = (le32_to_cpuvp(key + 0) >> 0) & 0x3ffffff;
- ctx->r[1] = (get_unaligned_le32(key + 3) >> 2) & 0x3ffff03;
- ctx->r[2] = (get_unaligned_le32(key + 6) >> 4) & 0x3ffc0ff;
- ctx->r[3] = (get_unaligned_le32(key + 9) >> 6) & 0x3f03fff;
- ctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
- ctx->s[0] = le32_to_cpuvp(key + 16);
- ctx->s[1] = le32_to_cpuvp(key + 20);
- ctx->s[2] = le32_to_cpuvp(key + 24);
- ctx->s[3] = le32_to_cpuvp(key + 28);
+ st->r[0] = le32_to_cpuvp(&key[ 0]) & 0x0fffffff;
+ st->r[1] = le32_to_cpuvp(&key[ 4]) & 0x0ffffffc;
+ st->r[2] = le32_to_cpuvp(&key[ 8]) & 0x0ffffffc;
+ st->r[3] = le32_to_cpuvp(&key[12]) & 0x0ffffffc;
}
-static unsigned int poly1305_generic_blocks(struct poly1305_ctx *ctx, const u8 *src, unsigned int srclen, u32 hibit)
+static void
+poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len, u32 padbit)
{
- u32 r0, r1, r2, r3, r4;
- u32 s1, s2, s3, s4;
- u32 h0, h1, h2, h3, h4;
- u64 d0, d1, d2, d3, d4;
-
- r0 = ctx->r[0];
- r1 = ctx->r[1];
- r2 = ctx->r[2];
- r3 = ctx->r[3];
- r4 = ctx->r[4];
-
- s1 = r1 * 5;
- s2 = r2 * 5;
- s3 = r3 * 5;
- s4 = r4 * 5;
-
- h0 = ctx->h[0];
- h1 = ctx->h[1];
- h2 = ctx->h[2];
- h3 = ctx->h[3];
- h4 = ctx->h[4];
-
- while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+#define CONSTANT_TIME_CARRY(a,b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
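+ /* CONSTANT_TIME_CARRY(a, b) evaluates to 1 when a < b (unsigned), i.e. when the
+  * preceding a += b wrapped around, using only bitwise ops so no branch leaks timing. */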
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u32 r0, r1, r2, r3;
+ u32 s1, s2, s3;
+ u32 h0, h1, h2, h3, h4, c;
+ u64 d0, d1, d2, d3;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+
+ s1 = r1 + (r1 >> 2);
+ s2 = r2 + (r2 >> 2);
+ s3 = r3 + (r3 >> 2);
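+ /* s_i = r_i + r_i/4 = 5*r_i/4 (exact, since clamping makes r1..r3 multiples of 4);
+  * the 5/4 factor folds high partial products back in, using 2^130 ≡ 5 (mod p). */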
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (len >= POLY1305_BLOCK_SIZE) {
/* h += m[i] */
- h0 += (le32_to_cpuvp(src + 0) >> 0) & 0x3ffffff;
- h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
- h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
- h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
- h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit;
-
- /* h *= r */
- d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + mlt(h3, s2) + mlt(h4, s1);
- d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + mlt(h3, s3) + mlt(h4, s2);
- d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + mlt(h3, s4) + mlt(h4, s3);
- d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + mlt(h3, r0) + mlt(h4, s4);
- d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + mlt(h3, r1) + mlt(h4, r0);
-
- /* (partial) h %= p */
- d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
- d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
- d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
- d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
- h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
- h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
-
- src += POLY1305_BLOCK_SIZE;
- srclen -= POLY1305_BLOCK_SIZE;
+ h0 = (u32)(d0 = (u64)h0 + le32_to_cpuvp(inp + 0));
+ h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + le32_to_cpuvp(inp + 4));
+ h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + le32_to_cpuvp(inp + 8));
+ h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + le32_to_cpuvp(inp + 12));
+ h4 += (u32)(d3 >> 32) + padbit;
+
+ /* h *= r "%" p, where "%" stands for "partial remainder" */
+ d0 = ((u64)h0 * r0) +
+ ((u64)h1 * s3) +
+ ((u64)h2 * s2) +
+ ((u64)h3 * s1);
+ d1 = ((u64)h0 * r1) +
+ ((u64)h1 * r0) +
+ ((u64)h2 * s3) +
+ ((u64)h3 * s2) +
+ (h4 * s1);
+ d2 = ((u64)h0 * r2) +
+ ((u64)h1 * r1) +
+ ((u64)h2 * r0) +
+ ((u64)h3 * s3) +
+ (h4 * s2);
+ d3 = ((u64)h0 * r3) +
+ ((u64)h1 * r2) +
+ ((u64)h2 * r1) +
+ ((u64)h3 * r0) +
+ (h4 * s3);
+ h4 = (h4 * r0);
+
+ /* last reduction step: */
+ /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */
+ h0 = (u32)d0;
+ h1 = (u32)(d1 += d0 >> 32);
+ h2 = (u32)(d2 += d1 >> 32);
+ h3 = (u32)(d3 += d2 >> 32);
+ h4 += (u32)(d3 >> 32);
+ /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */
+ c = (h4 >> 2) + (h4 & ~3U);
+ h4 &= 3;
+ h0 += c;
+ h1 += (c = CONSTANT_TIME_CARRY(h0,c));
+ h2 += (c = CONSTANT_TIME_CARRY(h1,c));
+ h3 += (c = CONSTANT_TIME_CARRY(h2,c));
+ h4 += CONSTANT_TIME_CARRY(h3,c);
+ /*
+ * Occasional overflows to 3rd bit of h4 are taken care of
+ * "naturally". If after this point we end up at the top of
+ * this loop, then the overflow bit will be accounted for
+ * in next iteration. If we end up in poly1305_emit, then
+ * comparison to modulus below will still count as "carry
+ * into 131st bit", so that properly reduced value will be
+ * picked in conditional move.
+ */
+
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
}
- ctx->h[0] = h0;
- ctx->h[1] = h1;
- ctx->h[2] = h2;
- ctx->h[3] = h3;
- ctx->h[4] = h4;
-
- return srclen;
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+#undef CONSTANT_TIME_CARRY
}
-#ifdef CONFIG_X86_64
-static void poly1305_simd_mult(u32 *a, const u32 *b)
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
{
- u8 m[POLY1305_BLOCK_SIZE];
-
- memset(m, 0, sizeof(m));
- /* The poly1305 block function adds a hi-bit to the accumulator which
- * we don't need for key multiplication; compensate for it.
- */
- a[4] -= 1U << 24;
- poly1305_asm_block_sse2(a, m, b, 1);
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ __le32 *omac = (__force __le32 *)mac;
+ u32 h0, h1, h2, h3, h4;
+ u32 g0, g1, g2, g3, g4;
+ u64 t;
+ u32 mask;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ /* compare to modulus by computing h + -p */
+ g0 = (u32)(t = (u64)h0 + 5);
+ g1 = (u32)(t = (u64)h1 + (t >> 32));
+ g2 = (u32)(t = (u64)h2 + (t >> 32));
+ g3 = (u32)(t = (u64)h3 + (t >> 32));
+ g4 = h4 + (u32)(t >> 32);
+
+ /* if there was carry into 131st bit, h3:h0 = g3:g0 */
+ mask = 0 - (g4 >> 2);
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ mask = ~mask;
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+
+ /* mac = (h + nonce) % (2^128) */
+ h0 = (u32)(t = (u64)h0 + nonce[0]);
+ h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);
+ h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);
+ h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);
+
+ omac[0] = cpu_to_le32(h0);
+ omac[1] = cpu_to_le32(h1);
+ omac[2] = cpu_to_le32(h2);
+ omac[3] = cpu_to_le32(h3);
}
+#endif /* !CONFIG_X86_64 */
-static unsigned int poly1305_simd_blocks(struct poly1305_ctx *ctx, const u8 *src, unsigned int srclen)
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], bool have_simd)
{
- unsigned int blocks;
+ ctx->nonce[0] = le32_to_cpuvp(&key[16]);
+ ctx->nonce[1] = le32_to_cpuvp(&key[20]);
+ ctx->nonce[2] = le32_to_cpuvp(&key[24]);
+ ctx->nonce[3] = le32_to_cpuvp(&key[28]);
+#ifdef CONFIG_X86_64
+ poly1305_init_x86_64(ctx->opaque, key);
+ ctx->func.blocks = poly1305_blocks_x86_64;
+ ctx->func.emit = poly1305_emit_x86_64;
+#ifdef CONFIG_AS_AVX512
+ if (chacha20poly1305_use_avx512 && have_simd) {
+ ctx->func.blocks = poly1305_blocks_avx512;
+ ctx->func.emit = poly1305_emit_avx;
+ } else
+#endif
#ifdef CONFIG_AS_AVX2
- if (chacha20poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
- if (unlikely(!ctx->wset)) {
- if (!ctx->uset) {
- memcpy(ctx->u, ctx->r, sizeof(ctx->u));
- poly1305_simd_mult(ctx->u, ctx->r);
- ctx->uset = true;
- }
- memcpy(ctx->r3, ctx->u, sizeof(ctx->u));
- poly1305_simd_mult(ctx->r3, ctx->r);
- memcpy(ctx->r4, ctx->r3, sizeof(ctx->u));
- poly1305_simd_mult(ctx->r4, ctx->r);
- ctx->wset = true;
- }
- blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_asm_4block_avx2(ctx->h, src, ctx->r, blocks, ctx->u);
- src += POLY1305_BLOCK_SIZE * 4 * blocks;
- srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
- }
+ if (chacha20poly1305_use_avx2 && have_simd) {
+ ctx->func.blocks = poly1305_blocks_avx2;
+ ctx->func.emit = poly1305_emit_avx;
+ } else
#endif
- if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
- if (unlikely(!ctx->uset)) {
- memcpy(ctx->u, ctx->r, sizeof(ctx->u));
- poly1305_simd_mult(ctx->u, ctx->r);
- ctx->uset = true;
- }
- blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_asm_2block_sse2(ctx->h, src, ctx->r, blocks, ctx->u);
- src += POLY1305_BLOCK_SIZE * 2 * blocks;
- srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
- }
- if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_asm_block_sse2(ctx->h, src, ctx->r, 1);
- srclen -= POLY1305_BLOCK_SIZE;
+#ifdef CONFIG_AS_AVX
+ if (chacha20poly1305_use_avx && have_simd) {
+ ctx->func.blocks = poly1305_blocks_avx;
+ ctx->func.emit = poly1305_emit_avx;
}
- return srclen;
-}
#endif
+#else
+ poly1305_init_generic(ctx->opaque, key);
+#endif
+ ctx->num = 0;
+}
-static void poly1305_update(struct poly1305_ctx *ctx, const u8 *src, unsigned int srclen, bool have_simd)
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, size_t len)
{
- unsigned int bytes;
-
- if (unlikely(ctx->buflen)) {
- bytes = min(srclen, POLY1305_BLOCK_SIZE - ctx->buflen);
- memcpy(ctx->buf + ctx->buflen, src, bytes);
- src += bytes;
- srclen -= bytes;
- ctx->buflen += bytes;
-
- if (ctx->buflen == POLY1305_BLOCK_SIZE) {
#ifdef CONFIG_X86_64
- if (have_simd && chacha20poly1305_use_sse2)
- poly1305_simd_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE);
- else
+ const poly1305_blocks_f blocks = ctx->func.blocks;
+#else
+ const poly1305_blocks_f blocks = poly1305_blocks_generic;
#endif
- poly1305_generic_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE, 1U << 24);
- ctx->buflen = 0;
+
+ const size_t num = ctx->num;
+ size_t rem;
+
+ if (num) {
+ rem = POLY1305_BLOCK_SIZE - num;
+ if (len >= rem) {
+ memcpy(ctx->data + num, inp, rem);
+ blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1);
+ inp += rem;
+ len -= rem;
+ } else {
+ /* Still not enough data to process a block. */
+ memcpy(ctx->data + num, inp, len);
+ ctx->num = num + len;
+ return;
}
}
- if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-#ifdef CONFIG_X86_64
- if (have_simd && chacha20poly1305_use_sse2)
- bytes = poly1305_simd_blocks(ctx, src, srclen);
- else
-#endif
- bytes = poly1305_generic_blocks(ctx, src, srclen, 1U << 24);
- src += srclen - bytes;
- srclen = bytes;
- }
+ rem = len % POLY1305_BLOCK_SIZE;
+ len -= rem;
- if (unlikely(srclen)) {
- ctx->buflen = srclen;
- memcpy(ctx->buf, src, srclen);
+ if (len >= POLY1305_BLOCK_SIZE) {
+ blocks(ctx->opaque, inp, len, 1);
+ inp += len;
}
+
+ if (rem)
+ memcpy(ctx->data, inp, rem);
+
+ ctx->num = rem;
}
-static void poly1305_finish(struct poly1305_ctx *ctx, u8 *dst)
+void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[16])
{
- __le32 *mac = (__le32 *)dst;
- u32 h0, h1, h2, h3, h4;
- u32 g0, g1, g2, g3, g4;
- u32 mask;
- u64 f = 0;
+#ifdef CONFIG_X86_64
+ poly1305_blocks_f blocks = ctx->func.blocks;
+ poly1305_emit_f emit = ctx->func.emit;
+#else
+ poly1305_blocks_f blocks = poly1305_blocks_generic;
+ poly1305_emit_f emit = poly1305_emit_generic;
+#endif
+ size_t num = ctx->num;
- if (unlikely(ctx->buflen)) {
- ctx->buf[ctx->buflen++] = 1;
- memset(ctx->buf + ctx->buflen, 0, POLY1305_BLOCK_SIZE - ctx->buflen);
- poly1305_generic_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE, 0);
+ if (num) {
+ ctx->data[num++] = 1; /* pad bit */
+ while (num < POLY1305_BLOCK_SIZE)
+ ctx->data[num++] = 0;
+ blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0);
}
- /* fully carry h */
- h0 = ctx->h[0];
- h1 = ctx->h[1];
- h2 = ctx->h[2];
- h3 = ctx->h[3];
- h4 = ctx->h[4];
-
- h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
- h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
- h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
- h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
- h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
-
- /* compute h + -p */
- g0 = h0 + 5;
- g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
- g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
- g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
- g4 = h4 + (g3 >> 26) - (1U << 26); g3 &= 0x3ffffff;
-
- /* select h if h < p, or h + -p if h >= p */
- mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
- g0 &= mask;
- g1 &= mask;
- g2 &= mask;
- g3 &= mask;
- g4 &= mask;
- mask = ~mask;
- h0 = (h0 & mask) | g0;
- h1 = (h1 & mask) | g1;
- h2 = (h2 & mask) | g2;
- h3 = (h3 & mask) | g3;
- h4 = (h4 & mask) | g4;
-
- /* h = h % (2^128) */
- h0 = (h0 >> 0) | (h1 << 26);
- h1 = (h1 >> 6) | (h2 << 20);
- h2 = (h2 >> 12) | (h3 << 14);
- h3 = (h3 >> 18) | (h4 << 8);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + h0 + ctx->s[0]; mac[0] = cpu_to_le32(f);
- f = (f >> 32) + h1 + ctx->s[1]; mac[1] = cpu_to_le32(f);
- f = (f >> 32) + h2 + ctx->s[2]; mac[2] = cpu_to_le32(f);
- f = (f >> 32) + h3 + ctx->s[3]; mac[3] = cpu_to_le32(f);
+ emit(ctx->opaque, mac, ctx->nonce);
+
+ /* zero out the state */
+ memzero_explicit(ctx, sizeof(*ctx));
}
+
static const u8 pad0[16] = { 0 };
static struct crypto_alg chacha20_alg = {
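pad0 above supplies the zero bytes for the RFC 7539 AEAD construction used in the hunks below: Poly1305 runs over the additional data, zero padding to a 16-byte boundary, the ciphertext, more padding, and finally the two 64-bit little-endian lengths. The expression (0x10 - len) & 0xf that recurs below is the pad-to-16 length; a minimal, illustrative check:

/* Checks that (0x10 - len) & 0xf pads a length up to the next multiple of 16. */
#include <assert.h>
#include <stddef.h>

static size_t pad16(size_t len)
{
	return (0x10 - len) & 0xf;
}

int main(void)
{
	assert(pad16(0) == 0);            /* already aligned: no padding bytes */
	assert(pad16(1) == 15);
	assert(pad16(16) == 0);
	assert(pad16(17) == 15);
	assert((5 + pad16(5)) % 16 == 0);
	return 0;
}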
@@ -636,22 +665,22 @@ static inline void __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size
chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce);
chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd);
- poly1305_init(&poly1305_state, block0);
+ poly1305_init(&poly1305_state, block0, have_simd);
memzero_explicit(block0, sizeof(block0));
- poly1305_update(&poly1305_state, ad, ad_len, have_simd);
- poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, ad, ad_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
chacha20_crypt(&chacha20_state, dst, src, src_len, have_simd);
- poly1305_update(&poly1305_state, dst, src_len, have_simd);
- poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, dst, src_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf);
len = cpu_to_le64(ad_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
len = cpu_to_le64(src_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
poly1305_finish(&poly1305_state, dst + src_len);
@@ -687,11 +716,11 @@ bool chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *sr
chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce);
chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd);
- poly1305_init(&poly1305_state, block0);
+ poly1305_init(&poly1305_state, block0, have_simd);
memzero_explicit(block0, sizeof(block0));
- poly1305_update(&poly1305_state, ad, ad_len, have_simd);
- poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, ad, ad_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
if (likely(src_len)) {
blkcipher_walk_init(&walk, dst, src, src_len);
@@ -700,25 +729,25 @@ bool chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *sr
size_t chunk_len = rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE);
chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, chunk_len, have_simd);
- poly1305_update(&poly1305_state, walk.dst.virt.addr, chunk_len, have_simd);
+ poly1305_update(&poly1305_state, walk.dst.virt.addr, chunk_len);
ret = blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE);
}
if (walk.nbytes) {
chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, have_simd);
- poly1305_update(&poly1305_state, walk.dst.virt.addr, walk.nbytes, have_simd);
+ poly1305_update(&poly1305_state, walk.dst.virt.addr, walk.nbytes);
ret = blkcipher_walk_done(&chacha20_desc, &walk, 0);
}
}
if (unlikely(ret))
goto err;
- poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf);
len = cpu_to_le64(ad_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
len = cpu_to_le64(src_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
poly1305_finish(&poly1305_state, mac);
scatterwalk_map_and_copy(mac, dst, src_len, sizeof(mac), 1);
@@ -749,21 +778,21 @@ static inline bool __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size
chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce);
chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd);
- poly1305_init(&poly1305_state, block0);
+ poly1305_init(&poly1305_state, block0, have_simd);
memzero_explicit(block0, sizeof(block0));
- poly1305_update(&poly1305_state, ad, ad_len, have_simd);
- poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, ad, ad_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
dst_len = src_len - POLY1305_MAC_SIZE;
- poly1305_update(&poly1305_state, src, dst_len, have_simd);
- poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, src, dst_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf);
len = cpu_to_le64(ad_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
len = cpu_to_le64(dst_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
poly1305_finish(&poly1305_state, mac);
memzero_explicit(&poly1305_state, sizeof(poly1305_state));
@@ -811,11 +840,11 @@ bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *sr
chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce);
chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd);
- poly1305_init(&poly1305_state, block0);
+ poly1305_init(&poly1305_state, block0, have_simd);
memzero_explicit(block0, sizeof(block0));
- poly1305_update(&poly1305_state, ad, ad_len, have_simd);
- poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, ad, ad_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
dst_len = src_len - POLY1305_MAC_SIZE;
if (likely(dst_len)) {
@@ -824,12 +853,12 @@ bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *sr
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
size_t chunk_len = rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE);
- poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len, have_simd);
+ poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len);
chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, chunk_len, have_simd);
ret = blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE);
}
if (walk.nbytes) {
- poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes, have_simd);
+ poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes);
chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, have_simd);
ret = blkcipher_walk_done(&chacha20_desc, &walk, 0);
}
@@ -837,13 +866,13 @@ bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *sr
if (unlikely(ret))
goto err;
- poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf, have_simd);
+ poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf);
len = cpu_to_le64(ad_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
len = cpu_to_le64(dst_len);
- poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd);
+ poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len));
poly1305_finish(&poly1305_state, computed_mac);
memzero_explicit(&poly1305_state, sizeof(poly1305_state));