author Jason A. Donenfeld <Jason@zx2c4.com> 2019-03-25 16:23:54 +0100
committer Jason A. Donenfeld <Jason@zx2c4.com> 2019-03-25 16:39:50 +0100
commit 202e57de031608d6eb80e695dd9dec032e7d4542
tree d0b5ea5bfe15e9f2b229dcdf1070a38ec6e7a981 /lib/zinc/chacha20/chacha20-x86_64-glue.c
parent security/keys: rewrite big_key crypto to use Zinc
zinc: use existing ChaCha20 x86_64 implementation (jd/zinc-light)
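This switches Zinc's x86_64 ChaCha20 glue from its own dedicated assembly entry points to the per-block routines already present in the kernel's x86_64 ChaCha implementation: chacha_block_xor_ssse3() and chacha_4block_xor_ssse3(), the 2/4/8-block AVX2 and AVX512VL variants, and hchacha_block_ssse3(). These take a pointer to the full 16-word state matrix plus an explicit nrounds argument instead of separate key and counter arrays. The dedicated AVX-512F zmm path is dropped, along with its Skylake-X downclocking workaround, in favor of the AVX-512VL ymm routines (which now also require AVX512BW), and HChaCha20 builds the state matrix in the glue code before calling hchacha_block_ssse3().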
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat
-rw-r--r--  lib/zinc/chacha20/chacha20-x86_64-glue.c | 132
1 file changed, 99 insertions(+), 33 deletions(-)
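The adopted routines take a pointer to ChaCha20's full 16-word state matrix rather than separate key and counter arrays. As a point of reference, the sketch below (plain C, not part of the patch; le32() and chacha20_init_state() are illustrative helper names) shows how that matrix is laid out: four constant words, eight key words, the 32-bit block counter, and the 96-bit nonce. The constant words match the CHACHA20_CONSTANT_* values visible in the hchacha20_arch() hunk, where HChaCha20 instead fills words 12-15 with its 16-byte nonce.

#include <stdint.h>

static uint32_t le32(const uint8_t *p)
{
	/* unaligned-safe little-endian 32-bit load */
	return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
	       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

static void chacha20_init_state(uint32_t state[16], const uint8_t key[32],
				const uint8_t nonce[12], uint32_t counter)
{
	int i;

	state[0] = 0x61707865;	/* "expa" */
	state[1] = 0x3320646e;	/* "nd 3" */
	state[2] = 0x79622d32;	/* "2-by" */
	state[3] = 0x6b206574;	/* "te k" */
	for (i = 0; i < 8; ++i)
		state[4 + i] = le32(key + 4 * i);
	state[12] = counter;		/* 32-bit block counter */
	state[13] = le32(nonce + 0);	/* 96-bit nonce */
	state[14] = le32(nonce + 4);
	state[15] = le32(nonce + 8);
}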
diff --git a/lib/zinc/chacha20/chacha20-x86_64-glue.c b/lib/zinc/chacha20/chacha20-x86_64-glue.c
index 6db499a9ea82..3a0546885502 100644
--- a/lib/zinc/chacha20/chacha20-x86_64-glue.c
+++ b/lib/zinc/chacha20/chacha20-x86_64-glue.c
@@ -8,24 +8,29 @@
#include <asm/processor.h>
#include <asm/intel-family.h>
-asmlinkage void hchacha20_ssse3(u32 *derived_key, const u8 *nonce,
- const u8 *key);
-asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
static bool chacha20_use_ssse3 __ro_after_init;
static bool chacha20_use_avx2 __ro_after_init;
-static bool chacha20_use_avx512 __ro_after_init;
static bool chacha20_use_avx512vl __ro_after_init;
static bool *const chacha20_nobs[] __initconst = {
- &chacha20_use_ssse3, &chacha20_use_avx2, &chacha20_use_avx512,
- &chacha20_use_avx512vl };
+ &chacha20_use_ssse3, &chacha20_use_avx2, &chacha20_use_avx512vl };
static void __init chacha20_fpu_init(void)
{
@@ -34,23 +39,71 @@ static void __init chacha20_fpu_init(void)
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
- chacha20_use_avx512 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
chacha20_use_avx512vl =
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
XFEATURE_MASK_AVX512, NULL);
}
+static void chacha20_avx512vl(struct chacha20_ctx *ctx, u8 *dst,
+ const u8 *src, unsigned int bytes)
+{
+ while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx512vl(ctx->state, dst, src, bytes, 20);
+ bytes -= CHACHA20_BLOCK_SIZE * 8;
+ src += CHACHA20_BLOCK_SIZE * 8;
+ dst += CHACHA20_BLOCK_SIZE * 8;
+ ctx->counter[0] += 8;
+ }
+ if (bytes > CHACHA20_BLOCK_SIZE * 4)
+ chacha_8block_xor_avx512vl(ctx->state, dst, src, bytes, 20);
+ else if (bytes > CHACHA20_BLOCK_SIZE * 2)
+ chacha_4block_xor_avx512vl(ctx->state, dst, src, bytes, 20);
+ else if (bytes)
+ chacha_2block_xor_avx512vl(ctx->state, dst, src, bytes, 20);
+ ctx->counter[0] += round_up(bytes, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
+static void chacha20_avx2(struct chacha20_ctx *ctx, u8 *dst,
+ const u8 *src, unsigned int bytes)
+{
+ while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx2(ctx->state, dst, src, bytes, 20);
+ bytes -= CHACHA20_BLOCK_SIZE * 8;
+ src += CHACHA20_BLOCK_SIZE * 8;
+ dst += CHACHA20_BLOCK_SIZE * 8;
+ ctx->counter[0] += 8;
+ }
+ if (bytes > CHACHA20_BLOCK_SIZE * 4)
+ chacha_8block_xor_avx2(ctx->state, dst, src, bytes, 20);
+ else if (bytes > CHACHA20_BLOCK_SIZE * 2)
+ chacha_4block_xor_avx2(ctx->state, dst, src, bytes, 20);
+ else if (bytes)
+ chacha_2block_xor_avx2(ctx->state, dst, src, bytes, 20);
+ ctx->counter[0] += round_up(bytes, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
+static void chacha20_ssse3(struct chacha20_ctx *ctx, u8 *dst,
+ const u8 *src, unsigned int bytes)
+{
+ while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+ chacha_4block_xor_ssse3(ctx->state, dst, src, bytes, 20);
+ bytes -= CHACHA20_BLOCK_SIZE * 4;
+ src += CHACHA20_BLOCK_SIZE * 4;
+ dst += CHACHA20_BLOCK_SIZE * 4;
+ ctx->counter[0] += 4;
+ }
+ if (bytes > CHACHA20_BLOCK_SIZE)
+ chacha_4block_xor_ssse3(ctx->state, dst, src, bytes, 20);
+ else if (bytes)
+ chacha_block_xor_ssse3(ctx->state, dst, src, bytes, 20);
+ ctx->counter[0] += round_up(bytes, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
const u8 *src, size_t len,
simd_context_t *simd_context)
@@ -66,18 +119,12 @@ static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
for (;;) {
const size_t bytes = min_t(size_t, len, PAGE_SIZE);
- if (IS_ENABLED(CONFIG_AS_AVX512) && chacha20_use_avx512 &&
- len >= CHACHA20_BLOCK_SIZE * 8)
- chacha20_avx512(dst, src, bytes, ctx->key, ctx->counter);
- else if (IS_ENABLED(CONFIG_AS_AVX512) && chacha20_use_avx512vl &&
- len >= CHACHA20_BLOCK_SIZE * 4)
- chacha20_avx512vl(dst, src, bytes, ctx->key, ctx->counter);
- else if (IS_ENABLED(CONFIG_AS_AVX2) && chacha20_use_avx2 &&
- len >= CHACHA20_BLOCK_SIZE * 4)
- chacha20_avx2(dst, src, bytes, ctx->key, ctx->counter);
+ if (IS_ENABLED(CONFIG_AS_AVX512) && chacha20_use_avx512vl)
+ chacha20_avx512vl(ctx, dst, src, bytes);
+ else if (IS_ENABLED(CONFIG_AS_AVX2) && chacha20_use_avx2)
+ chacha20_avx2(ctx, dst, src, bytes);
else
- chacha20_ssse3(dst, src, bytes, ctx->key, ctx->counter);
- ctx->counter[0] += (bytes + 63) / 64;
+ chacha20_ssse3(ctx, dst, src, bytes);
len -= bytes;
if (!len)
break;
@@ -96,7 +143,26 @@ static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
{
if (IS_ENABLED(CONFIG_AS_SSSE3) && chacha20_use_ssse3 &&
simd_use(simd_context)) {
- hchacha20_ssse3(derived_key, nonce, key);
+ u32 x[] __aligned(16) = {
+ CHACHA20_CONSTANT_EXPA,
+ CHACHA20_CONSTANT_ND_3,
+ CHACHA20_CONSTANT_2_BY,
+ CHACHA20_CONSTANT_TE_K,
+ get_unaligned_le32(key + 0),
+ get_unaligned_le32(key + 4),
+ get_unaligned_le32(key + 8),
+ get_unaligned_le32(key + 12),
+ get_unaligned_le32(key + 16),
+ get_unaligned_le32(key + 20),
+ get_unaligned_le32(key + 24),
+ get_unaligned_le32(key + 28),
+ get_unaligned_le32(nonce + 0),
+ get_unaligned_le32(nonce + 4),
+ get_unaligned_le32(nonce + 8),
+ get_unaligned_le32(nonce + 12)
+ };
+
+ hchacha_block_ssse3(x, derived_key, 20);
return true;
}
return false;
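
A note on the counter bookkeeping in the new glue functions: the kernel block routines handle a partial trailing block internally (they accept an arbitrary byte length), so the glue only has to advance the 32-bit block counter by the number of blocks consumed, with any partial final block rounding up to a whole one. A standalone sketch of that arithmetic, assuming the 64-byte CHACHA20_BLOCK_SIZE and a generic round_up() in place of the kernel macro:

#include <assert.h>
#include <stdint.h>

#define CHACHA20_BLOCK_SIZE 64

/* generic stand-in for the kernel's round_up(): round x up to a multiple of y */
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

static uint32_t blocks_consumed(unsigned int bytes)
{
	/* ceil(bytes / 64): a partial final block still uses a counter slot */
	return round_up(bytes, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
}

int main(void)
{
	assert(blocks_consumed(0) == 0);
	assert(blocks_consumed(1) == 1);
	assert(blocks_consumed(64) == 1);
	assert(blocks_consumed(65) == 2);
	assert(blocks_consumed(256) == 4);
	return 0;
}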