 src/crypto/chacha20-ssse3-x86_64.S | 102
 src/crypto/chacha20poly1305.c      |  76
 2 files changed, 157 insertions(+), 21 deletions(-)
diff --git a/src/crypto/chacha20-ssse3-x86_64.S b/src/crypto/chacha20-ssse3-x86_64.S
index d7600b3..be4b9b7 100644
--- a/src/crypto/chacha20-ssse3-x86_64.S
+++ b/src/crypto/chacha20-ssse3-x86_64.S
@@ -2,6 +2,7 @@
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -17,6 +18,7 @@
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC: .octa 0x00000003000000020000000100000000
+CONST: .ascii "expand 32-byte k"
.text
@@ -625,3 +627,103 @@ ENTRY(chacha20_asm_4block_xor_ssse3)
mov %r11,%rsp
ret
ENDPROC(chacha20_asm_4block_xor_ssse3)
+
+ENTRY(hchacha20_asm_ssse3)
+ # %rdi: 32 byte output key, o
+ # %rsi: 16 byte nonce, n
+ # %rdx: 32 byte input key, i
+
+ # x0 = constant
+ movdqa CONST(%rip),%xmm0
+ # x1, x2 = i
+ movdqu 0x00(%rdx),%xmm1
+ movdqu 0x10(%rdx),%xmm2
+ # x3 = n
+ movdqu 0x00(%rsi),%xmm3
+
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+ movdqa ROT8(%rip),%xmm4
+ movdqa ROT16(%rip),%xmm5
+
+ mov $10,%ecx
+
+.Lhdoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm3,%xmm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm3,%xmm3
+
+ dec %ecx
+ jnz .Lhdoubleround
+
+ # o0 = x0
+ movdqu %xmm0,0x00(%rdi)
+ # o1 = x3
+ movdqu %xmm3,0x10(%rdi)
+ ret
+ENDPROC(hchacha20_asm_ssse3)
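
For reference, below is a minimal C sketch (not part of the patch) of the HChaCha20 core that the SSSE3 routine above vectorizes: the state is the "expand 32-byte k" constant, the 32-byte input key, and the 16-byte nonce; ten double rounds are applied; and, unlike ChaCha20 proper, there is no final addition of the initial state, the derived key simply being rows 0 and 3 of the result. The helper names (hchacha20_sketch, QR, ROTL32) are illustrative, and the byte loads assume a little-endian host, whereas the kernel code uses explicit le32 conversions. The diagonal-round indexing below is what the pshufd $0x39/$0x4e/$0x93 lane rotations implement in the assembly.

#include <stdint.h>
#include <string.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter round on four 32-bit words. */
#define QR(a, b, c, d) do {                        \
        a += b; d ^= a; d = ROTL32(d, 16);         \
        c += d; b ^= c; b = ROTL32(b, 12);         \
        a += b; d ^= a; d = ROTL32(d, 8);          \
        c += d; b ^= c; b = ROTL32(b, 7);          \
} while (0)

static void hchacha20_sketch(uint8_t out[32], const uint8_t nonce[16],
                             const uint8_t key[32])
{
        uint32_t x[16];
        int i;

        /* Row 0: "expand 32-byte k"; rows 1-2: key; row 3: nonce. */
        x[0] = 0x61707865; x[1] = 0x3320646e;
        x[2] = 0x79622d32; x[3] = 0x6b206574;
        for (i = 0; i < 8; ++i)
                memcpy(&x[4 + i], key + 4 * i, 4);   /* little-endian host assumed */
        for (i = 0; i < 4; ++i)
                memcpy(&x[12 + i], nonce + 4 * i, 4);

        for (i = 0; i < 10; ++i) {
                /* Column round (first half of .Lhdoubleround). */
                QR(x[0], x[4], x[8],  x[12]);
                QR(x[1], x[5], x[9],  x[13]);
                QR(x[2], x[6], x[10], x[14]);
                QR(x[3], x[7], x[11], x[15]);
                /* Diagonal round (after the lane rotations). */
                QR(x[0], x[5], x[10], x[15]);
                QR(x[1], x[6], x[11], x[12]);
                QR(x[2], x[7], x[8],  x[13]);
                QR(x[3], x[4], x[9],  x[14]);
        }

        /* No feed-forward: the derived key is rows 0 and 3 (o0 = x0, o1 = x3). */
        memcpy(out,      &x[0],  16);
        memcpy(out + 16, &x[12], 16);
}
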
diff --git a/src/crypto/chacha20poly1305.c b/src/crypto/chacha20poly1305.c
index d0fbe1c..611008e 100644
--- a/src/crypto/chacha20poly1305.c
+++ b/src/crypto/chacha20poly1305.c
@@ -16,6 +16,7 @@
#include <asm/cpufeature.h>
#include <asm/processor.h>
#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_asm_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key);
asmlinkage void chacha20_asm_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_asm_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#endif
@@ -140,7 +141,7 @@ static void chacha20_generic_block(struct chacha20_ctx *ctx, void *stream)
static const char constant[16] = "expand 32-byte k";
-static void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN])
+static void hchacha20_generic(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN])
{
u32 x[CHACHA20_BLOCK_SIZE / sizeof(u32)];
__le32 *out = (__force __le32 *)derived_key;
@@ -215,6 +216,22 @@ static void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16
out[7] = cpu_to_le32(x[15]);
}
+static inline void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN], bool have_simd)
+{
+ if (!have_simd)
+ goto no_simd;
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AS_SSSE3)
+ if (chacha20poly1305_use_ssse3) {
+ hchacha20_asm_ssse3(derived_key, nonce, key);
+ return;
+ }
+#endif
+
+no_simd:
+ hchacha20_generic(derived_key, nonce, key);
+}
+
static void chacha20_keysetup(struct chacha20_ctx *ctx, const u8 key[CHACHA20_KEY_SIZE], const u8 nonce[sizeof(u64)])
{
ctx->state[0] = le32_to_cpuvp(constant + 0);
@@ -464,7 +481,6 @@ static void poly1305_update(struct poly1305_ctx *ctx, const u8 *src, unsigned in
if (ctx->buflen == POLY1305_BLOCK_SIZE) {
#ifdef CONFIG_X86_64
-
if (have_simd && chacha20poly1305_use_sse2)
poly1305_simd_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE);
else
@@ -476,7 +492,6 @@ static void poly1305_update(struct poly1305_ctx *ctx, const u8 *src, unsigned in
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
#ifdef CONFIG_X86_64
-
if (have_simd && chacha20poly1305_use_sse2)
bytes = poly1305_simd_blocks(ctx, src, srclen);
else
@@ -568,16 +583,16 @@ static struct blkcipher_desc chacha20_desc = {
.tfm = &chacha20_cipher
};
-void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN])
+static inline void __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN],
+ bool have_simd)
{
struct poly1305_ctx poly1305_state;
struct chacha20_ctx chacha20_state;
u8 block0[CHACHA20_BLOCK_SIZE] = { 0 };
__le64 len;
__le64 le_nonce = cpu_to_le64(nonce);
- bool have_simd = chacha20poly1305_init_simd();
chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce);
@@ -603,7 +618,15 @@ void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
memzero_explicit(&poly1305_state, sizeof(poly1305_state));
memzero_explicit(&chacha20_state, sizeof(chacha20_state));
+}
+void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN])
+{
+ bool have_simd;
+ have_simd = chacha20poly1305_init_simd();
+ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, nonce, key, have_simd);
chacha20poly1305_deinit_simd(have_simd);
}
@@ -665,9 +688,10 @@ err:
return !ret;
}
-bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN])
+static inline bool __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN],
+ bool have_simd)
{
struct poly1305_ctx poly1305_state;
struct chacha20_ctx chacha20_state;
@@ -677,13 +701,10 @@ bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
size_t dst_len;
__le64 len;
__le64 le_nonce = cpu_to_le64(nonce);
- bool have_simd;
if (unlikely(src_len < POLY1305_MAC_SIZE))
return false;
- have_simd = chacha20poly1305_init_simd();
-
chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce);
chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd);
@@ -713,10 +734,20 @@ bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
memzero_explicit(&chacha20_state, sizeof(chacha20_state));
- chacha20poly1305_deinit_simd(have_simd);
return !ret;
}
+bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN])
+{
+ bool have_simd, ret;
+ have_simd = chacha20poly1305_init_simd();
+ ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, nonce, key, have_simd);
+ chacha20poly1305_deinit_simd(have_simd);
+ return ret;
+}
+
bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce, const u8 key[CHACHA20POLY1305_KEYLEN])
@@ -792,10 +823,12 @@ void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 nonce[XCHACHA20POLY1305_NONCELEN],
const u8 key[CHACHA20POLY1305_KEYLEN])
{
- u8 derived_key[CHACHA20POLY1305_KEYLEN];
- hchacha20(derived_key, nonce, key);
- chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key);
+ bool have_simd = chacha20poly1305_init_simd();
+ u8 derived_key[CHACHA20POLY1305_KEYLEN] __aligned(16);
+ hchacha20(derived_key, nonce, key, have_simd);
+ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key, have_simd);
memzero_explicit(derived_key, CHACHA20POLY1305_KEYLEN);
+ chacha20poly1305_deinit_simd(have_simd);
}
bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
@@ -803,11 +836,12 @@ bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 nonce[XCHACHA20POLY1305_NONCELEN],
const u8 key[CHACHA20POLY1305_KEYLEN])
{
- u8 derived_key[CHACHA20POLY1305_KEYLEN];
- bool ret;
- hchacha20(derived_key, nonce, key);
- ret = chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key);
+ bool ret, have_simd = chacha20poly1305_init_simd();
+ u8 derived_key[CHACHA20POLY1305_KEYLEN] __aligned(16);
+ hchacha20(derived_key, nonce, key, have_simd);
+ ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, le64_to_cpuvp(nonce + 16), derived_key, have_simd);
memzero_explicit(derived_key, CHACHA20POLY1305_KEYLEN);
+ chacha20poly1305_deinit_simd(have_simd);
return ret;
}
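
As a usage illustration (not part of the patch), the sketch below drives the XChaCha20-Poly1305 pair above: the first 16 bytes of the 24-byte nonce feed hchacha20(), and the remaining 8 bytes (le64_to_cpuvp(nonce + 16)) become the inner ChaCha20-Poly1305 nonce. The example function name is hypothetical, it assumes the usual kernel helpers (get_random_bytes(), memset(), memcmp()) are in scope, and it assumes, as the decrypt path's dst_len handling suggests, that the 16-byte Poly1305 tag is appended to the ciphertext.

static bool xchacha20poly1305_roundtrip_example(void)
{
        u8 key[CHACHA20POLY1305_KEYLEN];        /* 32-byte key            */
        u8 nonce[XCHACHA20POLY1305_NONCELEN];   /* 24-byte extended nonce */
        u8 pt[64];                              /* example plaintext      */
        u8 ct[sizeof(pt) + POLY1305_MAC_SIZE];  /* ciphertext || 16B tag  */
        u8 out[sizeof(pt)];

        get_random_bytes(key, sizeof(key));
        get_random_bytes(nonce, sizeof(nonce));
        memset(pt, 0x42, sizeof(pt));

        /* No additional data in this example (ad = NULL, ad_len = 0). */
        xchacha20poly1305_encrypt(ct, pt, sizeof(pt), NULL, 0, nonce, key);

        /* Returns false if the Poly1305 tag fails to verify. */
        if (!xchacha20poly1305_decrypt(out, ct, sizeof(ct), NULL, 0, nonce, key))
                return false;

        return !memcmp(out, pt, sizeof(pt));
}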