aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--arch/x86/crypto/Makefile7
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S232
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c114
-rw-r--r--arch/x86/crypto/blake2s-glue.c3
-rw-r--r--arch/x86/crypto/blake2s-shash.c77
-rw-r--r--arch/x86/crypto/blowfish_glue.c4
-rw-r--r--arch/x86/crypto/polyval-clmulni_asm.S321
-rw-r--r--arch/x86/crypto/polyval-clmulni_glue.c203
8 files changed, 796 insertions, 165 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2831685adf6f..04d07ab744b2 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -61,14 +61,15 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-blake2s-x86_64-y := blake2s-shash.o
-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
+obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
+polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
+
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 43852ba6e19c..2402b9418cd7 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -23,6 +23,11 @@
#define VMOVDQ vmovdqu
+/*
+ * Note: the "x" prefix in these aliases means "this is an xmm register". The
+ * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
+ * counter".
+ */
#define xdata0 %xmm0
#define xdata1 %xmm1
#define xdata2 %xmm2
@@ -31,8 +36,10 @@
#define xdata5 %xmm5
#define xdata6 %xmm6
#define xdata7 %xmm7
-#define xcounter %xmm8
-#define xbyteswap %xmm9
+#define xcounter %xmm8 // CTR mode only
+#define xiv %xmm8 // XCTR mode only
+#define xbyteswap %xmm9 // CTR mode only
+#define xtmp %xmm9 // XCTR mode only
#define xkey0 %xmm10
#define xkey4 %xmm11
#define xkey8 %xmm12
@@ -45,7 +52,7 @@
#define p_keys %rdx
#define p_out %rcx
#define num_bytes %r8
-
+#define counter %r9 // XCTR mode only
#define tmp %r10
#define DDQ_DATA 0
#define XDATA 1
@@ -102,7 +109,7 @@ ddq_add_8:
* do_aes num_in_par load_keys key_len
* This increments p_in, but not p_out
*/
-.macro do_aes b, k, key_len
+.macro do_aes b, k, key_len, xctr
.set by, \b
.set load_keys, \k
.set klen, \key_len
@@ -111,29 +118,48 @@ ddq_add_8:
vmovdqa 0*16(p_keys), xkey0
.endif
- vpshufb xbyteswap, xcounter, xdata0
-
- .set i, 1
- .rept (by - 1)
- club XDATA, i
- vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
- vptest ddq_low_msk(%rip), var_xdata
- jnz 1f
- vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
- vpaddq ddq_high_add_1(%rip), xcounter, xcounter
- 1:
- vpshufb xbyteswap, var_xdata, var_xdata
- .set i, (i +1)
- .endr
+ .if \xctr
+ movq counter, xtmp
+ .set i, 0
+ .rept (by)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
+ .set i, (i +1)
+ .endr
+ .set i, 0
+ .rept (by)
+ club XDATA, i
+ vpxor xiv, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .else
+ vpshufb xbyteswap, xcounter, xdata0
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
+ vptest ddq_low_msk(%rip), var_xdata
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
vmovdqa 1*16(p_keys), xkeyA
vpxor xkey0, xdata0, xdata0
- vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
- vptest ddq_low_msk(%rip), xcounter
- jnz 1f
- vpaddq ddq_high_add_1(%rip), xcounter, xcounter
- 1:
+ .if \xctr
+ add $by, counter
+ .else
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
+ vptest ddq_low_msk(%rip), xcounter
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ .endif
.set i, 1
.rept (by - 1)
@@ -371,94 +397,99 @@ ddq_add_8:
.endr
.endm
-.macro do_aes_load val, key_len
- do_aes \val, 1, \key_len
+.macro do_aes_load val, key_len, xctr
+ do_aes \val, 1, \key_len, \xctr
.endm
-.macro do_aes_noload val, key_len
- do_aes \val, 0, \key_len
+.macro do_aes_noload val, key_len, xctr
+ do_aes \val, 0, \key_len, \xctr
.endm
/* main body of aes ctr load */
-.macro do_aes_ctrmain key_len
+.macro do_aes_ctrmain key_len, xctr
cmp $16, num_bytes
- jb .Ldo_return2\key_len
+ jb .Ldo_return2\xctr\key_len
- vmovdqa byteswap_const(%rip), xbyteswap
- vmovdqu (p_iv), xcounter
- vpshufb xbyteswap, xcounter, xcounter
+ .if \xctr
+ shr $4, counter
+ vmovdqu (p_iv), xiv
+ .else
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+ .endif
mov num_bytes, tmp
and $(7*16), tmp
- jz .Lmult_of_8_blks\key_len
+ jz .Lmult_of_8_blks\xctr\key_len
/* 1 <= tmp <= 7 */
cmp $(4*16), tmp
- jg .Lgt4\key_len
- je .Leq4\key_len
+ jg .Lgt4\xctr\key_len
+ je .Leq4\xctr\key_len
-.Llt4\key_len:
+.Llt4\xctr\key_len:
cmp $(2*16), tmp
- jg .Leq3\key_len
- je .Leq2\key_len
+ jg .Leq3\xctr\key_len
+ je .Leq2\xctr\key_len
-.Leq1\key_len:
- do_aes_load 1, \key_len
+.Leq1\xctr\key_len:
+ do_aes_load 1, \key_len, \xctr
add $(1*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq2\key_len:
- do_aes_load 2, \key_len
+.Leq2\xctr\key_len:
+ do_aes_load 2, \key_len, \xctr
add $(2*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq3\key_len:
- do_aes_load 3, \key_len
+.Leq3\xctr\key_len:
+ do_aes_load 3, \key_len, \xctr
add $(3*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq4\key_len:
- do_aes_load 4, \key_len
+.Leq4\xctr\key_len:
+ do_aes_load 4, \key_len, \xctr
add $(4*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Lgt4\key_len:
+.Lgt4\xctr\key_len:
cmp $(6*16), tmp
- jg .Leq7\key_len
- je .Leq6\key_len
+ jg .Leq7\xctr\key_len
+ je .Leq6\xctr\key_len
-.Leq5\key_len:
- do_aes_load 5, \key_len
+.Leq5\xctr\key_len:
+ do_aes_load 5, \key_len, \xctr
add $(5*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq6\key_len:
- do_aes_load 6, \key_len
+.Leq6\xctr\key_len:
+ do_aes_load 6, \key_len, \xctr
add $(6*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Leq7\key_len:
- do_aes_load 7, \key_len
+.Leq7\xctr\key_len:
+ do_aes_load 7, \key_len, \xctr
add $(7*16), p_out
and $(~7*16), num_bytes
- jz .Ldo_return2\key_len
- jmp .Lmain_loop2\key_len
+ jz .Ldo_return2\xctr\key_len
+ jmp .Lmain_loop2\xctr\key_len
-.Lmult_of_8_blks\key_len:
+.Lmult_of_8_blks\xctr\key_len:
.if (\key_len != KEY_128)
vmovdqa 0*16(p_keys), xkey0
vmovdqa 4*16(p_keys), xkey4
@@ -471,17 +502,19 @@ ddq_add_8:
vmovdqa 9*16(p_keys), xkey12
.endif
.align 16
-.Lmain_loop2\key_len:
+.Lmain_loop2\xctr\key_len:
/* num_bytes is a multiple of 8 and >0 */
- do_aes_noload 8, \key_len
+ do_aes_noload 8, \key_len, \xctr
add $(8*16), p_out
sub $(8*16), num_bytes
- jne .Lmain_loop2\key_len
+ jne .Lmain_loop2\xctr\key_len
-.Ldo_return2\key_len:
- /* return updated IV */
- vpshufb xbyteswap, xcounter, xcounter
- vmovdqu xcounter, (p_iv)
+.Ldo_return2\xctr\key_len:
+ .if !\xctr
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ .endif
RET
.endm
@@ -494,7 +527,7 @@ ddq_add_8:
*/
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
/* call the aes main loop */
- do_aes_ctrmain KEY_128
+ do_aes_ctrmain KEY_128 0
SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
@@ -507,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
/* call the aes main loop */
- do_aes_ctrmain KEY_192
+ do_aes_ctrmain KEY_192 0
SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
@@ -520,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
*/
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
/* call the aes main loop */
- do_aes_ctrmain KEY_256
+ do_aes_ctrmain KEY_256 0
SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
+
+/*
+ * routine to do AES128 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+ * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128 1
+
+SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+ * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192 1
+
+SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 XCTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
+ * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
+ */
+SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256 1
+
+SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 41901ba9d3a2..a5b0cb3efeba 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -135,6 +135,20 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
+
+
+asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
+ const void *keys, u8 *out, unsigned int num_bytes,
+ unsigned int byte_ctr);
+
+asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
+ const void *keys, u8 *out, unsigned int num_bytes,
+ unsigned int byte_ctr);
+
+asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
+ const void *keys, u8 *out, unsigned int num_bytes,
+ unsigned int byte_ctr);
+
/*
* asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -527,6 +541,59 @@ static int ctr_crypt(struct skcipher_request *req)
return err;
}
+static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv,
+ unsigned int byte_ctr)
+{
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len,
+ byte_ctr);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len,
+ byte_ctr);
+ else
+ aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len,
+ byte_ctr);
+}
+
+static int xctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ u8 keystream[AES_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ unsigned int byte_ctr = 0;
+ int err;
+ __le32 block[AES_BLOCK_SIZE / sizeof(__le32)];
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ kernel_fpu_begin();
+ if (nbytes & AES_BLOCK_MASK)
+ aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes & AES_BLOCK_MASK,
+ walk.iv, byte_ctr);
+ nbytes &= ~AES_BLOCK_MASK;
+ byte_ctr += walk.nbytes - nbytes;
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(block, walk.iv, AES_BLOCK_SIZE);
+ block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE);
+ aesni_enc(ctx, keystream, (u8 *)block);
+ crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes -
+ nbytes, walk.src.virt.addr + walk.nbytes
+ - nbytes, keystream, nbytes);
+ byte_ctr += nbytes;
+ nbytes = 0;
+ }
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ return err;
+}
+
static int
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
{
@@ -1051,6 +1118,33 @@ static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
#ifdef CONFIG_X86_64
+/*
+ * XCTR does not have a non-AVX implementation, so it must be enabled
+ * conditionally.
+ */
+static struct skcipher_alg aesni_xctr = {
+ .base = {
+ .cra_name = "__xctr(aes)",
+ .cra_driver_name = "__xctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = xctr_crypt,
+ .decrypt = xctr_crypt,
+};
+
+static struct simd_skcipher_alg *aesni_simd_xctr;
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
{
@@ -1163,7 +1257,7 @@ static int __init aesni_init(void)
static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
pr_info("AES CTR mode by8 optimization enabled\n");
}
-#endif
+#endif /* CONFIG_X86_64 */
err = crypto_register_alg(&aesni_cipher_alg);
if (err)
@@ -1180,8 +1274,22 @@ static int __init aesni_init(void)
if (err)
goto unregister_skciphers;
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ err = simd_register_skciphers_compat(&aesni_xctr, 1,
+ &aesni_simd_xctr);
+ if (err)
+ goto unregister_aeads;
+#endif /* CONFIG_X86_64 */
+
return 0;
+#ifdef CONFIG_X86_64
+unregister_aeads:
+ simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ aesni_simd_aeads);
+#endif /* CONFIG_X86_64 */
+
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
@@ -1197,6 +1305,10 @@ static void __exit aesni_exit(void)
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
crypto_unregister_alg(&aesni_cipher_alg);
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
+#endif /* CONFIG_X86_64 */
}
late_initcall(aesni_init);
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 69853c13e8fb..aaba21230528 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -4,7 +4,6 @@
*/
#include <crypto/internal/blake2s.h>
-#include <crypto/internal/simd.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
- if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
+ if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
deleted file mode 100644
index 59ae28abe35c..000000000000
--- a/arch/x86/crypto/blake2s-shash.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-static int crypto_blake2s_update_x86(struct shash_desc *desc,
- const u8 *in, unsigned int inlen)
-{
- return crypto_blake2s_update(desc, in, inlen, false);
-}
-
-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
-{
- return crypto_blake2s_final(desc, out, false);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size) \
- { \
- .base.cra_name = name, \
- .base.cra_driver_name = driver_name, \
- .base.cra_priority = 200, \
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
- .base.cra_module = THIS_MODULE, \
- .digestsize = digest_size, \
- .setkey = crypto_blake2s_setkey, \
- .init = crypto_blake2s_init, \
- .update = crypto_blake2s_update_x86, \
- .final = crypto_blake2s_final_x86, \
- .descsize = sizeof(struct blake2s_state), \
- }
-
-static struct shash_alg blake2s_algs[] = {
- BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
- BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
- BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
- BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
-};
-
-static int __init blake2s_mod_init(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
- return 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
-}
-
-module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index ba06322c1e39..019c64c1340a 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -144,7 +144,7 @@ static int cbc_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_encrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -225,7 +225,7 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_decrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
new file mode 100644
index 000000000000..a6ebe4e7dd2b
--- /dev/null
+++ b/arch/x86/crypto/polyval-clmulni_asm.S
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define KEY_POWERS %rdi
+#define MSG %rsi
+#define BLOCKS_LEFT %rdx
+#define ACCUMULATOR %rcx
+#define TMP %rax
+
+.section .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+ .set i, 0
+ .rept (\count)
+ schoolbook1_iteration i 0
+ .set i, (i +1)
+ .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += X_0 * Y_1 + X_1 * Y_0
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+ movups (16*\i)(MSG), %xmm0
+ .if (\i == 0 && \xor_sum == 1)
+ pxor SUM, %xmm0
+ .endif
+ vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+ vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+ vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+ vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+ vpxor %xmm2, MI, MI
+ vpxor %xmm1, LO, LO
+ vpxor %xmm4, HI, HI
+ vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+ vpclmulqdq $0x01, %xmm0, %xmm1, MI
+ vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+ vpclmulqdq $0x00, %xmm0, %xmm1, LO
+ vpclmulqdq $0x11, %xmm0, %xmm1, HI
+ vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+ vpslldq $8, MI, PL
+ vpsrldq $8, MI, PH
+ pxor LO, PL
+ pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+ pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
+ pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
+ pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+ vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+ pxor LO, LO
+ pxor HI, HI
+ pxor MI, MI
+
+ schoolbook1_iteration 7 0
+ .if \reduce
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 6 0
+ .if \reduce
+ pshufd $0b01001110, TMP_XMM, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 5 0
+ .if \reduce
+ pxor PL, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 4 0
+ .if \reduce
+ pxor TMP_XMM, PH
+ .endif
+
+ schoolbook1_iteration 3 0
+ .if \reduce
+ pclmulqdq $0x11, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 2 0
+ .if \reduce
+ vpxor TMP_XMM, PH, SUM
+ .endif
+
+ schoolbook1_iteration 1 0
+
+ schoolbook1_iteration 0 1
+
+ addq $(8*16), MSG
+ schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+ mov BLOCKS_LEFT, TMP
+ shlq $4, TMP
+ addq $(16*STRIDE_BLOCKS), KEY_POWERS
+ subq TMP, KEY_POWERS
+
+ movups (MSG), %xmm0
+ pxor SUM, %xmm0
+ movaps (KEY_POWERS), %xmm1
+ schoolbook1_noload
+ dec BLOCKS_LEFT
+ addq $16, MSG
+ addq $16, KEY_POWERS
+
+ test $4, BLOCKS_LEFT
+ jz .Lpartial4BlocksDone
+ schoolbook1 4
+ addq $(4*16), MSG
+ addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+ test $2, BLOCKS_LEFT
+ jz .Lpartial2BlocksDone
+ schoolbook1 2
+ addq $(2*16), MSG
+ addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+ test $1, BLOCKS_LEFT
+ jz .LpartialDone
+ schoolbook1 1
+.LpartialDone:
+ schoolbook2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Perform montgomery multiplication in GF(2^128) and store result in op1.
+ *
+ * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
+ * If op1, op2 are in montgomery form, this computes the montgomery
+ * form of op1*op2.
+ *
+ * void clmul_polyval_mul(u8 *op1, const u8 *op2);
+ */
+SYM_FUNC_START(clmul_polyval_mul)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (%rdi), %xmm0
+ movups (%rsi), %xmm1
+ schoolbook1_noload
+ schoolbook2
+ montgomery_reduction SUM
+ movups SUM, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(clmul_polyval_mul)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to precomputed key powers h^8 ... h^1
+ * rsi - pointer to message blocks
+ * rdx - number of blocks to hash
+ * rcx - pointer to the accumulator
+ *
+ * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
+ * const u8 *in, size_t nblocks, u8 *accumulator);
+ */
+SYM_FUNC_START(clmul_polyval_update)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (ACCUMULATOR), SUM
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExit
+ full_stride 0
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ jns .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ add $STRIDE_BLOCKS, BLOCKS_LEFT
+ jz .LskipPartial
+ partial_stride
+.LskipPartial:
+ movups SUM, (ACCUMULATOR)
+ FRAME_END
+ RET
+SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
new file mode 100644
index 000000000000..b7664d018851
--- /dev/null
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Glue code for POLYVAL using PCMULQDQ-NI
+ *
+ * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Copyright 2021 Google LLC
+ */
+
+/*
+ * Glue code based on ghash-clmulni-intel_glue.c.
+ *
+ * This implementation of POLYVAL uses montgomery multiplication
+ * accelerated by PCLMULQDQ-NI to implement the finite field
+ * operations.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/polyval.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+#define NUM_KEY_POWERS 8
+
+struct polyval_tfm_ctx {
+ /*
+ * These powers must be in the order h^8, ..., h^1.
+ */
+ u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
+};
+
+struct polyval_desc_ctx {
+ u8 buffer[POLYVAL_BLOCK_SIZE];
+ u32 bytes;
+};
+
+asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
+
+static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_fpu_begin();
+ clmul_polyval_update(keys, in, nblocks, accumulator);
+ kernel_fpu_end();
+ } else {
+ polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
+ nblocks, accumulator);
+ }
+}
+
+static void internal_polyval_mul(u8 *op1, const u8 *op2)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_fpu_begin();
+ clmul_polyval_mul(op1, op2);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_non4k(op1, op2);
+ }
+}
+
+static int polyval_x86_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+ int i;
+
+ if (keylen != POLYVAL_BLOCK_SIZE)
+ return -EINVAL;
+
+ memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
+
+ for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
+ memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
+ internal_polyval_mul(tctx->key_powers[i],
+ tctx->key_powers[i+1]);
+ }
+
+ return 0;
+}
+
+static int polyval_x86_init(struct shash_desc *desc)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int polyval_x86_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ u8 *pos;
+ unsigned int nblocks;
+ unsigned int n;
+
+ if (dctx->bytes) {
+ n = min(srclen, dctx->bytes);
+ pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ while (srclen >= POLYVAL_BLOCK_SIZE) {
+ /* Allow rescheduling every 4K bytes. */
+ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ internal_polyval_update(tctx, src, nblocks, dctx->buffer);
+ srclen -= nblocks * POLYVAL_BLOCK_SIZE;
+ src += nblocks * POLYVAL_BLOCK_SIZE;
+ }
+
+ if (srclen) {
+ dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
+ pos = dctx->buffer;
+ while (srclen--)
+ *pos++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static int polyval_x86_final(struct shash_desc *desc, u8 *dst)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+
+ if (dctx->bytes) {
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg polyval_alg = {
+ .digestsize = POLYVAL_DIGEST_SIZE,
+ .init = polyval_x86_init,
+ .update = polyval_x86_update,
+ .final = polyval_x86_final,
+ .setkey = polyval_x86_setkey,
+ .descsize = sizeof(struct polyval_desc_ctx),
+ .base = {
+ .cra_name = "polyval",
+ .cra_driver_name = "polyval-clmulni",
+ .cra_priority = 200,
+ .cra_blocksize = POLYVAL_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct polyval_tfm_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
+
+__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
+static int __init polyval_clmulni_mod_init(void)
+{
+ if (!x86_match_cpu(pcmul_cpu_id))
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return -ENODEV;
+
+ return crypto_register_shash(&polyval_alg);
+}
+
+static void __exit polyval_clmulni_mod_exit(void)
+{
+ crypto_unregister_shash(&polyval_alg);
+}
+
+module_init(polyval_clmulni_mod_init);
+module_exit(polyval_clmulni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS_CRYPTO("polyval");
+MODULE_ALIAS_CRYPTO("polyval-clmulni");