From ca81a1a1b8d79dd6706c9463a81e9491e940ca2b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 26 Feb 2013 17:52:15 +0800
Subject: crypto: crc32c - Kill pointless CRYPTO_CRC32C_X86_64 option

This bool option can never be set to anything other than y.  So
let's just kill it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'crypto')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 05c0ce52f96d..aed52b2e4a55 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -322,19 +322,9 @@ config CRYPTO_CRC32C
 	  by iSCSI for header and data digests and by others.
 	  See Castagnoli93.  Module will be crc32c.
 
-config CRYPTO_CRC32C_X86_64
-	bool
-	depends on X86 && 64BIT
-	select CRYPTO_HASH
-	help
-	  In Intel processor with SSE4.2 supported, the processor will
-	  support CRC32C calculation using hardware accelerated CRC32
-	  instruction optimized with PCLMULQDQ instruction when available.
-
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
 	depends on X86
-	select CRYPTO_CRC32C_X86_64 if 64BIT
 	select CRYPTO_HASH
 	help
 	  In Intel processor with SSE4.2 supported, the processor will
-- 
cgit v1.2.3-59-g8ed1b


From a84fb791cb467851772a9196c05531be4abaf1bb Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Sun, 24 Feb 2013 14:09:12 +0100
Subject: crypto: user - constify netlink dispatch table

There is no need to modify the netlink dispatch table at runtime and
making it const even makes the resulting object file slightly smaller.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'crypto')

diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
index dfd511fb39ee..1512e41cd93d 100644
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -440,7 +440,7 @@ static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
 
 #undef MSGSIZE
 
-static struct crypto_link {
+static const struct crypto_link {
 	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
@@ -456,7 +456,7 @@ static struct crypto_link {
 static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct nlattr *attrs[CRYPTOCFGA_MAX+1];
-	struct crypto_link *link;
+	const struct crypto_link *link;
 	int type, err;
 
 	type = nlh->nlmsg_type;
-- 
cgit v1.2.3-59-g8ed1b


From 35d2c9d0c3cfd90850dc647250610587743e1f29 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Tue, 26 Mar 2013 13:58:49 -0700
Subject: crypto: sha256 - Expose SHA256 generic routine to be callable
 externally.

Other SHA256 routines may need to use the generic routine when
the FPU is not available.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/sha256_generic.c | 11 ++++++-----
 include/crypto/sha.h    |  2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'crypto')

diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c
index c3ed4ec924e1..543366779524 100644
--- a/crypto/sha256_generic.c
+++ b/crypto/sha256_generic.c
@@ -246,7 +246,7 @@ static int sha256_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int sha256_update(struct shash_desc *desc, const u8 *data,
+int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
@@ -277,6 +277,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data,
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha256_update);
 
 static int sha256_final(struct shash_desc *desc, u8 *out)
 {
@@ -293,10 +294,10 @@ static int sha256_final(struct shash_desc *desc, u8 *out)
 	/* Pad out to 56 mod 64. */
 	index = sctx->count & 0x3f;
 	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha256_update(desc, padding, pad_len);
+	crypto_sha256_update(desc, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -339,7 +340,7 @@ static int sha256_import(struct shash_desc *desc, const void *in)
 static struct shash_alg sha256_algs[2] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_init,
-	.update		=	sha256_update,
+	.update		=	crypto_sha256_update,
 	.final		=	sha256_final,
 	.export		=	sha256_export,
 	.import		=	sha256_import,
@@ -355,7 +356,7 @@ static struct shash_alg sha256_algs[2] = { {
 }, {
 	.digestsize	=	SHA224_DIGEST_SIZE,
 	.init		=	sha224_init,
-	.update		=	sha256_update,
+	.update		=	crypto_sha256_update,
 	.final		=	sha224_final,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
index c6c9c1fe460c..f46ff61e3780 100644
--- a/include/crypto/sha.h
+++ b/include/crypto/sha.h
@@ -87,4 +87,6 @@ struct shash_desc;
 extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
 
+extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 8275d1aa642295edd34a11a117080384bb9d65c2 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Tue, 26 Mar 2013 13:59:17 -0700
Subject: crypto: sha256 - Create module providing optimized SHA256 routines
 using SSSE3, AVX or AVX2 instructions.

Add glue code and config options to create a crypto module that
uses SSSE3/AVX/AVX2 optimized SHA256 x86_64 assembly routines.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile            |   2 +
 arch/x86/crypto/sha256_ssse3_glue.c | 275 ++++++++++++++++++++++++++++++++++++
 crypto/Kconfig                      |  11 ++
 3 files changed, 288 insertions(+)
 create mode 100644 arch/x86/crypto/sha256_ssse3_glue.c

(limited to 'crypto')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 83681a317422..9414b91b8f49 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -66,3 +67,4 @@ sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
new file mode 100644
index 000000000000..597d4da69656
--- /dev/null
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -0,0 +1,275 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA256 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha256_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation.
+ *
+ * Author:
+ *     Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <linux/string.h>
+
+asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
+				     u64 rounds);
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
+				     u64 rounds);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
+				     u64 rounds);
+#endif
+
+static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
+
+
+static int sha256_ssse3_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+	/* Finish the buffered block; callers pass partial + len >= block size. */
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_transform_asm(sctx->buf, sctx->state, 1);
+	}
+	/* Hash all remaining whole blocks straight from the caller's data. */
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_transform_asm(data + done, sctx->state, (u64) rounds);
+
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+	/* Stash the tail (less than one block) for the next call. */
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha256_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha256_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!irq_fpu_usable()) {
+		crypto_sha256_update(desc, padding, padlen);
+		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha256_ssse3_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha256_ssse3_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_ssse3_init,
+	.update		=	sha256_ssse3_update,
+	.final		=	sha256_ssse3_final,
+	.export		=	sha256_ssse3_export,
+	.import		=	sha256_ssse3_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef CONFIG_AS_AVX
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static int __init sha256_ssse3_mod_init(void)
+{
+	/* test for SSSE3 first */
+	if (cpu_has_ssse3)
+		sha256_transform_asm = sha256_transform_ssse3;
+
+#ifdef CONFIG_AS_AVX
+	/* allow AVX/AVX2 to override SSSE3, it's a little faster */
+	if (avx_usable()) {
+#ifdef CONFIG_AS_AVX2
+		if (boot_cpu_has(X86_FEATURE_AVX2))
+			sha256_transform_asm = sha256_transform_rorx;
+		else
+#endif
+			sha256_transform_asm = sha256_transform_avx;
+	}
+#endif
+	/* register the algorithm only if a transform was selected above */
+	if (sha256_transform_asm) {
+#ifdef CONFIG_AS_AVX
+		if (sha256_transform_asm == sha256_transform_avx)
+			pr_info("Using AVX optimized SHA-256 implementation\n");
+#ifdef CONFIG_AS_AVX2
+		else if (sha256_transform_asm == sha256_transform_rorx)
+			pr_info("Using AVX2 optimized SHA-256 implementation\n");
+#endif
+		else
+#endif
+			pr_info("Using SSSE3 optimized SHA-256 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+static void __exit sha256_ssse3_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha256_ssse3_mod_init);
+module_exit(sha256_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha256");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index aed52b2e4a55..8064ef1fedc4 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -470,6 +470,17 @@ config CRYPTO_SHA1_SSSE3
 	  using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
 	  Extensions (AVX), when available.
 
+config CRYPTO_SHA256_SSSE3
+	tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA256
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
+	  Extensions version 1 (AVX1), or Advanced Vector Extensions
+	  version 2 (AVX2) instructions, when available.
+
 config CRYPTO_SHA1_SPARC64
 	tristate "SHA1 digest algorithm (SPARC64)"
 	depends on SPARC64
-- 
cgit v1.2.3-59-g8ed1b


From bf70fa9d9ee07aa175453b19a39b2b9dab602d97 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Tue, 26 Mar 2013 13:59:25 -0700
Subject: crypto: sha512 - Expose generic sha512 routine to be callable from
 other modules

Other SHA512 routines may need to use the generic routine when
FPU is not available.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/sha512_generic.c | 13 +++++++------
 include/crypto/sha.h    |  3 +++
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'crypto')

diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c
index 71fcf361102d..4c5862095679 100644
--- a/crypto/sha512_generic.c
+++ b/crypto/sha512_generic.c
@@ -163,8 +163,8 @@ sha384_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int
-sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len)
 {
 	struct sha512_state *sctx = shash_desc_ctx(desc);
 
@@ -197,6 +197,7 @@ sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha512_update);
 
 static int
 sha512_final(struct shash_desc *desc, u8 *hash)
@@ -215,10 +216,10 @@ sha512_final(struct shash_desc *desc, u8 *hash)
 	/* Pad out to 112 mod 128. */
 	index = sctx->count[0] & 0x7f;
 	pad_len = (index < 112) ? (112 - index) : ((128+112) - index);
-	sha512_update(desc, padding, pad_len);
+	crypto_sha512_update(desc, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha512_update(desc, (const u8 *)bits, sizeof(bits));
+	crypto_sha512_update(desc, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -245,7 +246,7 @@ static int sha384_final(struct shash_desc *desc, u8 *hash)
 static struct shash_alg sha512_algs[2] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_init,
-	.update		=	sha512_update,
+	.update		=	crypto_sha512_update,
 	.final		=	sha512_final,
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{
@@ -257,7 +258,7 @@ static struct shash_alg sha512_algs[2] = { {
 }, {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_init,
-	.update		=	sha512_update,
+	.update		=	crypto_sha512_update,
 	.final		=	sha384_final,
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
index f46ff61e3780..190f8a0e0242 100644
--- a/include/crypto/sha.h
+++ b/include/crypto/sha.h
@@ -89,4 +89,7 @@ extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 
 extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
+
+extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 87de4579f92dbe50e92f33b94f8688793c894571 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Tue, 26 Mar 2013 14:00:02 -0700
Subject: crypto: sha512 - Create module providing optimized SHA512 routines
 using SSSE3, AVX or AVX2 instructions.

Add glue code and config options to create a crypto module that
uses SSSE3/AVX/AVX2 optimized SHA512 x86_64 assembly routines.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile            |   2 +
 arch/x86/crypto/sha512_ssse3_glue.c | 282 ++++++++++++++++++++++++++++++++++++
 crypto/Kconfig                      |  11 ++
 3 files changed, 295 insertions(+)
 create mode 100644 arch/x86/crypto/sha512_ssse3_glue.c

(limited to 'crypto')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9414b91b8f49..03cd7313ad4b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -68,3 +69,4 @@ crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
new file mode 100644
index 000000000000..6cbd8df348d2
--- /dev/null
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -0,0 +1,282 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA512 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha512_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+
+#include <linux/string.h>
+
+asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
+				     u64 rounds);
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
+				     u64 rounds);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
+				     u64 rounds);
+#endif
+
+static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
+
+
+static int sha512_ssse3_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA512_H0;
+	sctx->state[1] = SHA512_H1;
+	sctx->state[2] = SHA512_H2;
+	sctx->state[3] = SHA512_H3;
+	sctx->state[4] = SHA512_H4;
+	sctx->state[5] = SHA512_H5;
+	sctx->state[6] = SHA512_H6;
+	sctx->state[7] = SHA512_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;	/* low word wrapped, carry into high */
+	/* Finish the buffered block; callers pass partial + len >= block size. */
+	if (partial) {
+		done = SHA512_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha512_transform_asm(sctx->buf, sctx->state, 1);
+	}
+	/* Hash all remaining whole blocks straight from the caller's data. */
+	if (len - done >= SHA512_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+		sha512_transform_asm(data + done, sctx->state, (u64) rounds);
+
+		done += rounds * SHA512_BLOCK_SIZE;
+	}
+	/* Stash the tail (less than one block) for the next call. */
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA512_BLOCK_SIZE) {
+		sctx->count[0] += len;
+		if (sctx->count[0] < len)
+			sctx->count[1]++;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha512_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha512_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Pad the message, hash the last block(s), emit the digest, wipe state. */
+static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be64 *dst = (__be64 *)out;
+	__be64 bits[2];
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	/* 128-bit big-endian bit count: merge both halves before swapping */
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
+
+	/* Pad out to 112 mod 128 and append length */
+	index = sctx->count[0] & 0x7f;
+	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+
+	if (!irq_fpu_usable()) {
+		crypto_sha512_update(desc, padding, padlen);
+		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha512_ssse3_update() */
+		if (padlen <= 112) {
+			sctx->count[0] += padlen;
+			if (sctx->count[0] < padlen)
+				sctx->count[1]++;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha512_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha512_ssse3_update(desc, (const u8 *)&bits,
+					sizeof(bits), 112);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.init		=	sha512_ssse3_init,
+	.update		=	sha512_ssse3_update,
+	.final		=	sha512_ssse3_final,
+	.export		=	sha512_ssse3_export,
+	.import		=	sha512_ssse3_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha512",
+		.cra_driver_name =	"sha512-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA512_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef CONFIG_AS_AVX
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static int __init sha512_ssse3_mod_init(void)
+{
+	/* test for SSSE3 first */
+	if (cpu_has_ssse3)
+		sha512_transform_asm = sha512_transform_ssse3;
+
+#ifdef CONFIG_AS_AVX
+	/* allow AVX/AVX2 to override SSSE3, it's a little faster */
+	if (avx_usable()) {
+#ifdef CONFIG_AS_AVX2
+		if (boot_cpu_has(X86_FEATURE_AVX2))
+			sha512_transform_asm = sha512_transform_rorx;
+		else
+#endif
+			sha512_transform_asm = sha512_transform_avx;
+	}
+#endif
+	/* register the algorithm only if a transform was selected above */
+	if (sha512_transform_asm) {
+#ifdef CONFIG_AS_AVX
+		if (sha512_transform_asm == sha512_transform_avx)
+			pr_info("Using AVX optimized SHA-512 implementation\n");
+#ifdef CONFIG_AS_AVX2
+		else if (sha512_transform_asm == sha512_transform_rorx)
+			pr_info("Using AVX2 optimized SHA-512 implementation\n");
+#endif
+		else
+#endif
+			pr_info("Using SSSE3 optimized SHA-512 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+static void __exit sha512_ssse3_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha512_ssse3_mod_init);
+module_exit(sha512_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha512");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 8064ef1fedc4..a654b13ae004 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -481,6 +481,17 @@ config CRYPTO_SHA256_SSSE3
 	  Extensions version 1 (AVX1), or Advanced Vector Extensions
 	  version 2 (AVX2) instructions, when available.
 
+config CRYPTO_SHA512_SSSE3
+	tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA512
+	select CRYPTO_HASH
+	help
+	  SHA-512 secure hash standard (DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
+	  Extensions version 1 (AVX1), or Advanced Vector Extensions
+	  version 2 (AVX2) instructions, when available.
+
 config CRYPTO_SHA1_SPARC64
 	tristate "SHA1 digest algorithm (SPARC64)"
 	depends on SPARC64
-- 
cgit v1.2.3-59-g8ed1b


From 9489667d3e3d39ba452037585e48a89ce44ccbfe Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sun, 7 Apr 2013 16:43:41 +0300
Subject: crypto: gcm - make GMAC work when dst and src are different

The GMAC code assumes that dst==src, which causes problems when trying to add
rfc4543(gcm(aes)) test vectors.

So fix this code to work when source and destination buffer are different.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig |  1 +
 crypto/gcm.c   | 97 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 81 insertions(+), 17 deletions(-)

(limited to 'crypto')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index a654b13ae004..6cc27f111551 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -198,6 +198,7 @@ config CRYPTO_GCM
 	select CRYPTO_CTR
 	select CRYPTO_AEAD
 	select CRYPTO_GHASH
+	select CRYPTO_NULL
 	help
 	  Support for Galois/Counter Mode (GCM) and Galois Message
 	  Authentication Code (GMAC). Required for IPSec.
diff --git a/crypto/gcm.c b/crypto/gcm.c
index 137ad1ec5438..4ff213997fbd 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -37,8 +37,14 @@ struct crypto_rfc4106_ctx {
 	u8 nonce[4];
 };
 
+struct crypto_rfc4543_instance_ctx {
+	struct crypto_aead_spawn aead;
+	struct crypto_skcipher_spawn null;
+};
+
 struct crypto_rfc4543_ctx {
 	struct crypto_aead *child;
+	struct crypto_blkcipher *null;
 	u8 nonce[4];
 };
 
@@ -1094,20 +1100,20 @@ static int crypto_rfc4543_setauthsize(struct crypto_aead *parent,
 }
 
 static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
-						 int enc)
+						 bool enc)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
 	struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
 	struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
 	struct aead_request *subreq = &rctx->subreq;
-	struct scatterlist *dst = req->dst;
+	struct scatterlist *src = req->src;
 	struct scatterlist *cipher = rctx->cipher;
 	struct scatterlist *payload = rctx->payload;
 	struct scatterlist *assoc = rctx->assoc;
 	unsigned int authsize = crypto_aead_authsize(aead);
 	unsigned int assoclen = req->assoclen;
-	struct page *dstp;
-	u8 *vdst;
+	struct page *srcp;
+	u8 *vsrc;
 	u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child),
 			   crypto_aead_alignmask(ctx->child) + 1);
 
@@ -1118,19 +1124,19 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	if (enc)
 		memset(rctx->auth_tag, 0, authsize);
 	else
-		scatterwalk_map_and_copy(rctx->auth_tag, dst,
+		scatterwalk_map_and_copy(rctx->auth_tag, src,
 					 req->cryptlen - authsize,
 					 authsize, 0);
 
 	sg_init_one(cipher, rctx->auth_tag, authsize);
 
 	/* construct the aad */
-	dstp = sg_page(dst);
-	vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset;
+	srcp = sg_page(src);
+	vsrc = PageHighMem(srcp) ? NULL : page_address(srcp) + src->offset;
 
 	sg_init_table(payload, 2);
 	sg_set_buf(payload, req->iv, 8);
-	scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2);
+	scatterwalk_crypto_chain(payload, src, vsrc == req->iv + 8, 2);
 	assoclen += 8 + req->cryptlen - (enc ? 0 : authsize);
 
 	sg_init_table(assoc, 2);
@@ -1147,6 +1153,19 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	return subreq;
 }
 
+static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int authsize = crypto_aead_authsize(aead);
+	unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize);
+	struct blkcipher_desc desc = {
+		.tfm = ctx->null,
+	};
+
+	return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes);
+}
+
 static int crypto_rfc4543_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -1154,7 +1173,13 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
 	struct aead_request *subreq;
 	int err;
 
-	subreq = crypto_rfc4543_crypt(req, 1);
+	if (req->src != req->dst) {
+		err = crypto_rfc4543_copy_src_to_dst(req, true);
+		if (err)
+			return err;
+	}
+
+	subreq = crypto_rfc4543_crypt(req, true);
 	err = crypto_aead_encrypt(subreq);
 	if (err)
 		return err;
@@ -1167,7 +1192,15 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
 
 static int crypto_rfc4543_decrypt(struct aead_request *req)
 {
-	req = crypto_rfc4543_crypt(req, 0);
+	int err;
+
+	if (req->src != req->dst) {
+		err = crypto_rfc4543_copy_src_to_dst(req, false);
+		if (err)
+			return err;
+	}
+
+	req = crypto_rfc4543_crypt(req, false);
 
 	return crypto_aead_decrypt(req);
 }
@@ -1175,16 +1208,25 @@ static int crypto_rfc4543_decrypt(struct aead_request *req)
 static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_instance *inst = (void *)tfm->__crt_alg;
-	struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_rfc4543_instance_ctx *ictx = crypto_instance_ctx(inst);
+	struct crypto_aead_spawn *spawn = &ictx->aead;
 	struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct crypto_aead *aead;
+	struct crypto_blkcipher *null;
 	unsigned long align;
+	int err = 0;
 
 	aead = crypto_spawn_aead(spawn);
 	if (IS_ERR(aead))
 		return PTR_ERR(aead);
 
+	null = crypto_spawn_blkcipher(&ictx->null.base);
+	err = PTR_ERR(null);
+	if (IS_ERR(null))
+		goto err_free_aead;
+
 	ctx->child = aead;
+	ctx->null = null;
 
 	align = crypto_aead_alignmask(aead);
 	align &= ~(crypto_tfm_ctx_alignment() - 1);
@@ -1194,6 +1236,10 @@ static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
 				align + 16;
 
 	return 0;
+
+err_free_aead:
+	crypto_free_aead(aead);
+	return err;
 }
 
 static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
@@ -1201,6 +1247,7 @@ static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
 	struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	crypto_free_aead(ctx->child);
+	crypto_free_blkcipher(ctx->null);
 }
 
 static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
@@ -1209,6 +1256,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	struct crypto_instance *inst;
 	struct crypto_aead_spawn *spawn;
 	struct crypto_alg *alg;
+	struct crypto_rfc4543_instance_ctx *ctx;
 	const char *ccm_name;
 	int err;
 
@@ -1223,11 +1271,12 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	if (IS_ERR(ccm_name))
 		return ERR_CAST(ccm_name);
 
-	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
 	if (!inst)
 		return ERR_PTR(-ENOMEM);
 
-	spawn = crypto_instance_ctx(inst);
+	ctx = crypto_instance_ctx(inst);
+	spawn = &ctx->aead;
 	crypto_set_aead_spawn(spawn, inst);
 	err = crypto_grab_aead(spawn, ccm_name, 0,
 			       crypto_requires_sync(algt->type, algt->mask));
@@ -1236,15 +1285,23 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 
 	alg = crypto_aead_spawn_alg(spawn);
 
+	crypto_set_skcipher_spawn(&ctx->null, inst);
+	err = crypto_grab_skcipher(&ctx->null, "ecb(cipher_null)", 0,
+				   CRYPTO_ALG_ASYNC);
+	if (err)
+		goto out_drop_alg;
+
+	crypto_skcipher_spawn_alg(&ctx->null);
+
 	err = -EINVAL;
 
 	/* We only support 16-byte blocks. */
 	if (alg->cra_aead.ivsize != 16)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 	/* Not a stream cipher? */
 	if (alg->cra_blocksize != 1)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 	err = -ENAMETOOLONG;
 	if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME,
@@ -1252,7 +1309,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	    snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 		     "rfc4543(%s)", alg->cra_driver_name) >=
 	    CRYPTO_MAX_ALG_NAME)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD;
 	inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC;
@@ -1279,6 +1336,8 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 out:
 	return inst;
 
+out_drop_ecbnull:
+	crypto_drop_skcipher(&ctx->null);
 out_drop_alg:
 	crypto_drop_aead(spawn);
 out_free_inst:
@@ -1289,7 +1348,11 @@ out_free_inst:
 
 static void crypto_rfc4543_free(struct crypto_instance *inst)
 {
-	crypto_drop_spawn(crypto_instance_ctx(inst));
+	struct crypto_rfc4543_instance_ctx *ctx = crypto_instance_ctx(inst);
+
+	crypto_drop_aead(&ctx->aead);
+	crypto_drop_skcipher(&ctx->null);
+
 	kfree(inst);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d733ac90f9fe8ac284e523f9920b507555b12f6d Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sun, 7 Apr 2013 16:43:46 +0300
Subject: crypto: gcm - fix rfc4543 to handle async crypto correctly

If the gcm cipher used by rfc4543 does not complete the request immediately,
the authentication tag is not copied to the destination buffer. This patch
adds the correct async logic for this case.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/gcm.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'crypto')

diff --git a/crypto/gcm.c b/crypto/gcm.c
index 4ff213997fbd..b0d3cb12334d 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -1099,6 +1099,21 @@ static int crypto_rfc4543_setauthsize(struct crypto_aead *parent,
 	return crypto_aead_setauthsize(ctx->child, authsize);
 }
 
+static void crypto_rfc4543_done(struct crypto_async_request *areq, int err)
+{
+	struct aead_request *req = areq->data;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
+
+	if (!err) {
+		scatterwalk_map_and_copy(rctx->auth_tag, req->dst,
+					 req->cryptlen,
+					 crypto_aead_authsize(aead), 1);
+	}
+
+	aead_request_complete(req, err);
+}
+
 static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 						 bool enc)
 {
@@ -1145,8 +1160,8 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	scatterwalk_crypto_chain(assoc, payload, 0, 2);
 
 	aead_request_set_tfm(subreq, ctx->child);
-	aead_request_set_callback(subreq, req->base.flags, req->base.complete,
-				  req->base.data);
+	aead_request_set_callback(subreq, req->base.flags, crypto_rfc4543_done,
+				  req);
 	aead_request_set_crypt(subreq, cipher, cipher, enc ? 0 : authsize, iv);
 	aead_request_set_assoc(subreq, assoc, assoclen);
 
-- 
cgit v1.2.3-59-g8ed1b


From e9b7441a9926b8091c01431f10231c9ceac52c8f Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sun, 7 Apr 2013 16:43:51 +0300
Subject: crypto: testmgr - add AES GMAC test vectors

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c  |  4 +++
 crypto/testmgr.c | 17 +++++++++--
 crypto/testmgr.h | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 2 deletions(-)

(limited to 'crypto')

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 87ef7d66bc20..6b911ef8df8b 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1225,6 +1225,10 @@ static int do_test(int m)
 		ret += tcrypt_test("rfc4106(gcm(aes))");
 		break;
 
+	case 152:
+		ret += tcrypt_test("rfc4543(gcm(aes))");
+		break;
+
 	case 200:
 		test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index efd8b20e13dc..442ddb46bbe5 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2696,8 +2696,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 			}
 		}
 	}, {
-
-
 		.alg = "rfc4309(ccm(aes))",
 		.test = alg_test_aead,
 		.fips_allowed = 1,
@@ -2713,6 +2711,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "rfc4543(gcm(aes))",
+		.test = alg_test_aead,
+		.suite = {
+			.aead = {
+				.enc = {
+					.vecs = aes_gcm_rfc4543_enc_tv_template,
+					.count = AES_GCM_4543_ENC_TEST_VECTORS
+				},
+				.dec = {
+					.vecs = aes_gcm_rfc4543_dec_tv_template,
+					.count = AES_GCM_4543_DEC_TEST_VECTORS
+				},
+			}
+		}
 	}, {
 		.alg = "rmd128",
 		.test = alg_test_hash,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index b5721e0b979c..92db37dda757 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -12680,6 +12680,8 @@ static struct cipher_testvec cast6_xts_dec_tv_template[] = {
 #define AES_GCM_DEC_TEST_VECTORS 8
 #define AES_GCM_4106_ENC_TEST_VECTORS 7
 #define AES_GCM_4106_DEC_TEST_VECTORS 7
+#define AES_GCM_4543_ENC_TEST_VECTORS 1
+#define AES_GCM_4543_DEC_TEST_VECTORS 2
 #define AES_CCM_ENC_TEST_VECTORS 7
 #define AES_CCM_DEC_TEST_VECTORS 7
 #define AES_CCM_4309_ENC_TEST_VECTORS 7
@@ -18193,6 +18195,93 @@ static struct aead_testvec aes_gcm_rfc4106_dec_tv_template[] = {
 	}
 };
 
+static struct aead_testvec aes_gcm_rfc4543_enc_tv_template[] = {
+	{ /* From draft-mcgrew-gcm-test-01 */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.ilen	= 52,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\xe4\x09\x9a\xaa",
+		.rlen	= 68,
+	}
+};
+
+static struct aead_testvec aes_gcm_rfc4543_dec_tv_template[] = {
+	{ /* From draft-mcgrew-gcm-test-01 */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\xe4\x09\x9a\xaa",
+		.ilen	= 68,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.rlen	= 52,
+	}, { /* nearly same as previous, but should fail */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\x00\x00\x00\x00",
+		.ilen	= 68,
+		.novrfy = 1,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.rlen	= 52,
+	},
+};
+
 static struct aead_testvec aes_ccm_enc_tv_template[] = {
 	{ /* From RFC 3610 */
 		.key	= "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
-- 
cgit v1.2.3-59-g8ed1b


From e448370d7377f064c2fef55f72e9b45184bf0926 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sun, 7 Apr 2013 16:43:56 +0300
Subject: crypto: testmgr - add empty test vectors for null ciphers

Without these, kernel log shows:
[    5.984881] alg: No test for cipher_null (cipher_null-generic)
[    5.985096] alg: No test for ecb(cipher_null) (ecb-cipher_null)
[    5.985170] alg: No test for compress_null (compress_null-generic)
[    5.985297] alg: No test for digest_null (digest_null-generic)

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'crypto')

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 442ddb46bbe5..f37e544dddf0 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1912,6 +1912,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "compress_null",
+		.test = alg_test_null,
 	}, {
 		.alg = "crc32c",
 		.test = alg_test_crc32c,
@@ -2126,6 +2129,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "digest_null",
+		.test = alg_test_null,
 	}, {
 		.alg = "ecb(__aes-aesni)",
 		.test = alg_test_null,
@@ -2236,6 +2242,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "ecb(cipher_null)",
+		.test = alg_test_null,
 	}, {
 		.alg = "ecb(des)",
 		.test = alg_test_skcipher,
-- 
cgit v1.2.3-59-g8ed1b


From 93b5e86a6d13c5dec18c6611933fb38d7d80f0d2 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Mon, 8 Apr 2013 10:48:44 +0300
Subject: crypto: add CMAC support to CryptoAPI

Patch adds support for NIST recommended block cipher mode CMAC to CryptoAPI.

This work is based on Tom St Denis' earlier patch,
 http://marc.info/?l=linux-crypto-vger&m=135877306305466&w=2

Cc: Tom St Denis <tstdenis@elliptictech.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig   |  11 ++
 crypto/Makefile  |   1 +
 crypto/cmac.c    | 315 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 crypto/tcrypt.c  |  11 +-
 crypto/testmgr.c |  18 ++++
 crypto/testmgr.h | 125 ++++++++++++++++++++++
 6 files changed, 480 insertions(+), 1 deletion(-)
 create mode 100644 crypto/cmac.c

(limited to 'crypto')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 6cc27f111551..c1142f31a00c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -283,6 +283,17 @@ config CRYPTO_XTS
 
 comment "Hash modes"
 
+config CRYPTO_CMAC
+	tristate "CMAC support"
+	select CRYPTO_HASH
+	select CRYPTO_MANAGER
+	help
+	  Cipher-based Message Authentication Code (CMAC) specified by
+	  The National Institute of Standards and Technology (NIST).
+
+	  https://tools.ietf.org/html/rfc4493
+	  http://csrc.nist.gov/publications/nistpubs/800-38B/SP_800-38B.pdf
+
 config CRYPTO_HMAC
 	tristate "HMAC support"
 	select CRYPTO_HASH
diff --git a/crypto/Makefile b/crypto/Makefile
index be1a1bebbb86..a8e9b0fefbe9 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -32,6 +32,7 @@ cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
+obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
 obj-$(CONFIG_CRYPTO_XCBC) += xcbc.o
diff --git a/crypto/cmac.c b/crypto/cmac.c
new file mode 100644
index 000000000000..50880cf17fad
--- /dev/null
+++ b/crypto/cmac.c
@@ -0,0 +1,315 @@
+/*
+ * CMAC: Cipher Block Mode for Authentication
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on work by:
+ *  Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com>
+ * Based on crypto/xcbc.c:
+ *  Copyright © 2006 USAGI/WIDE Project,
+ *   Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/*
+ * +------------------------
+ * | <parent tfm>
+ * +------------------------
+ * | cmac_tfm_ctx
+ * +------------------------
+ * | consts (block size * 2)
+ * +------------------------
+ */
+struct cmac_tfm_ctx {
+	struct crypto_cipher *child;
+	u8 ctx[];
+};
+
+/*
+ * +------------------------
+ * | <shash desc>
+ * +------------------------
+ * | cmac_desc_ctx
+ * +------------------------
+ * | odds (block size)
+ * +------------------------
+ * | prev (block size)
+ * +------------------------
+ */
+struct cmac_desc_ctx {
+	unsigned int len;
+	u8 ctx[];
+};
+
+static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
+				     const u8 *inkey, unsigned int keylen)
+{
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
+	unsigned int bs = crypto_shash_blocksize(parent);
+	__be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u64 _const[2];
+	int i, err = 0;
+	u8 msb_mask, gfmask;
+
+	err = crypto_cipher_setkey(ctx->child, inkey, keylen);
+	if (err)
+		return err;
+
+	/* encrypt the zero block */
+	memset(consts, 0, bs);
+	crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);
+
+	switch (bs) {
+	case 16:
+		gfmask = 0x87;
+		_const[0] = be64_to_cpu(consts[1]);
+		_const[1] = be64_to_cpu(consts[0]);
+
+		/* gf(2^128) multiply zero-ciphertext with u and u^2 */
+		for (i = 0; i < 4; i += 2) {
+			msb_mask = ((s64)_const[1] >> 63) & gfmask;
+			_const[1] = (_const[1] << 1) | (_const[0] >> 63);
+			_const[0] = (_const[0] << 1) ^ msb_mask;
+
+			consts[i + 0] = cpu_to_be64(_const[1]);
+			consts[i + 1] = cpu_to_be64(_const[0]);
+		}
+
+		break;
+	case 8:
+		gfmask = 0x1B;
+		_const[0] = be64_to_cpu(consts[0]);
+
+		/* gf(2^64) multiply zero-ciphertext with u and u^2 */
+		for (i = 0; i < 2; i++) {
+			msb_mask = ((s64)_const[0] >> 63) & gfmask;
+			_const[0] = (_const[0] << 1) ^ msb_mask;
+
+			consts[i] = cpu_to_be64(_const[0]);
+		}
+
+		break;
+	}
+
+	return 0;
+}
+
+static int crypto_cmac_digest_init(struct shash_desc *pdesc)
+{
+	unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	int bs = crypto_shash_blocksize(pdesc->tfm);
+	u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs;
+
+	ctx->len = 0;
+	memset(prev, 0, bs);
+
+	return 0;
+}
+
+static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
+				     unsigned int len)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_blocksize(parent);
+	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u8 *prev = odds + bs;
+
+	/* new data still fits in the partial-block buffer: just stash it */
+	if ((ctx->len + len) <= bs) {
+		memcpy(odds + ctx->len, p, len);
+		ctx->len += len;
+		return 0;
+	}
+
+	/* top up odds to a full block and fold it in: prev = E(prev ^ odds) */
+	memcpy(odds + ctx->len, p, bs - ctx->len);
+	len -= bs - ctx->len;
+	p += bs - ctx->len;
+
+	crypto_xor(prev, odds, bs);
+	crypto_cipher_encrypt_one(tfm, prev, prev);
+
+	/* odds has been consumed */
+	ctx->len = 0;
+
+	/* fold in whole input blocks; '>' keeps the last block for final() */
+	while (len > bs) {
+		crypto_xor(prev, p, bs);
+		crypto_cipher_encrypt_one(tfm, prev, prev);
+		p += bs;
+		len -= bs;
+	}
+
+	/* buffer the remaining tail (at most one block) for the next call */
+	if (len) {
+		memcpy(odds, p, len);
+		ctx->len = len;
+	}
+
+	return 0;
+}
+
+static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_blocksize(parent);
+	u8 *consts = PTR_ALIGN((void *)tctx->ctx, alignmask + 1);
+	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u8 *prev = odds + bs;
+	unsigned int offset = 0;
+
+	if (ctx->len != bs) {
+		unsigned int rlen;
+		u8 *p = odds + ctx->len;
+
+		*p = 0x80;
+		p++;
+
+		rlen = bs - ctx->len - 1;
+		if (rlen)
+			memset(p, 0, rlen);
+
+		offset += bs;
+	}
+
+	crypto_xor(prev, odds, bs);
+	crypto_xor(prev, consts + offset, bs);
+
+	crypto_cipher_encrypt_one(tfm, out, prev);
+
+	return 0;
+}
+
+static int cmac_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_cipher *cipher;
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cipher = crypto_spawn_cipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+
+	return 0;
+}
+
+static void cmac_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_cipher(ctx->child);
+}
+
+static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct shash_instance *inst;
+	struct crypto_alg *alg;
+	unsigned long alignmask;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
+	if (err)
+		return err;
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	/*
+	 * Only 8- and 16-byte block ciphers are supported.
+	 */
+	if (alg->cra_blocksize != 16 && alg->cra_blocksize != 8) {
+		err = -EINVAL;
+		goto out_put_alg;
+	}
+
+	inst = shash_alloc_instance("cmac", alg);
+	err = PTR_ERR(inst);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	err = crypto_init_spawn(shash_instance_ctx(inst), alg,
+				shash_crypto_instance(inst),
+				CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_free_inst;
+
+	alignmask = alg->cra_alignmask | (sizeof(long) - 1);
+	inst->alg.base.cra_alignmask = alignmask;
+	inst->alg.base.cra_priority = alg->cra_priority;
+	inst->alg.base.cra_blocksize = alg->cra_blocksize;
+
+	inst->alg.digestsize = alg->cra_blocksize;
+	inst->alg.descsize =
+		ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment())
+		+ (alignmask & ~(crypto_tfm_ctx_alignment() - 1))
+		+ alg->cra_blocksize * 2;
+
+	inst->alg.base.cra_ctxsize =
+		ALIGN(sizeof(struct cmac_tfm_ctx), alignmask + 1)
+		+ alg->cra_blocksize * 2;
+
+	inst->alg.base.cra_init = cmac_init_tfm;
+	inst->alg.base.cra_exit = cmac_exit_tfm;
+
+	inst->alg.init = crypto_cmac_digest_init;
+	inst->alg.update = crypto_cmac_digest_update;
+	inst->alg.final = crypto_cmac_digest_final;
+	inst->alg.setkey = crypto_cmac_digest_setkey;
+
+	err = shash_register_instance(tmpl, inst);
+	if (err) {
+out_free_inst:
+		shash_free_instance(shash_crypto_instance(inst));
+	}
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
+static struct crypto_template crypto_cmac_tmpl = {
+	.name = "cmac",
+	.create = cmac_create,
+	.free = shash_free_instance,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_cmac_module_init(void)
+{
+	return crypto_register_template(&crypto_cmac_tmpl);
+}
+
+static void __exit crypto_cmac_module_exit(void)
+{
+	crypto_unregister_template(&crypto_cmac_tmpl);
+}
+
+module_init(crypto_cmac_module_init);
+module_exit(crypto_cmac_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CMAC keyed hash algorithm");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 6b911ef8df8b..24ea7dffd21e 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1095,7 +1095,6 @@ static int do_test(int m)
 		break;
 
 	case 28:
-
 		ret += tcrypt_test("tgr160");
 		break;
 
@@ -1118,6 +1117,7 @@ static int do_test(int m)
 		ret += tcrypt_test("lrw(camellia)");
 		ret += tcrypt_test("xts(camellia)");
 		break;
+
 	case 33:
 		ret += tcrypt_test("sha224");
 		break;
@@ -1213,6 +1213,7 @@ static int do_test(int m)
 	case 109:
 		ret += tcrypt_test("vmac(aes)");
 		break;
+
 	case 110:
 		ret += tcrypt_test("hmac(crc32)");
 		break;
@@ -1229,6 +1230,14 @@ static int do_test(int m)
 		ret += tcrypt_test("rfc4543(gcm(aes))");
 		break;
 
+	case 153:
+		ret += tcrypt_test("cmac(aes)");
+		break;
+
+	case 154:
+		ret += tcrypt_test("cmac(des3_ede)");
+		break;
+
 	case 200:
 		test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index f37e544dddf0..380708477b35 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1912,6 +1912,24 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "cmac(aes)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = {
+				.vecs = aes_cmac128_tv_template,
+				.count = CMAC_AES_TEST_VECTORS
+			}
+		}
+	}, {
+		.alg = "cmac(des3_ede)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = {
+				.vecs = des3_ede_cmac64_tv_template,
+				.count = CMAC_DES3_EDE_TEST_VECTORS
+			}
+		}
 	}, {
 		.alg = "compress_null",
 		.test = alg_test_null,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 92db37dda757..d50366092b1a 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1639,6 +1639,131 @@ static struct hash_testvec hmac_sha256_tv_template[] = {
 	},
 };
 
+#define CMAC_AES_TEST_VECTORS 6
+
+static struct hash_testvec aes_cmac128_tv_template[] = {
+	{ /* From NIST Special Publication 800-38B, AES-128 */
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= zeroed_string,
+		.digest		= "\xbb\x1d\x69\x29\xe9\x59\x37\x28"
+				  "\x7f\xa3\x7d\x12\x9b\x75\x67\x46",
+		.psize		= 0,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+		.digest		= "\x07\x0a\x16\xb4\x6b\x4d\x41\x44"
+				  "\xf7\x9b\xdd\x9d\xd0\x4a\x28\x7c",
+		.psize		= 16,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11",
+		.digest		= "\xdf\xa6\x67\x47\xde\x9a\xe6\x30"
+				  "\x30\xca\x32\x61\x14\x97\xc8\x27",
+		.psize		= 40,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.digest		= "\x51\xf0\xbe\xbf\x7e\x3b\x9d\x92"
+				  "\xfc\x49\x74\x17\x79\x36\x3c\xfe",
+		.psize		= 64,
+		.ksize		= 16,
+	}, { /* From NIST Special Publication 800-38B, AES-256 */
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= zeroed_string,
+		.digest		= "\x02\x89\x62\xf6\x1b\x7b\xf8\x9e"
+				  "\xfc\x6b\x55\x1f\x46\x67\xd9\x83",
+		.psize		= 0,
+		.ksize		= 32,
+	}, {
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.digest		= "\xe1\x99\x21\x90\x54\x9f\x6e\xd5"
+				  "\x69\x6a\x2c\x05\x6c\x31\x54\x10",
+		.psize		= 64,
+		.ksize		= 32,
+	}
+};
+
+#define CMAC_DES3_EDE_TEST_VECTORS 4
+
+static struct hash_testvec des3_ede_cmac64_tv_template[] = {
+/*
+ * From NIST Special Publication 800-38B, Three Key TDEA
+ * Corrected test vectors from:
+ *  http://csrc.nist.gov/publications/nistpubs/800-38B/Updated_CMAC_Examples.pdf
+ */
+	{
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= zeroed_string,
+		.digest		= "\xb7\xa6\x88\xe1\x22\xff\xaf\x95",
+		.psize		= 0,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96",
+		.digest		= "\x8e\x8f\x29\x31\x36\x28\x37\x97",
+		.psize		= 8,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57",
+		.digest		= "\x74\x3d\xdb\xe0\xce\x2d\xc2\xed",
+		.psize		= 20,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51",
+		.digest		= "\x33\xe6\xb1\x09\x24\x00\xea\xe5",
+		.psize		= 32,
+		.ksize		= 24,
+	}
+};
+
 #define XCBC_AES_TEST_VECTORS 6
 
 static struct hash_testvec aes_xcbc128_tv_template[] = {
-- 
cgit v1.2.3-59-g8ed1b


From c456a9cd1ac4eae9147ffd7ac4fb77ca0fa980c6 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Mon, 8 Apr 2013 21:51:16 +0300
Subject: crypto: aesni_intel - add more optimized XTS mode for x86-64

Add more optimized XTS code for aesni_intel in 64-bit mode, for smaller stack
usage and a speed boost.

tcrypt results, with Intel i5-2450M:
256-bit key
        enc     dec
16B     0.98x   0.99x
64B     0.64x   0.63x
256B    1.29x   1.32x
1024B   1.54x   1.58x
8192B   1.57x   1.60x

512-bit key
        enc     dec
16B     0.98x   0.99x
64B     0.60x   0.59x
256B    1.24x   1.25x
1024B   1.39x   1.42x
8192B   1.38x   1.42x

I chose not to optimize for block sizes smaller than 256 bytes, since XTS is
practically always used with data blocks of size 512 bytes. This is why
performance is reduced in tcrypt for 64-byte blocks.

Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_asm.S  | 117 +++++++++++++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_glue.c |  80 +++++++++++++++++++++++++
 crypto/Kconfig                     |   1 +
 3 files changed, 198 insertions(+)

(limited to 'crypto')

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 04b797767b9e..62fe22cd4cba 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -34,6 +34,10 @@
 
 #ifdef __x86_64__
 .data
+.align 16
+.Lgf128mul_x_ble_mask:
+	.octa 0x00000000000000010000000000000087
+
 POLY:   .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -105,6 +109,8 @@ enc:        .octa 0x2
 #define CTR	%xmm11
 #define INC	%xmm12
 
+#define GF128MUL_MASK %xmm10
+
 #ifdef __x86_64__
 #define AREG	%rax
 #define KEYP	%rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
 .Lctr_enc_just_ret:
 	ret
 ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble:		internal ABI
+ *	Multiply in GF(2^128) for XTS IVs
+ * input:
+ *	IV:	current IV
+ *	GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ *	IV:	next IV
+ * changed:
+ *	CTR:	== temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+	pshufd $0x13, IV, CTR; \
+	paddq IV, IV; \
+	psrad $31, CTR; \
+	pand GF128MUL_MASK, CTR; \
+	pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *			 bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+	cmpb $0, %cl
+	movl $0, %ecx
+	movl $240, %r10d
+	leaq _aesni_enc4, %r11
+	leaq _aesni_dec4, %rax
+	cmovel %r10d, %ecx
+	cmoveq %rax, %r11
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	addq %rcx, KEYP
+
+	movdqa IV, STATE1
+	pxor 0x00(INP), STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x10(INP), STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x20(INP), STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x30(INP), STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call *%r11
+
+	pxor 0x00(OUTP), STATE1
+	movdqu STATE1, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE1
+	pxor 0x40(INP), STATE1
+	movdqu IV, 0x40(OUTP)
+
+	pxor 0x10(OUTP), STATE2
+	movdqu STATE2, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x50(INP), STATE2
+	movdqu IV, 0x50(OUTP)
+
+	pxor 0x20(OUTP), STATE3
+	movdqu STATE3, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x60(INP), STATE3
+	movdqu IV, 0x60(OUTP)
+
+	pxor 0x30(OUTP), STATE4
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x70(INP), STATE4
+	movdqu IV, 0x70(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movups IV, (IVP)
+
+	call *%r11
+
+	pxor 0x40(OUTP), STATE1
+	movdqu STATE1, 0x40(OUTP)
+
+	pxor 0x50(OUTP), STATE2
+	movdqu STATE2, 0x50(OUTP)
+
+	pxor 0x60(OUTP), STATE3
+	movdqu STATE3, 0x60(OUTP)
+
+	pxor 0x70(OUTP), STATE4
+	movdqu STATE4, 0x70(OUTP)
+
+	ret
+ENDPROC(aesni_xts_crypt8)
+
 #endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a0795da22c02..f80e668785c0 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -39,6 +39,9 @@
 #include <crypto/internal/aead.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
 
 #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
 #define HAS_PCBC
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+				 const u8 *in, bool enc, u8 *iv);
+
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * u8 *out, Ciphertext output. Encrypt in-place is allowed.
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
 	aesni_enc(ctx, out, in);
 }
 
+#ifdef CONFIG_X86_64
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+	} }
+};
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+#else
+
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
+#endif
+
 #ifdef CONFIG_X86_64
 static int rfc4106_init(struct crypto_tfm *tfm)
 {
diff --git a/crypto/Kconfig b/crypto/Kconfig
index c1142f31a00c..808ac374b21f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -678,6 +678,7 @@ config CRYPTO_AES_NI_INTEL
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
 	select CRYPTO_ALGAPI
+	select CRYPTO_GLUE_HELPER if 64BIT
 	select CRYPTO_LRW
 	select CRYPTO_XTS
 	help
-- 
cgit v1.2.3-59-g8ed1b


From 7643a11a35c178c01083829d1a5b43e22b688751 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Wed, 10 Apr 2013 18:39:20 +0300
Subject: crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86

The Kconfig symbol for the glue helper module is CRYPTO_GLUE_HELPER_X86,
but a recent change to aesni_intel selected CRYPTO_GLUE_HELPER instead.
This patch corrects the symbol name.

Cc: kbuild-all@01.org
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'crypto')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 808ac374b21f..0e7a23723b45 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -678,7 +678,7 @@ config CRYPTO_AES_NI_INTEL
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
 	select CRYPTO_ALGAPI
-	select CRYPTO_GLUE_HELPER if 64BIT
+	select CRYPTO_GLUE_HELPER_X86 if 64BIT
 	select CRYPTO_LRW
 	select CRYPTO_XTS
 	help
-- 
cgit v1.2.3-59-g8ed1b


From 23a836e87d6cd85dc456dde103dfc69c1743f95a Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sat, 13 Apr 2013 13:46:35 +0300
Subject: crypto: testmgr - extend camellia test-vectors for
 camellia-aesni/avx2

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.h | 1100 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 1062 insertions(+), 38 deletions(-)

(limited to 'crypto')

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index d50366092b1a..dc2c054d4bbc 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -20997,8 +20997,72 @@ static struct cipher_testvec camellia_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\xED\xCD\xDB\xB8\x68\xCE\xBD\xEA"
 			  "\x9D\x9D\xCD\x9F\x4F\xFC\x4D\xB7"
 			  "\xA5\xFF\x6F\x43\x0F\xBA\x32\x04"
@@ -21060,11 +21124,75 @@ static struct cipher_testvec camellia_enc_tv_template[] = {
 			  "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3"
 			  "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44"
 			  "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4"
-			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB",
-		.rlen	= 496,
+			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB"
+			  "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD"
+			  "\xF7\x90\x68\xCF\xC9\x11\x52\x3E"
+			  "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A"
+			  "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E"
+			  "\xED\x28\x39\xE9\x63\xED\x41\x70"
+			  "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB"
+			  "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60"
+			  "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B"
+			  "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5"
+			  "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E"
+			  "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23"
+			  "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9"
+			  "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7"
+			  "\xCA\xA8\xA2\x9E\x13\x87\x57\x92"
+			  "\x64\x7C\x85\x0B\xB3\x29\x37\xD8"
+			  "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF"
+			  "\x2E\x45\x83\xB6\xD8\x54\x00\x89"
+			  "\xF6\xBC\x3A\x7A\x88\x58\x51\xED"
+			  "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42"
+			  "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC"
+			  "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1"
+			  "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91"
+			  "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E"
+			  "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15"
+			  "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86"
+			  "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4"
+			  "\x7F\x95\x10\xF7\xAB\x3F\x92\x23"
+			  "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43"
+			  "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2"
+			  "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56"
+			  "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4"
+			  "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F"
+			  "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4"
+			  "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2"
+			  "\x39\x25\x55\x20\x5A\xA6\x02\x9F"
+			  "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B"
+			  "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF"
+			  "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E"
+			  "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78"
+			  "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA"
+			  "\xED\x87\x98\xAC\xFE\x48\x97\x6D"
+			  "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14"
+			  "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C"
+			  "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55"
+			  "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55"
+			  "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C"
+			  "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC"
+			  "\xF0\x33\x52\x29\x70\xF9\x5C\xE9"
+			  "\xAC\x1F\xB5\x73\x56\x66\x54\xAF"
+			  "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3"
+			  "\xAE\x47\xB6\x69\x86\xE9\x01\x31"
+			  "\x83\x18\x3D\xF4\x74\x7B\xF9\x42"
+			  "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6"
+			  "\x2B\x20\x63\xDA\x49\x65\x5E\x8B"
+			  "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34"
+			  "\xD3\x52\xFC\x68\x00\x43\x1B\x37"
+			  "\x31\x93\x51\x1C\x63\x97\x70\xB0"
+			  "\x99\x78\x83\x13\xFD\xCF\x53\x81"
+			  "\x36\x46\xB5\x42\x52\x2F\x32\xEB"
+			  "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC"
+			  "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A"
+			  "\xAE\xEF\x3E\x82\x12\x0B\x74\x72"
+			  "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55"
+			  "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21169,8 +21297,72 @@ static struct cipher_testvec camellia_dec_tv_template[] = {
 			  "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3"
 			  "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44"
 			  "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4"
-			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB",
-		.ilen	= 496,
+			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB"
+			  "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD"
+			  "\xF7\x90\x68\xCF\xC9\x11\x52\x3E"
+			  "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A"
+			  "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E"
+			  "\xED\x28\x39\xE9\x63\xED\x41\x70"
+			  "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB"
+			  "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60"
+			  "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B"
+			  "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5"
+			  "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E"
+			  "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23"
+			  "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9"
+			  "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7"
+			  "\xCA\xA8\xA2\x9E\x13\x87\x57\x92"
+			  "\x64\x7C\x85\x0B\xB3\x29\x37\xD8"
+			  "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF"
+			  "\x2E\x45\x83\xB6\xD8\x54\x00\x89"
+			  "\xF6\xBC\x3A\x7A\x88\x58\x51\xED"
+			  "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42"
+			  "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC"
+			  "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1"
+			  "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91"
+			  "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E"
+			  "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15"
+			  "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86"
+			  "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4"
+			  "\x7F\x95\x10\xF7\xAB\x3F\x92\x23"
+			  "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43"
+			  "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2"
+			  "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56"
+			  "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4"
+			  "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F"
+			  "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4"
+			  "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2"
+			  "\x39\x25\x55\x20\x5A\xA6\x02\x9F"
+			  "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B"
+			  "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF"
+			  "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E"
+			  "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78"
+			  "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA"
+			  "\xED\x87\x98\xAC\xFE\x48\x97\x6D"
+			  "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14"
+			  "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C"
+			  "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55"
+			  "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55"
+			  "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C"
+			  "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC"
+			  "\xF0\x33\x52\x29\x70\xF9\x5C\xE9"
+			  "\xAC\x1F\xB5\x73\x56\x66\x54\xAF"
+			  "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3"
+			  "\xAE\x47\xB6\x69\x86\xE9\x01\x31"
+			  "\x83\x18\x3D\xF4\x74\x7B\xF9\x42"
+			  "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6"
+			  "\x2B\x20\x63\xDA\x49\x65\x5E\x8B"
+			  "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34"
+			  "\xD3\x52\xFC\x68\x00\x43\x1B\x37"
+			  "\x31\x93\x51\x1C\x63\x97\x70\xB0"
+			  "\x99\x78\x83\x13\xFD\xCF\x53\x81"
+			  "\x36\x46\xB5\x42\x52\x2F\x32\xEB"
+			  "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC"
+			  "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A"
+			  "\xAE\xEF\x3E\x82\x12\x0B\x74\x72"
+			  "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55"
+			  "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -21232,11 +21424,75 @@ static struct cipher_testvec camellia_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21337,8 +21593,72 @@ static struct cipher_testvec camellia_cbc_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\xCD\x3E\x2A\x3B\x3E\x94\xC5\x77"
 			  "\xBA\xBB\x5B\xB1\xDE\x7B\xA4\x40"
 			  "\x88\x39\xE3\xFD\x94\x4B\x25\x58"
@@ -21400,11 +21720,75 @@ static struct cipher_testvec camellia_cbc_enc_tv_template[] = {
 			  "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA"
 			  "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97"
 			  "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36"
-			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20",
-		.rlen	= 496,
+			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20"
+			  "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5"
+			  "\x21\x41\x56\x72\x13\xE1\x86\x07"
+			  "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18"
+			  "\xFE\x94\x2D\x9F\xE0\x72\x18\x65"
+			  "\xB2\xA5\x63\x48\xB4\x13\x22\xF7"
+			  "\x25\xF1\x80\xA8\x7F\x54\x86\x7B"
+			  "\x39\xAE\x95\x0C\x09\x32\x22\x2D"
+			  "\x4D\x73\x39\x0C\x09\x2C\x7C\x10"
+			  "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F"
+			  "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14"
+			  "\x65\xEE\x93\x54\xD0\x66\x15\x3C"
+			  "\x4C\x68\xFD\x64\x0F\xF9\x10\x39"
+			  "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2"
+			  "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F"
+			  "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97"
+			  "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9"
+			  "\xBE\x31\x31\x96\x8B\x40\x18\x75"
+			  "\x11\x75\x14\x32\xA5\x2D\x1B\x6B"
+			  "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4"
+			  "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6"
+			  "\x71\xB0\x12\x57\x89\x0D\xC0\x2B"
+			  "\x9F\x12\x6A\x04\x67\xF1\x95\x31"
+			  "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC"
+			  "\x09\xB0\x43\x96\x4A\x64\x80\x40"
+			  "\xB9\x72\x19\xDD\x70\x42\xFA\xB1"
+			  "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C"
+			  "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB"
+			  "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6"
+			  "\x33\x0B\xF3\xD5\x01\x38\x74\xA4"
+			  "\x67\x1E\x75\x68\xC3\xAD\x76\xE9"
+			  "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A"
+			  "\x5F\xC9\x18\x94\x4B\x86\x66\xFC"
+			  "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9"
+			  "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8"
+			  "\x43\xBC\x76\x75\xBF\x6C\x05\xB3"
+			  "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61"
+			  "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0"
+			  "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12"
+			  "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E"
+			  "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3"
+			  "\x7B\xDC\xCE\x15\x97\x27\xB2\x65"
+			  "\xBC\x0E\x47\xAB\x55\x13\x53\xAB"
+			  "\x0E\x34\x55\x02\x5F\x27\xC5\x89"
+			  "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE"
+			  "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C"
+			  "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8"
+			  "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2"
+			  "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6"
+			  "\x8D\x4D\x83\x9A\xED\x29\x4E\x14"
+			  "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A"
+			  "\x72\x22\xD5\x92\x38\xF1\xA1\x86"
+			  "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3"
+			  "\x80\x56\xC3\xD7\x65\xE1\xB3\x86"
+			  "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06"
+			  "\x01\xED\xF8\x29\x91\x19\x5C\x9A"
+			  "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F"
+			  "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72"
+			  "\x80\xE7\x29\xDC\x23\x55\x98\x54"
+			  "\xB1\xFF\x3E\x95\x56\xA8\x78\x78"
+			  "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93"
+			  "\x30\x6E\x7E\x51\xBB\x42\x5F\x03"
+			  "\x43\x94\x23\x7E\xEE\xF0\xA5\x79"
+			  "\x55\x01\xD4\x58\xB2\xF2\x85\x49"
+			  "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21505,8 +21889,72 @@ static struct cipher_testvec camellia_cbc_dec_tv_template[] = {
 			  "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA"
 			  "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97"
 			  "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36"
-			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20",
-		.ilen	= 496,
+			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20"
+			  "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5"
+			  "\x21\x41\x56\x72\x13\xE1\x86\x07"
+			  "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18"
+			  "\xFE\x94\x2D\x9F\xE0\x72\x18\x65"
+			  "\xB2\xA5\x63\x48\xB4\x13\x22\xF7"
+			  "\x25\xF1\x80\xA8\x7F\x54\x86\x7B"
+			  "\x39\xAE\x95\x0C\x09\x32\x22\x2D"
+			  "\x4D\x73\x39\x0C\x09\x2C\x7C\x10"
+			  "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F"
+			  "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14"
+			  "\x65\xEE\x93\x54\xD0\x66\x15\x3C"
+			  "\x4C\x68\xFD\x64\x0F\xF9\x10\x39"
+			  "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2"
+			  "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F"
+			  "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97"
+			  "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9"
+			  "\xBE\x31\x31\x96\x8B\x40\x18\x75"
+			  "\x11\x75\x14\x32\xA5\x2D\x1B\x6B"
+			  "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4"
+			  "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6"
+			  "\x71\xB0\x12\x57\x89\x0D\xC0\x2B"
+			  "\x9F\x12\x6A\x04\x67\xF1\x95\x31"
+			  "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC"
+			  "\x09\xB0\x43\x96\x4A\x64\x80\x40"
+			  "\xB9\x72\x19\xDD\x70\x42\xFA\xB1"
+			  "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C"
+			  "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB"
+			  "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6"
+			  "\x33\x0B\xF3\xD5\x01\x38\x74\xA4"
+			  "\x67\x1E\x75\x68\xC3\xAD\x76\xE9"
+			  "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A"
+			  "\x5F\xC9\x18\x94\x4B\x86\x66\xFC"
+			  "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9"
+			  "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8"
+			  "\x43\xBC\x76\x75\xBF\x6C\x05\xB3"
+			  "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61"
+			  "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0"
+			  "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12"
+			  "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E"
+			  "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3"
+			  "\x7B\xDC\xCE\x15\x97\x27\xB2\x65"
+			  "\xBC\x0E\x47\xAB\x55\x13\x53\xAB"
+			  "\x0E\x34\x55\x02\x5F\x27\xC5\x89"
+			  "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE"
+			  "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C"
+			  "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8"
+			  "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2"
+			  "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6"
+			  "\x8D\x4D\x83\x9A\xED\x29\x4E\x14"
+			  "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A"
+			  "\x72\x22\xD5\x92\x38\xF1\xA1\x86"
+			  "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3"
+			  "\x80\x56\xC3\xD7\x65\xE1\xB3\x86"
+			  "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06"
+			  "\x01\xED\xF8\x29\x91\x19\x5C\x9A"
+			  "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F"
+			  "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72"
+			  "\x80\xE7\x29\xDC\x23\x55\x98\x54"
+			  "\xB1\xFF\x3E\x95\x56\xA8\x78\x78"
+			  "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93"
+			  "\x30\x6E\x7E\x51\xBB\x42\x5F\x03"
+			  "\x43\x94\x23\x7E\xEE\xF0\xA5\x79"
+			  "\x55\x01\xD4\x58\xB2\xF2\x85\x49"
+			  "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -21568,11 +22016,75 @@ static struct cipher_testvec camellia_cbc_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21781,8 +22293,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
-			  "\x2B\xC2\x59",
-		.ilen	= 499,
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D"
+			  "\xE4\x7B\x12",
+		.ilen	= 1011,
 		.result	= "\xF3\x06\x3A\x84\xCD\xBA\x8E\x11"
 			  "\xB7\x74\x6F\x5C\x97\xFB\x36\xFE"
 			  "\xDE\x71\x58\xD4\x15\xD1\xC1\xA4"
@@ -21845,11 +22421,75 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48"
 			  "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0"
 			  "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D"
-			  "\x93\x11\x1C",
-		.rlen	= 499,
+			  "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90"
+			  "\xE5\x41\x4A\xE2\x3C\x45\x29\x35"
+			  "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32"
+			  "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7"
+			  "\x1D\x8C\x02\x72\x68\x09\xE9\xB6"
+			  "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4"
+			  "\x93\x74\xA4\xCE\x20\x23\xBF\x48"
+			  "\xA5\xDE\x1B\xFA\x40\x69\x31\x98"
+			  "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5"
+			  "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA"
+			  "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4"
+			  "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F"
+			  "\xBA\x83\x34\x3C\x42\xCC\x7D\x28"
+			  "\x29\x63\x4F\xD8\x02\x17\xC5\x07"
+			  "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09"
+			  "\x81\x45\x18\xED\xE4\xCB\x42\x3B"
+			  "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD"
+			  "\x92\xE5\x02\x97\x96\xCE\xAD\xEC"
+			  "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC"
+			  "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E"
+			  "\xCC\xF8\x5F\xDD\x65\x50\x99\x69"
+			  "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F"
+			  "\x05\x93\xB2\xE6\x59\xC8\x28\xFE"
+			  "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F"
+			  "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09"
+			  "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C"
+			  "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74"
+			  "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8"
+			  "\x81\xF5\xD5\x8A\x04\x23\x51\xAC"
+			  "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B"
+			  "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA"
+			  "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB"
+			  "\x41\x1A\x67\xA6\x40\x82\x70\x8C"
+			  "\x19\x79\x08\xA4\x51\x20\x7D\xC9"
+			  "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D"
+			  "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08"
+			  "\x00\x70\x12\x56\x56\x50\xAD\x14"
+			  "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48"
+			  "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E"
+			  "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D"
+			  "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85"
+			  "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64"
+			  "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F"
+			  "\x91\x76\xE8\x15\x66\x34\x9F\xF6"
+			  "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1"
+			  "\x38\x46\x47\xC7\x9B\xCA\xE9\x42"
+			  "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3"
+			  "\x16\xA0\xED\x42\xBE\xB5\x06\x19"
+			  "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E"
+			  "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31"
+			  "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC"
+			  "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8"
+			  "\x94\x5C\x5E\x37\x18\xAD\xED\x8B"
+			  "\x44\x86\xCA\x5E\x07\xB7\x70\x8D"
+			  "\x40\x48\x19\x73\x7C\x78\x64\x0B"
+			  "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1"
+			  "\x6B\x2C\x84\x10\x45\x42\x2E\xC3"
+			  "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46"
+			  "\x74\x28\x9D\x05\x30\x20\x62\x41"
+			  "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26"
+			  "\xDF\x58\x51\xED\xFA\xDC\x87\x79"
+			  "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA"
+			  "\x45\xE3\x35\x0D\x69\x91\x54\x1C"
+			  "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C"
+			  "\xF1\x6B\xD9",
+		.rlen	= 1011,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 499 - 16, 16 },
+		.tap	= { 1011 - 16, 16 },
 	}, { /* Generated with Crypto++ */
 		.key	= "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
 			  "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
@@ -21919,8 +22559,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\x85\x79\x6C\x8B\x2B\x6D\x14\xF9"
 			  "\xA6\x83\xB6\x80\x5B\x3A\xF3\x7E"
 			  "\x30\x29\xEB\x1F\xDC\x19\x5F\xEB"
@@ -21982,8 +22686,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22"
 			  "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E"
 			  "\x12\xA8\x01\x64\x16\x0B\x26\x5A"
-			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C",
-		.rlen	= 496,
+			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C"
+			  "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6"
+			  "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3"
+			  "\x3F\x23\x69\x63\x56\x96\x45\xD6"
+			  "\x74\x23\x1D\x5C\x63\xCC\xD8\x78"
+			  "\x16\xE2\x9C\xD2\x80\x02\xF2\x28"
+			  "\x69\x2F\xC4\xA8\x15\x15\x24\x3B"
+			  "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1"
+			  "\x03\x58\x1B\x33\x77\x74\x1F\xB4"
+			  "\x07\x86\xF2\x21\xB7\x41\xAE\xBF"
+			  "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4"
+			  "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D"
+			  "\xF8\x04\xBB\x6D\x62\x33\x87\x26"
+			  "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09"
+			  "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E"
+			  "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A"
+			  "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF"
+			  "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C"
+			  "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11"
+			  "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD"
+			  "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB"
+			  "\xC3\x14\x84\x72\xFD\x41\xDC\xBD"
+			  "\x75\xBE\xA8\xE5\x16\x48\x64\x39"
+			  "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D"
+			  "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D"
+			  "\x0B\x29\x10\x15\x0E\x13\x3B\xAC"
+			  "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02"
+			  "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21"
+			  "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0"
+			  "\xF8\x09\xAF\xE4\x31\x26\x92\x2F"
+			  "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34"
+			  "\xBE\x50\xB1\x02\x22\x96\xE3\x40"
+			  "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0"
+			  "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B"
+			  "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36"
+			  "\xD2\x0A\x32\x06\x97\x34\xB9\xF4"
+			  "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A"
+			  "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D"
+			  "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E"
+			  "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7"
+			  "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E"
+			  "\x2C\x07\xD2\x43\x5F\x57\x93\xCA"
+			  "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1"
+			  "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E"
+			  "\x68\xC6\x47\xB9\x76\xCC\x65\x5F"
+			  "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37"
+			  "\x5F\x41\xED\x64\x6C\xAD\x5F\xED"
+			  "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F"
+			  "\xC2\xC7\xED\x18\x43\xE1\x20\x86"
+			  "\x5D\xBC\x30\x70\x22\xA1\xDC\x53"
+			  "\x10\x3A\x8D\x47\x82\xCD\x7F\x59"
+			  "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07"
+			  "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5"
+			  "\xED\x47\x83\xBC\x5F\x62\x84\xDA"
+			  "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8"
+			  "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A"
+			  "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6"
+			  "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE"
+			  "\x37\x9E\x29\xF9\x2A\x18\x73\x0D"
+			  "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA"
+			  "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56"
+			  "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C"
+			  "\xC5\x9B\x03\x70\x29\x2A\x49\x09"
+			  "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71"
+			  "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36",
+		.rlen	= 1008,
 	},
 };
 
@@ -22192,8 +22960,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48"
 			  "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0"
 			  "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D"
-			  "\x93\x11\x1C",
-		.ilen	= 499,
+			  "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90"
+			  "\xE5\x41\x4A\xE2\x3C\x45\x29\x35"
+			  "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32"
+			  "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7"
+			  "\x1D\x8C\x02\x72\x68\x09\xE9\xB6"
+			  "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4"
+			  "\x93\x74\xA4\xCE\x20\x23\xBF\x48"
+			  "\xA5\xDE\x1B\xFA\x40\x69\x31\x98"
+			  "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5"
+			  "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA"
+			  "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4"
+			  "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F"
+			  "\xBA\x83\x34\x3C\x42\xCC\x7D\x28"
+			  "\x29\x63\x4F\xD8\x02\x17\xC5\x07"
+			  "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09"
+			  "\x81\x45\x18\xED\xE4\xCB\x42\x3B"
+			  "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD"
+			  "\x92\xE5\x02\x97\x96\xCE\xAD\xEC"
+			  "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC"
+			  "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E"
+			  "\xCC\xF8\x5F\xDD\x65\x50\x99\x69"
+			  "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F"
+			  "\x05\x93\xB2\xE6\x59\xC8\x28\xFE"
+			  "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F"
+			  "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09"
+			  "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C"
+			  "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74"
+			  "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8"
+			  "\x81\xF5\xD5\x8A\x04\x23\x51\xAC"
+			  "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B"
+			  "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA"
+			  "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB"
+			  "\x41\x1A\x67\xA6\x40\x82\x70\x8C"
+			  "\x19\x79\x08\xA4\x51\x20\x7D\xC9"
+			  "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D"
+			  "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08"
+			  "\x00\x70\x12\x56\x56\x50\xAD\x14"
+			  "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48"
+			  "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E"
+			  "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D"
+			  "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85"
+			  "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64"
+			  "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F"
+			  "\x91\x76\xE8\x15\x66\x34\x9F\xF6"
+			  "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1"
+			  "\x38\x46\x47\xC7\x9B\xCA\xE9\x42"
+			  "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3"
+			  "\x16\xA0\xED\x42\xBE\xB5\x06\x19"
+			  "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E"
+			  "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31"
+			  "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC"
+			  "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8"
+			  "\x94\x5C\x5E\x37\x18\xAD\xED\x8B"
+			  "\x44\x86\xCA\x5E\x07\xB7\x70\x8D"
+			  "\x40\x48\x19\x73\x7C\x78\x64\x0B"
+			  "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1"
+			  "\x6B\x2C\x84\x10\x45\x42\x2E\xC3"
+			  "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46"
+			  "\x74\x28\x9D\x05\x30\x20\x62\x41"
+			  "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26"
+			  "\xDF\x58\x51\xED\xFA\xDC\x87\x79"
+			  "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA"
+			  "\x45\xE3\x35\x0D\x69\x91\x54\x1C"
+			  "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C"
+			  "\xF1\x6B\xD9",
+		.ilen	= 1011,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -22256,11 +23088,75 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
-			  "\x2B\xC2\x59",
-		.rlen	= 499,
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D"
+			  "\xE4\x7B\x12",
+		.rlen	= 1011,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 499 - 16, 16 },
+		.tap	= { 1011 - 16, 16 },
 	}, { /* Generated with Crypto++ */
 		.key	= "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
 			  "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
@@ -22330,8 +23226,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22"
 			  "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E"
 			  "\x12\xA8\x01\x64\x16\x0B\x26\x5A"
-			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C",
-		.ilen	= 496,
+			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C"
+			  "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6"
+			  "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3"
+			  "\x3F\x23\x69\x63\x56\x96\x45\xD6"
+			  "\x74\x23\x1D\x5C\x63\xCC\xD8\x78"
+			  "\x16\xE2\x9C\xD2\x80\x02\xF2\x28"
+			  "\x69\x2F\xC4\xA8\x15\x15\x24\x3B"
+			  "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1"
+			  "\x03\x58\x1B\x33\x77\x74\x1F\xB4"
+			  "\x07\x86\xF2\x21\xB7\x41\xAE\xBF"
+			  "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4"
+			  "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D"
+			  "\xF8\x04\xBB\x6D\x62\x33\x87\x26"
+			  "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09"
+			  "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E"
+			  "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A"
+			  "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF"
+			  "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C"
+			  "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11"
+			  "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD"
+			  "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB"
+			  "\xC3\x14\x84\x72\xFD\x41\xDC\xBD"
+			  "\x75\xBE\xA8\xE5\x16\x48\x64\x39"
+			  "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D"
+			  "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D"
+			  "\x0B\x29\x10\x15\x0E\x13\x3B\xAC"
+			  "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02"
+			  "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21"
+			  "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0"
+			  "\xF8\x09\xAF\xE4\x31\x26\x92\x2F"
+			  "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34"
+			  "\xBE\x50\xB1\x02\x22\x96\xE3\x40"
+			  "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0"
+			  "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B"
+			  "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36"
+			  "\xD2\x0A\x32\x06\x97\x34\xB9\xF4"
+			  "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A"
+			  "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D"
+			  "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E"
+			  "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7"
+			  "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E"
+			  "\x2C\x07\xD2\x43\x5F\x57\x93\xCA"
+			  "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1"
+			  "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E"
+			  "\x68\xC6\x47\xB9\x76\xCC\x65\x5F"
+			  "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37"
+			  "\x5F\x41\xED\x64\x6C\xAD\x5F\xED"
+			  "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F"
+			  "\xC2\xC7\xED\x18\x43\xE1\x20\x86"
+			  "\x5D\xBC\x30\x70\x22\xA1\xDC\x53"
+			  "\x10\x3A\x8D\x47\x82\xCD\x7F\x59"
+			  "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07"
+			  "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5"
+			  "\xED\x47\x83\xBC\x5F\x62\x84\xDA"
+			  "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8"
+			  "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A"
+			  "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6"
+			  "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE"
+			  "\x37\x9E\x29\xF9\x2A\x18\x73\x0D"
+			  "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA"
+			  "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56"
+			  "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C"
+			  "\xC5\x9B\x03\x70\x29\x2A\x49\x09"
+			  "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71"
+			  "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -22393,8 +23353,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 	},
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From ad8b7c3e92868dd86c54d9d5321000bbb4096f0d Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sat, 13 Apr 2013 13:46:40 +0300
Subject: crypto: tcrypt - add async cipher speed tests for blowfish

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'crypto')

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 24ea7dffd21e..66d254ce0d11 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1768,6 +1768,21 @@ static int do_test(int m)
 				   speed_template_32_64);
 		break;
 
+	case 509:
+		test_acipher_speed("ecb(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ecb(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("cbc(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("cbc(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ctr(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ctr(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		break;
+
 	case 1000:
 		test_available();
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 604880107010a1e5794552d184cd5471ea31b973 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sat, 13 Apr 2013 13:46:45 +0300
Subject: crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher

This patch adds an AVX2/x86-64 implementation of the Blowfish cipher that
requires 32 parallel blocks (256 bytes) of input. Table look-ups are performed
with the vpgatherdd instruction, which takes its indices directly from vector
registers, and thus should be faster than in earlier implementations.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile               |  11 +
 arch/x86/crypto/blowfish-avx2-asm_64.S | 449 +++++++++++++++++++++++++
 arch/x86/crypto/blowfish_avx2_glue.c   | 585 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/blowfish_glue.c        |  32 +-
 arch/x86/include/asm/cpufeature.h      |   1 +
 arch/x86/include/asm/crypto/blowfish.h |  43 +++
 crypto/Kconfig                         |  18 +
 crypto/testmgr.c                       |  12 +
 8 files changed, 1127 insertions(+), 24 deletions(-)
 create mode 100644 arch/x86/crypto/blowfish-avx2-asm_64.S
 create mode 100644 arch/x86/crypto/blowfish_avx2_glue.c
 create mode 100644 arch/x86/include/asm/crypto/blowfish.h

(limited to 'crypto')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 03cd7313ad4b..28464ef6fa52 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -3,6 +3,8 @@
 #
 
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+					$(comma)4)$(comma)%ymm2,yes,no)
 
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -38,6 +40,11 @@ ifeq ($(avx_supported),yes)
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 endif
 
+# These modules require assembler to support AVX2.
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+endif
+
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
 salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
@@ -62,6 +69,10 @@ ifeq ($(avx_supported),yes)
 				serpent_avx_glue.o
 endif
 
+ifeq ($(avx2_supported),yes)
+	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+endif
+
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/blowfish-avx2-asm_64.S b/arch/x86/crypto/blowfish-avx2-asm_64.S
new file mode 100644
index 000000000000..784452e0d05d
--- /dev/null
+++ b/arch/x86/crypto/blowfish-avx2-asm_64.S
@@ -0,0 +1,449 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-avx2-asm_64.S"
+
+.data
+.align 32
+
+.Lprefetch_mask: /* dwords 0..7 * 64; not referenced elsewhere in this file */
+.long 0*64
+.long 1*64
+.long 2*64
+.long 3*64
+.long 4*64
+.long 5*64
+.long 6*64
+.long 7*64
+
+.Lbswap32_mask: /* vpshufb control: reverse the bytes within each 32-bit word */
+.long 0x00010203
+.long 0x04050607
+.long 0x08090a0b
+.long 0x0c0d0e0f
+
+.Lbswap128_mask: /* vpshufb control: reverse all 16 bytes of a 128-bit lane */
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask: /* vpshufb control: byte-reversed low qword copied into both qwords */
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+/* byte offsets into the context at CTX: 18 32-bit P words, then the S-box tables */
+#define p	0
+#define s0	((16 + 2) * 4)
+#define s1	((16 + 2 + (1 * 256)) * 4)
+#define s2	((16 + 2 + (2 * 256)) * 4)
+#define s3	((16 + 2 + (3 * 256)) * 4)
+
+/* general-purpose registers: CTX = context pointer, RS0..RS3 = S-box base addresses */
+#define CTX	%rdi
+#define RIO	 %rdx
+
+#define RS0	%rax
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+
+#define RLOOP	%r11
+#define RLOOPd	%r11d
+
+#define RXr0	%ymm8 /* RXr0..3 / RXl0..3: the two 32-bit block halves, 8 blocks per register */
+#define RXr1	%ymm9
+#define RXr2	%ymm10
+#define RXr3	%ymm11
+#define RXl0	%ymm12
+#define RXl1	%ymm13
+#define RXl2	%ymm14
+#define RXl3	%ymm15
+
+/* scratch registers: RT* receive gathered words, RIDX* hold S-box indices */
+#define RT0	%ymm0
+#define RT0x	%xmm0
+#define RT1	%ymm1
+#define RT1x	%xmm1
+#define RIDX0	%ymm2
+#define RIDX1	%ymm3
+#define RIDX1x	%xmm3
+#define RIDX2	%ymm4
+#define RIDX3	%ymm5
+
+/* all-ones register ('-1'): vpgatherdd mask, re-created inside F() after use */
+#define RNOT	%ymm6
+
+/* 0x000000ff in every dword, built as (-1 >> 24) by init_round_constants() */
+#define RBYTE	%ymm7
+
+/*
+ * 32-way AVX2 blowfish.  F(xl, xr): xr ^= ((S0[a] + S1[b]) ^ S2[c]) + S3[d]
+ * per dword, with a..d the bytes of xl from most to least significant. */
+#define F(xl, xr) \
+	vpsrld $24, xl, RIDX0; \
+	vpsrld $16, xl, RIDX1; \
+	vpsrld $8, xl, RIDX2; \
+	vpand RBYTE, RIDX1, RIDX1; \
+	vpand RBYTE, RIDX2, RIDX2; \
+	vpand RBYTE, xl, RIDX3; \
+	\
+	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; /* refill the mask consumed by the gather */ \
+	vpcmpeqd RIDX0, RIDX0, RIDX0; /* index is done: reuse as mask for the S2 gather */ \
+	\
+	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
+	vpcmpeqd RIDX1, RIDX1, RIDX1; /* index is done: reuse as mask for the S3 gather */ \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
+	vpxor RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
+	vpcmpeqd RNOT, RNOT, RNOT; /* leave RNOT all-ones for the next F() */ \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpxor RT0, xr, xr;
+
+#define add_roundkey(xl, nmem) \
+	vpbroadcastd nmem, RT0; \
+	vpxor RT0, xl ## 0, xl ## 0; \
+	vpxor RT0, xl ## 1, xl ## 1; \
+	vpxor RT0, xl ## 2, xl ## 2; \
+	vpxor RT0, xl ## 3, xl ## 3; /* xl0..xl3 ^= the dword at nmem, broadcast to all lanes */
+
+#define round_enc() \
+	add_roundkey(RXr, p(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3); /* two rounds: keys P[RLOOP] then P[RLOOP+1], roles of RXl/RXr alternate */
+
+#define round_dec() \
+	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3); /* two rounds: keys P[RLOOP+2] then P[RLOOP+1], i.e. descending order */
+
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	leaq s0(CTX), RS0; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+	vpsrld $24, RNOT, RBYTE; /* sets RNOT = all-ones, RS0..3 = S-box bases, RBYTE = 0xff per dword */
+
+#define transpose_2x2(x0, x1, t0) \
+	vpunpckldq x0, x1, t0; \
+	vpunpckhdq x0, x1, x1; \
+	\
+	vpunpcklqdq t0, x1, x0; \
+	vpunpckhqdq t0, x1, x1; /* per 128-bit lane: even dwords of both inputs -> x0, odd dwords -> x1 */
+
+#define read_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0); /* byte-swap every dword, then split block halves into xl / xr */
+
+#define write_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0); \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3; /* same steps as read_block() in the opposite order */
+
+.align 8
+__blowfish_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..3, RXr0..3: plaintext
+	 * output:
+	 *	RXl0..3, RXr0..3: ciphertext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $1, RLOOPd; /* RLOOP indexes P[]: round_enc() uses P[RLOOP], P[RLOOP+1] */
+	add_roundkey(RXl, p+4*(0)(CTX)); /* P[0] */
+
+.align 4
+.L__enc_loop:
+	round_enc();
+
+	leal 2(RLOOPd), RLOOPd;
+	cmpl $17, RLOOPd; /* RLOOP = 1, 3, ..., 15: eight passes of two rounds each */
+	jne .L__enc_loop;
+
+	add_roundkey(RXr, p+4*(17)(CTX)); /* P[17] */
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_enc_blk32)
+
+.align 8
+__blowfish_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..3, RXr0..3: ciphertext
+	 * output:
+	 *	RXl0..3, RXr0..3: plaintext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $14, RLOOPd; /* round_dec() uses P[RLOOP+2], P[RLOOP+1], so this starts at P[16] */
+	add_roundkey(RXl, p+4*(17)(CTX)); /* P[17] */
+
+.align 4
+.L__dec_loop:
+	round_dec();
+
+	addl $-2, RLOOPd;
+	jns .L__dec_loop; /* RLOOP = 14, 12, ..., 0: eight passes of two rounds each */
+
+	add_roundkey(RXr, p+4*(0)(CTX)); /* P[0] */
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_dec_blk32)
+
+ENTRY(blowfish_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 * 32 bytes are written)
+	 *	%rdx: src (8 * 32 bytes are read)
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_enc_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi); /* result comes back with RXl/RXr swapped, so RXr is stored first */
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_enc_32way)
+
+ENTRY(blowfish_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 * 32 bytes are written)
+	 *	%rdx: src (8 * 32 bytes are read)
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi); /* result comes back with RXl/RXr swapped, so RXr is stored first */
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_dec_32way)
+
+ENTRY(blowfish_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 * 32 bytes are written)
+	 *	%rdx: src (8 * 32 bytes are read; re-read after decryption for chaining)
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	/* CBC chaining: xor block i with src block i-1; block 0 is xored with zero here */
+	vmovq (%rdx), RT0x;
+	vpshufd $0x4f, RT0x, RT0x; /* move src block 0 into the high qword, low qword = 0 */
+	vinserti128 $1, 8(%rdx), RT0, RT0; /* RT0 = { 0, src[0], src[1], src[2] } */
+	vpxor RT0, RXr0, RXr0;
+	vpxor 0*32+24(%rdx), RXl0, RXl0; /* +24 bytes = src shifted back by one 8-byte block */
+	vpxor 1*32+24(%rdx), RXr1, RXr1;
+	vpxor 2*32+24(%rdx), RXl1, RXl1;
+	vpxor 3*32+24(%rdx), RXr2, RXr2;
+	vpxor 4*32+24(%rdx), RXl2, RXl2;
+	vpxor 5*32+24(%rdx), RXr3, RXr3;
+	vpxor 6*32+24(%rdx), RXl3, RXl3;
+
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_cbc_dec_32way)
+
+ENTRY(blowfish_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit); advanced by 32 and written back
+	 */
+
+	vzeroupper;
+
+	vpcmpeqd RT0, RT0, RT0;
+	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
+
+	vpcmpeqd RT1x, RT1x, RT1x;
+	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
+	vpxor RIDX0, RIDX0, RIDX0;
+	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
+
+	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
+
+	vpcmpeqd RT1, RT1, RT1;
+	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
+	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
+
+	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
+	vbroadcasti128 .Lbswap128_mask, RIDX1;
+
+	/* load IV and byteswap */
+	vmovq (%rcx), RT1x;
+	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
+	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
+
+	/* construct IVs: subtracting the negative constants adds 0..3, then 4 per step */
+	vpsubq RT0, RT1, RT1;		/* a: le1, b: le0, c: le3, d: le2 */
+	vpshufb RIDX1, RT1, RXl0;	/* a: be0, b: be1, c: be2, d: be3 */
+	vpsubq RIDX2, RT1, RT1;		/* le5, le4, le7, le6 */
+	vpshufb RIDX1, RT1, RXr0;	/* be4, be5, be6, be7 */
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl1;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr1;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl2;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr2;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl3;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr3;
+
+	/* store last IV: the counter value that follows the 32 just generated */
+	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
+	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
+	vmovq RT1x, (%rcx);
+
+	call __blowfish_enc_blk32;
+
+	/* dst = src ^ encrypted counter blocks (results come back RXl/RXr swapped) */
+	vpxor 0*32(%rdx), RXr0, RXr0;
+	vpxor 1*32(%rdx), RXl0, RXl0;
+	vpxor 2*32(%rdx), RXr1, RXr1;
+	vpxor 3*32(%rdx), RXl1, RXl1;
+	vpxor 4*32(%rdx), RXr2, RXr2;
+	vpxor 5*32(%rdx), RXl2, RXl2;
+	vpxor 6*32(%rdx), RXr3, RXr3;
+	vpxor 7*32(%rdx), RXl3, RXl3;
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ctr_32way)
diff --git a/arch/x86/crypto/blowfish_avx2_glue.c b/arch/x86/crypto/blowfish_avx2_glue.c
new file mode 100644
index 000000000000..4417e9aea78d
--- /dev/null
+++ b/arch/x86/crypto/blowfish_avx2_glue.c
@@ -0,0 +1,585 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/blowfish.h>
+#include <crypto/cryptd.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/blowfish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <crypto/scatterwalk.h>
+
+#define BF_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2 assembler routines; each call handles BF_AVX2_PARALLEL_BLOCKS blocks */
+asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+				   __be64 *iv);
+
+static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	if (fpu_enabled)
+		return true; /* a previous chunk already called kernel_fpu_begin() */
+
+	/* The FPU is only used once a chunk holds a full 32-block AVX2
+	 * batch, so defer kernel_fpu_begin() until such a chunk shows up.
+	 */
+	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
+		return false;
+
+	kernel_fpu_begin();
+	return true; /* callers keep this and later hand it to bf_fpu_end() */
+}
+
+static inline void bf_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled) /* only undo a kernel_fpu_begin() that bf_fpu_begin() reported */
+		kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     bool enc)
+{
+	bool fpu_enabled = false;
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; /* NOTE(review): presumably because the FPU may stay enabled across walk steps - confirm */
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+
+		/* Widest path first: 32 blocks per call via the AVX2 routines */
+		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
+				else
+					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
+
+				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
+				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
+				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Then BF_PARALLEL_BLOCKS blocks per call via the 4-way routines */
+		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					blowfish_enc_blk_4way(ctx, wdst, wsrc);
+				else
+					blowfish_dec_blk_4way(ctx, wdst, wsrc);
+
+				wsrc += bsize * BF_PARALLEL_BLOCKS;
+				wdst += bsize * BF_PARALLEL_BLOCKS;
+				nbytes -= bsize * BF_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Remaining whole blocks, one at a time */
+		do {
+			if (enc)
+				blowfish_enc_blk(ctx, wdst, wsrc);
+			else
+				blowfish_dec_blk(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes); /* nbytes = bytes left unprocessed in this chunk */
+	}
+
+	bf_fpu_end(fpu_enabled);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, true); /* enc = true */
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, false); /* enc = false */
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 *iv = (u64 *)walk->iv;
+
+	do {
+		*dst = *src ^ *iv; /* chain: xor with the previous ciphertext block (or the IV) */
+		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); /* encrypt in place in dst */
+		iv = dst; /* this ciphertext block feeds the next one */
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	*(u64 *)walk->iv = *iv; /* carry the last ciphertext block into the next chunk */
+	return nbytes; /* bytes of this chunk left unprocessed */
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) { /* no bf_fpu_begin() here: only blowfish_enc_blk() is used */
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 last_iv;
+	int i;
+
+	/* Point at the last block; the chunk is processed from back to front. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src; /* saved before decryption so it survives when dst aliases src */
+
+	/* 32 blocks per call; the asm chains blocks 1..31 of each batch itself */
+	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+		do {
+			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
+			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
+			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
+
+			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1); /* first block of the batch: xor with the preceding ciphertext block */
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* BF_PARALLEL_BLOCKS blocks per call; chaining is done here in C */
+	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+		u64 ivs[BF_PARALLEL_BLOCKS - 1];
+
+		do {
+			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
+			src -= BF_PARALLEL_BLOCKS - 1;
+			dst -= BF_PARALLEL_BLOCKS - 1;
+
+			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
+				ivs[i] = src[i]; /* keep ciphertext copies before dst is written */
+
+			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
+
+			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
+				dst[i + 1] ^= ivs[i];
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Remaining whole blocks, one at a time, still back to front */
+	for (;;) {
+		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		*dst ^= *(src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	*dst ^= *(u64 *)walk->iv; /* dst is now the first block of the chunk: xor with the incoming IV */
+	*(u64 *)walk->iv = last_iv; /* last ciphertext block becomes the IV for the next chunk */
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; /* NOTE(review): presumably because the FPU may stay enabled across walk steps - confirm */
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); /* enabled lazily, on the first chunk of >= 32 blocks */
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+	return err;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *ctrblk = walk->iv;
+	u8 keystream[BF_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes; /* ctr_crypt() only calls this with less than one block left */
+
+	blowfish_enc_blk(ctx, keystream, ctrblk); /* keystream block from the current counter */
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes); /* only the nbytes actually present are written */
+
+	crypto_inc(ctrblk, BF_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	int i;
+
+	/* 32 blocks per call; the asm reads and advances walk->iv itself */
+	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+		do {
+			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
+					   (__be64 *)walk->iv);
+
+			src += BF_AVX2_PARALLEL_BLOCKS;
+			dst += BF_AVX2_PARALLEL_BLOCKS;
+			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* BF_PARALLEL_BLOCKS blocks per call; counter kept in CPU byte order meanwhile */
+	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
+		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+
+		do {
+			/* copy src to dst (if distinct) and build the big-endian counter blocks */
+			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
+				if (dst != src)
+					dst[i] = src[i];
+
+				ctrblocks[i] = cpu_to_be64(ctrblk++);
+			}
+
+			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
+						  (u8 *)ctrblocks);
+
+			src += BF_PARALLEL_BLOCKS;
+			dst += BF_PARALLEL_BLOCKS;
+			nbytes -= bsize * BF_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+		*(__be64 *)walk->iv = cpu_to_be64(ctrblk); /* write the advanced counter back */
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Remaining whole blocks, one at a time */
+	do {
+		u64 ctrblk;
+
+		if (dst != src)
+			*dst = *src;
+
+		ctrblk = *(u64 *)walk->iv; /* snapshot of the counter, still big endian */
+		be64_add_cpu((__be64 *)walk->iv, 1);
+
+		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); /* presumably dst ^= E(ctrblk); xor variant is defined outside this file */
+
+		src += 1;
+		dst += 1;
+	} while ((nbytes -= bsize) >= bsize);
+
+done:
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; /* NOTE(review): presumably because the FPU may stay enabled across walk steps - confirm */
+
+	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { /* whole blocks only */
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) { /* trailing partial block, handled after the FPU section ends */
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg bf_algs[6] = { { /* [0..2]: blkcipher entries, priority 0; [3..5]: ablkcipher entries, priority 400 */
+	.cra_name		= "__ecb-blowfish-avx2",
+	.cra_driver_name	= "__driver-ecb-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-blowfish-avx2",
+	.cra_driver_name	= "__driver-cbc-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-blowfish-avx2",
+	.cra_driver_name	= "__driver-ctr-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1, /* byte granularity: ctr_crypt() handles a trailing partial block */
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt, /* same routine for both directions */
+		},
+	},
+}, {
+	.cra_name		= "ecb(blowfish)",
+	.cra_driver_name	= "ecb-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(blowfish)",
+	.cra_driver_name	= "cbc-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt, /* NOTE(review): differs from ecb/ctr; presumably because cbc_encrypt() never uses the FPU - confirm in ablk_helper */
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(blowfish)",
+	.cra_driver_name	= "ctr-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt, /* mirrors the blkcipher ctr entry: one routine for both directions */
+			.geniv		= "chainiv",
+		},
+	},
+} };
+
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) { /* osxsave is checked before xgetbv is used below */
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { /* both state bits must be set */
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); /* registers all six entries */
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); /* counterpart of the registration in init() */
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("blowfish");
+MODULE_ALIAS("blowfish-asm");
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 50ec333b70e6..3548d76dbaa9 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,7 +1,7 @@
 /*
  * Glue Code for assembler optimized version of Blowfish
  *
- * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,40 +32,24 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
+#include <asm/crypto/blowfish.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
+EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
+
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(blowfish_dec_blk);
 
 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
+
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, true);
-}
-
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
 
 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 93fe929d1cee..1243272605aa 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -278,6 +278,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_ssse3		boot_cpu_has(X86_FEATURE_SSSE3)
 #define cpu_has_aes		boot_cpu_has(X86_FEATURE_AES)
 #define cpu_has_avx		boot_cpu_has(X86_FEATURE_AVX)
+#define cpu_has_avx2		boot_cpu_has(X86_FEATURE_AVX2)
 #define cpu_has_ht		boot_cpu_has(X86_FEATURE_HT)
 #define cpu_has_mp		boot_cpu_has(X86_FEATURE_MP)
 #define cpu_has_nx		boot_cpu_has(X86_FEATURE_NX)
diff --git a/arch/x86/include/asm/crypto/blowfish.h b/arch/x86/include/asm/crypto/blowfish.h
new file mode 100644
index 000000000000..f097b2face10
--- /dev/null
+++ b/arch/x86/include/asm/crypto/blowfish.h
@@ -0,0 +1,43 @@
+#ifndef ASM_X86_BLOWFISH_H
+#define ASM_X86_BLOWFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/blowfish.h>
+
+#define BF_PARALLEL_BLOCKS 4
+
+/* regular block cipher functions */
+asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+				   bool xor);
+asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+
+/* 4-way parallel cipher functions */
+asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src, bool xor);
+asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false); /* xor = false; NOTE(review): presumably dst = E(src) - asm not shown here */
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true); /* xor = true; NOTE(review): presumably dst ^= E(src) - asm not shown here */
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false); /* xor = false; 4-way counterpart of blowfish_enc_blk() */
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true); /* xor = true; 4-way counterpart of blowfish_enc_blk_xor() */
+}
+
+#endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 0e7a23723b45..6b9564f91168 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -820,6 +820,24 @@ config CRYPTO_BLOWFISH_X86_64
 	  See also:
 	  <http://www.schneier.com/blowfish.html>
 
+config CRYPTO_BLOWFISH_AVX2_X86_64
+	tristate "Blowfish cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_BLOWFISH_COMMON
+	select CRYPTO_BLOWFISH_X86_64
+	help
+	  Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
+
+	  This is a variable key length cipher which can use keys from 32
+	  bits to 448 bits in length.  It's fast, simple and specifically
+	  designed for use on "large microprocessors".
+
+	  See also:
+	  <http://www.schneier.com/blowfish.html>
+
 config CRYPTO_CAMELLIA
 	tristate "Camellia cipher algorithms"
 	depends on CRYPTO
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 380708477b35..f3effb42531e 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1654,6 +1654,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "__driver-cbc-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "__driver-cbc-blowfish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-camellia-aesni",
 		.test = alg_test_null,
@@ -1676,6 +1679,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "__driver-ecb-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "__driver-ecb-blowfish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-camellia-aesni",
 		.test = alg_test_null,
@@ -1947,6 +1953,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-cbc-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "cryptd(__driver-cbc-blowfish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
@@ -1954,6 +1963,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "cryptd(__driver-ecb-blowfish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-camellia-aesni)",
 		.test = alg_test_null,
-- 
cgit v1.2.3-59-g8ed1b


From cf1521a1a5e21fd1e79a458605c4282fbfbbeee2 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sat, 13 Apr 2013 13:46:50 +0300
Subject: crypto: twofish - add AVX2/x86_64 assembler implementation of twofish
 cipher

This patch adds an AVX2/x86-64 implementation of the Twofish cipher that
requires 16 parallel blocks (256 bytes) of input. Table look-ups are performed
with the vpgatherdd instruction, indexed directly from vector registers, and
should therefore be faster than in earlier implementations. The implementation
also uses 256-bit wide YMM registers, which should give an additional speed-up
compared to the AVX implementation.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile               |   2 +
 arch/x86/crypto/glue_helper-asm-avx2.S | 180 ++++++++++
 arch/x86/crypto/twofish-avx2-asm_64.S  | 600 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/twofish_avx2_glue.c    | 584 ++++++++++++++++++++++++++++++++
 arch/x86/crypto/twofish_avx_glue.c     |  14 +-
 arch/x86/include/asm/crypto/twofish.h  |  18 +
 crypto/Kconfig                         |  24 ++
 crypto/testmgr.c                       |  12 +
 8 files changed, 1432 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/crypto/glue_helper-asm-avx2.S
 create mode 100644 arch/x86/crypto/twofish-avx2-asm_64.S
 create mode 100644 arch/x86/crypto/twofish_avx2_glue.c

(limited to 'crypto')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 28464ef6fa52..1f6e0c2e9140 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -43,6 +43,7 @@ endif
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -71,6 +72,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S
new file mode 100644
index 000000000000..a53ac11dd385
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -0,0 +1,180 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX2 assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu (0*32)(src), x0; /* unaligned 32-byte loads, src+0 ... */ \
+	vmovdqu (1*32)(src), x1; \
+	vmovdqu (2*32)(src), x2; \
+	vmovdqu (3*32)(src), x3; \
+	vmovdqu (4*32)(src), x4; \
+	vmovdqu (5*32)(src), x5; \
+	vmovdqu (6*32)(src), x6; \
+	vmovdqu (7*32)(src), x7; /* ... up to src+255 (256 bytes) */
+
+#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu x0, (0*32)(dst); /* unaligned 32-byte stores, dst+0 ... */ \
+	vmovdqu x1, (1*32)(dst); \
+	vmovdqu x2, (2*32)(dst); \
+	vmovdqu x3, (3*32)(dst); \
+	vmovdqu x4, (4*32)(dst); \
+	vmovdqu x5, (5*32)(dst); \
+	vmovdqu x6, (6*32)(dst); \
+	vmovdqu x7, (7*32)(dst); /* ... up to dst+255 (256 bytes) */
+
+#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
+	vpxor t0, t0, t0; \
+	vinserti128 $1, (src), t0, t0; /* ab: 0 ; cd: src[0..15] */ \
+	vpxor t0, x0, x0; /* ab of x0 (block 0) is left as is here */ \
+	vpxor (0*32+16)(src), x1, x1; /* xN ^= 32 bytes at src + 32*N - 16 */ \
+	vpxor (1*32+16)(src), x2, x2; \
+	vpxor (2*32+16)(src), x3, x3; \
+	vpxor (3*32+16)(src), x4, x4; \
+	vpxor (4*32+16)(src), x5, x5; \
+	vpxor (5*32+16)(src), x6, x6; \
+	vpxor (6*32+16)(src), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) /* callers pass minus_one as -1:0 */ \
+	vpcmpeqq minus_one, x, tmp; /* low qword == -1: adding 1 carries */ \
+	vpsubq minus_one, x, x; /* low qword += 1 (subtracts -1) */ \
+	vpslldq $8, tmp, tmp; /* move the carry mask into the high qword */ \
+	vpsubq tmp, x, x; /* high qword += 1 when the low qword carried */
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+	vpcmpeqq minus_one, x, tmp1; /* low qword == -1 ... */ \
+	vpcmpeqq minus_two, x, tmp2; /* ... or == -2: adding 2 carries */ \
+	vpsubq minus_two, x, x; /* low qword += 2 (subtracts -2) */ \
+	vpor tmp2, tmp1, tmp1; \
+	vpslldq $8, tmp1, tmp1; /* move the carry mask into the high qword */ \
+	vpsubq tmp1, x, x; /* high qword += 1 when the low qword carried */
+
+#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
+		       t1x, t2, t2x, t3, t3x, t4, t5) \
+	vpcmpeqd t0, t0, t0; \
+	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
+	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
+	\
+	/* load IV and byteswap */ \
+	vmovdqu (iv), t2x; \
+	vmovdqa t2x, t3x; /* t3x keeps le0 */ \
+	inc_le128(t2x, t0x, t1x); /* t2x = le1 */ \
+	vbroadcasti128 bswap, t1; /* t1x was only scratch for inc_le128 */ \
+	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
+	vpshufb t1, t2, x0; \
+	\
+	/* construct IVs */ \
+	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
+	vpshufb t1, t2, x1; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x2; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x3; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x4; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x5; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x6; \
+	add2_le128(t2, t0, t4, t3, t5); /* ab: le14 ; cd: le15 */ \
+	vpshufb t1, t2, x7; \
+	vextracti128 $1, t2, t2x; /* t2x = le15 */ \
+	inc_le128(t2x, t0x, t3x); /* t2x = le16 */ \
+	vmovdqu t2x, (iv); /* counter for the next call is written back */
+
+#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*32)(src), x0, x0; /* xN ^= 32 bytes at src + 32*N */ \
+	vpxor (1*32)(src), x1, x1; \
+	vpxor (2*32)(src), x2, x2; \
+	vpxor (3*32)(src), x3, x3; \
+	vpxor (4*32)(src), x4, x4; \
+	vpxor (5*32)(src), x5, x5; \
+	vpxor (6*32)(src), x6, x6; \
+	vpxor (7*32)(src), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; /* fill every dword with its own top bit */ \
+	vpaddq iv, iv, iv; /* shift both qwords left by one */ \
+	vpshufd $0x13, tmp, tmp; /* dword0 <- bit 127, dword2 <- bit 63 */ \
+	vpand mask, tmp, tmp; /* keep the mask bits where a bit fell out */ \
+	vpxor tmp, iv, iv; /* carry into the high qword and reduction */
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+	vpsrad $31, iv, tmp0; /* fill from the top bits (127 and 63) */ \
+	vpaddq iv, iv, tmp1; /* tmp1 = both qwords of iv << 1 */ \
+	vpsllq $2, iv, iv; /* iv = both qwords of iv << 2 */ \
+	vpshufd $0x13, tmp0, tmp0; \
+	vpsrad $31, tmp1, tmp1; /* fill from bits 126 and 62 of old iv */ \
+	vpand mask2, tmp0, tmp0; /* mask2: fix-up for bits 127 and 63 */ \
+	vpshufd $0x13, tmp1, tmp1; \
+	vpxor tmp0, iv, iv; \
+	vpand mask1, tmp1, tmp1; /* mask1: fix-up for bits 126 and 62 */ \
+	vpxor tmp1, iv, iv;
+
+#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
+		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
+		       xts_gf128mul_and_shl1_mask_0, \
+		       xts_gf128mul_and_shl1_mask_1) \
+	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
+	\
+	/* load IV and construct second IV */ \
+	vmovdqu (iv), tivx; \
+	vmovdqa tivx, t0x; /* t0x keeps tweak 0 */ \
+	gf128mul_x_ble(tivx, t1x, t2x); /* tivx = tweak 1, t2x is scratch */ \
+	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
+	vinserti128 $1, tivx, t0, tiv; /* ab: tweak 0 ; cd: tweak 1 */ \
+	vpxor (0*32)(src), tiv, x0; \
+	vmovdqu tiv, (0*32)(dst); /* tweaks wait in dst for store_xts_16way */ \
+	\
+	/* construct and store IVs, also xor with source */ \
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (1*32)(src), tiv, x1; /* src is read before dst is written */ \
+	vmovdqu tiv, (1*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (2*32)(src), tiv, x2; \
+	vmovdqu tiv, (2*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (3*32)(src), tiv, x3; \
+	vmovdqu tiv, (3*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (4*32)(src), tiv, x4; \
+	vmovdqu tiv, (4*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (5*32)(src), tiv, x5; \
+	vmovdqu tiv, (5*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (6*32)(src), tiv, x6; \
+	vmovdqu tiv, (6*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); /* ab: tweak 14 ; cd: tweak 15 */ \
+	vpxor (7*32)(src), tiv, x7; \
+	vmovdqu tiv, (7*32)(dst); \
+	\
+	vextracti128 $1, tiv, tivx; /* tivx = tweak 15 */ \
+	gf128mul_x_ble(tivx, t1x, t2x); /* tivx = tweak 16 */ \
+	vmovdqu tivx, (iv); /* tweak for the next call is written back */
+
+#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*32)(dst), x0, x0; /* dst holds what load_xts_16way stored */ \
+	vpxor (1*32)(dst), x1, x1; \
+	vpxor (2*32)(dst), x2, x2; \
+	vpxor (3*32)(dst), x3, x3; \
+	vpxor (4*32)(dst), x4, x4; \
+	vpxor (5*32)(dst), x5, x5; \
+	vpxor (6*32)(dst), x6, x6; \
+	vpxor (7*32)(dst), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/twofish-avx2-asm_64.S b/arch/x86/crypto/twofish-avx2-asm_64.S
new file mode 100644
index 000000000000..e1a83b9cd389
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx2-asm_64.S
@@ -0,0 +1,600 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Twofish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "twofish-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lvpshufb_mask0:
+.long 0x80808000
+.long 0x80808004
+.long 0x80808008
+.long 0x8080800c
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+/* structure of crypto context */
+#define s0	0
+#define s1	1024
+#define s2	2048
+#define s3	3072
+#define w	4096
+#define	k	4128
+
+/* register macros */
+#define CTX	%rdi
+
+#define RS0	CTX
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+#define RK	%r11
+#define RW	%rax
+#define RROUND  %r12
+#define RROUNDd %r12d
+
+#define RA0	%ymm8
+#define RB0	%ymm9
+#define RC0	%ymm10
+#define RD0	%ymm11
+#define RA1	%ymm12
+#define RB1	%ymm13
+#define RC1	%ymm14
+#define RD1	%ymm15
+
+/* temp regs */
+#define RX0	%ymm0
+#define RY0	%ymm1
+#define RX1	%ymm2
+#define RY1	%ymm3
+#define RT0	%ymm4
+#define RIDX	%ymm5
+
+#define RX0x	%xmm0
+#define RY0x	%xmm1
+#define RX1x	%xmm2
+#define RY1x	%xmm3
+#define RT0x	%xmm4
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/**********************************************************************
+  16-way AVX2 twofish
+ **********************************************************************/
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; /* all ones */ \
+	vpsrld $24, RNOT, RBYTE; /* 0x000000ff in every dword */ \
+	leaq k(CTX), RK; /* k: keys added in every round */ \
+	leaq w(CTX), RW; /* w: keys xored in before and after the rounds */ \
+	leaq s1(CTX), RS1; /* RS0 is CTX itself, s0 is at offset 0 */ \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3;
+
+#define g16(ab, rs0, rs1, rs2, rs3, xy) \
+	vpand RBYTE, ab ## 0, RIDX; /* byte 0 of each dword indexes rs0 */ \
+	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
+	vpcmpeqd RNOT, RNOT, RNOT; /* vpgatherdd zeroes its mask: refill */ \
+		/* same lookup for register set 1 */ \
+		vpand RBYTE, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+	/* byte 1 indexes rs1; the result is xored into xy */ \
+	vpsrld $8, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $8, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	/* byte 2 indexes rs2 */ \
+	vpsrld $16, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $16, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	/* byte 3 indexes rs3; after a shift by 24 no byte mask is needed */ \
+	vpsrld $24, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $24, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1;
+
+#define g1_16(a, x) /* bytes 0..3 of a look up s0..s3 in that order */ \
+	g16(a, RS0, RS1, RS2, RS3, x);
+
+#define g2_16(b, y) /* tables shifted by one: equals g1 of (b rol 8) */ \
+	g16(b, RS1, RS2, RS3, RS0, y);
+
+#define encrypt_round_end16(a, b, c, d, nk) \
+	vpaddd RY0, RX0, RX0; /* x += y */ \
+	vpaddd RX0, RY0, RY0; /* y += x */ \
+	vpbroadcastd nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RX0, RX0; /* x += key dword at RK + nk + 8 * RROUND */ \
+	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RY0, RY0; /* y += the key dword after it */ \
+	\
+	vpxor RY0, d ## 0, d ## 0; /* callers rotated d left by one before */ \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	vpsrld $1, c ## 0, RT0; \
+	vpslld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; /* c = ror(c ^ x, 1) */ \
+	/* same steps for register set 1; a and b are not used here */ \
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		vpsrld $1, c ## 1, RT0; \
+		vpslld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1;
+
+#define encrypt_round16(a, b, c, d, nk) \
+	g2_16(b, RY); \
+	/* b = rol(b, 1) now: b is passed as d to the following round */ \
+	vpslld $1, b ## 0, RT0; \
+	vpsrld $31, b ## 0, b ## 0; \
+	vpor RT0, b ## 0, b ## 0; \
+	\
+		vpslld $1, b ## 1, RT0; \
+		vpsrld $31, b ## 1, b ## 1; \
+		vpor RT0, b ## 1, b ## 1; \
+	\
+	g1_16(a, RX); \
+	/* combine RX/RY with the round keys and fold them into c and d */ \
+	encrypt_round_end16(a, b, c, d, nk);
+
+#define encrypt_round_first16(a, b, c, d, nk) \
+	vpslld $1, d ## 0, RT0; /* d = rol(d, 1): no earlier round did it */ \
+	vpsrld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpslld $1, d ## 1, RT0; \
+		vpsrld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1; \
+	\
+	encrypt_round16(a, b, c, d, nk);
+
+#define encrypt_round_last16(a, b, c, d, nk) \
+	g2_16(b, RY); \
+	/* unlike encrypt_round16, b is not rotated: no round follows */ \
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk);
+
+#define decrypt_round_end16(a, b, c, d, nk) \
+	vpaddd RY0, RX0, RX0; /* x += y */ \
+	vpaddd RX0, RY0, RY0; /* y += x */ \
+	vpbroadcastd nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RX0, RX0; /* x += key dword at RK + nk + 8 * RROUND */ \
+	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RY0, RY0; /* y += the key dword after it */ \
+	\
+	vpxor RX0, c ## 0, c ## 0; /* callers rotated c left by one before */ \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	vpsrld $1, d ## 0, RT0; \
+	vpslld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; /* d = ror(d ^ y, 1) */ \
+	/* same steps for register set 1; a and b are not used here */ \
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		vpsrld $1, d ## 1, RT0; \
+		vpslld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1;
+
+#define decrypt_round16(a, b, c, d, nk) \
+	g1_16(a, RX); \
+	/* a = rol(a, 1) now: a is passed as c to the following round */ \
+	vpslld $1, a ## 0, RT0; \
+	vpsrld $31, a ## 0, a ## 0; \
+	vpor RT0, a ## 0, a ## 0; \
+	\
+		vpslld $1, a ## 1, RT0; \
+		vpsrld $31, a ## 1, a ## 1; \
+		vpor RT0, a ## 1, a ## 1; \
+	\
+	g2_16(b, RY); \
+	/* combine RX/RY with the round keys and fold them into c and d */ \
+	decrypt_round_end16(a, b, c, d, nk);
+
+#define decrypt_round_first16(a, b, c, d, nk) \
+	vpslld $1, c ## 0, RT0; /* c = rol(c, 1): no earlier round did it */ \
+	vpsrld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpslld $1, c ## 1, RT0; \
+		vpsrld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+	\
+	decrypt_round16(a, b, c, d, nk)
+
+#define decrypt_round_last16(a, b, c, d, nk) \
+	g1_16(a, RX); \
+	/* unlike decrypt_round16, a is not rotated: no round follows */ \
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk);
+
+#define encrypt_cycle16() \
+	encrypt_round16(RA, RB, RC, RD, 0); \
+	encrypt_round16(RC, RD, RA, RB, 8);
+
+#define encrypt_cycle_first16() \
+	encrypt_round_first16(RA, RB, RC, RD, 0); \
+	encrypt_round16(RC, RD, RA, RB, 8);
+
+#define encrypt_cycle_last16() \
+	encrypt_round16(RA, RB, RC, RD, 0); \
+	encrypt_round_last16(RC, RD, RA, RB, 8);
+
+#define decrypt_cycle16(n) \
+	decrypt_round16(RC, RD, RA, RB, 8); \
+	decrypt_round16(RA, RB, RC, RD, 0);
+
+#define decrypt_cycle_first16(n) \
+	decrypt_round_first16(RC, RD, RA, RB, 8); \
+	decrypt_round16(RA, RB, RC, RD, 0);
+
+#define decrypt_cycle_last16(n) \
+	decrypt_round16(RC, RD, RA, RB, 8); \
+	decrypt_round_last16(RA, RB, RC, RD, 0);
+
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1,	x0, x1; \
+	vpunpcklqdq t1,	x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2,	t2, x2;
+
+#define read_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define write_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define inpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define outunpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, c, RX0; /* new a = c ^ w[4] */ \
+	vpxor RY0, d, RY0; /* new b = d ^ w[5] */ \
+	/* old a and b are still needed, so new a/b wait in RX0/RY0 */ \
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, a, c; /* new c = a ^ w[6] */ \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, b, d; /* new d = b ^ w[7] */ \
+	\
+	vmovdqa RX0, a; \
+	vmovdqa RY0, b;
+
+#define inpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, a, RX0; /* new c = a ^ w[4] */ \
+	vpxor RY0, b, RY0; /* new d = b ^ w[5] */ \
+	/* a and b are overwritten next, so new c/d wait in RX0/RY0 */ \
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, c, a; /* new a = c ^ w[6] */ \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, d, b; /* new b = d ^ w[7] */ \
+	\
+	vmovdqa RX0, c; \
+	vmovdqa RY0, d;
+
+#define outunpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define read_blocks16(a,b,c,d) \
+	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define write_blocks16(a,b,c,d) \
+	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define xor_blocks16(a,b,c,d) \
+	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_enc16(a,b,c,d) \
+	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_enc16(a,b,c,d) \
+	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_dec16(a,b,c,d) \
+	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_dec16(a,b,c,d) \
+	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+.align 8
+__twofish_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
+	 */
+	init_round_constants(); /* sets r8-r11, rax and RNOT/RBYTE */
+
+	read_blocks16(RA, RB, RC, RD);
+	inpack_enc16(RA, RB, RC, RD);
+
+	xorl RROUNDd, RROUNDd; /* RROUND = first round of the cycle */
+	encrypt_cycle_first16(); /* rounds 0 and 1 */
+	movl $2, RROUNDd;
+
+.align 4
+.L__enc_loop:
+	encrypt_cycle16(); /* two rounds per cycle: RROUND = 2, 4, ... 12 */
+
+	addl $2, RROUNDd;
+	cmpl $14, RROUNDd;
+	jne .L__enc_loop;
+
+	encrypt_cycle_last16(); /* rounds 14 and 15 */
+
+	outunpack_enc16(RA, RB, RC, RD);
+	write_blocks16(RA, RB, RC, RD);
+
+	ret;
+ENDPROC(__twofish_enc_blk16)
+
+.align 8
+__twofish_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
+	 */
+	init_round_constants(); /* sets r8-r11, rax and RNOT/RBYTE */
+
+	read_blocks16(RA, RB, RC, RD);
+	inpack_dec16(RA, RB, RC, RD);
+
+	movl $14, RROUNDd; /* RROUND = lower round of the cycle */
+	decrypt_cycle_first16(); /* rounds 15 and 14 */
+	movl $12, RROUNDd;
+
+.align 4
+.L__dec_loop:
+	decrypt_cycle16(); /* two rounds per cycle: RROUND = 12, 10, ... 2 */
+
+	addl $-2, RROUNDd;
+	jnz .L__dec_loop; /* falls through once RROUND reaches 0 */
+
+	decrypt_cycle_last16(); /* rounds 1 and 0 */
+
+	outunpack_dec16(RA, RB, RC, RD);
+	write_blocks16(RA, RB, RC, RD);
+
+	ret;
+ENDPROC(__twofish_dec_blk16)
+
+ENTRY(twofish_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_enc_blk16;
+
+	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ecb_enc_16way)
+
+ENTRY(twofish_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_dec_blk16;
+
+	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ecb_dec_16way)
+
+ENTRY(twofish_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
+			RX0);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_cbc_dec_16way)
+
+ENTRY(twofish_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
+		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
+		       RBYTE);
+
+	call __twofish_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ctr_16way)
+
+.align 8
+twofish_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
+	 */
+
+	vzeroupper;
+	pushq %r12; /* r12 is RROUND inside the cipher core */
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
+		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call *%r8; /* the callee reuses r8 as RS1, so r8 is lost afterwards */
+
+	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_xts_crypt_16way)
+
+ENTRY(twofish_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	leaq __twofish_enc_blk16, %r8;
+	jmp twofish_xts_crypt_16way;
+ENDPROC(twofish_xts_enc_16way)
+
+ENTRY(twofish_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	leaq __twofish_dec_blk16, %r8;
+	jmp twofish_xts_crypt_16way;
+ENDPROC(twofish_xts_dec_16way)
diff --git a/arch/x86/crypto/twofish_avx2_glue.c b/arch/x86/crypto/twofish_avx2_glue.c
new file mode 100644
index 000000000000..ce33b5be64ee
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx2_glue.c
@@ -0,0 +1,584 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Twofish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/twofish.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/scatterwalk.h>
+
+#define TF_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+
+asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static const struct common_glue_ctx twofish_enc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
+	},  {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* the 8-way AVX routines are reused, so FPU use starts at 8 blocks */
+	return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void twofish_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct twofish_ctx *ctx;
+	bool fpu_enabled;
+};
+
+/* LRW crypt_fn: ECB-encrypt nbytes at srcdst in place, widest routine first */
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
+		twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= 8 * bsize) {
+		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 8;
+		nbytes -= bsize * 8;
+	}
+
+	while (nbytes >= 3 * bsize) {
+		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 3;
+		nbytes -= bsize * 3;
+	}
+
+	for (; nbytes >= bsize; srcdst += bsize, nbytes -= bsize)
+		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+/* LRW crypt_fn: ECB-decrypt nbytes at srcdst in place, widest routine first */
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
+		twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= 8 * bsize) {
+		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 8;
+		nbytes -= bsize * 8;
+	}
+
+	while (nbytes >= 3 * bsize) {
+		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 3;
+		nbytes -= bsize * 3;
+	}
+
+	for (; nbytes >= bsize; srcdst += bsize, nbytes -= bsize)
+		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg tf_algs[10] = { { /* 5 "__" blkcipher entries (prio 0), then 5 async entries (prio 500) */
+	.cra_name		= "__ecb-twofish-avx2",
+	.cra_driver_name	= "__driver-ecb-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-twofish-avx2",
+	.cra_driver_name	= "__driver-cbc-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-twofish-avx2",
+	.cra_driver_name	= "__driver-ctr-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1, /* ctr entries use 1 rather than TF_BLOCK_SIZE */
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt, /* same handler for both directions */
+		},
+	},
+}, {
+	.cra_name		= "__lrw-twofish-avx2",
+	.cra_driver_name	= "__driver-lrw-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_twofish_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE, /* cipher key plus one block of extra key material */
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-twofish-avx2",
+	.cra_driver_name	= "__driver-xts-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2, /* presumably one key each for tweak_ctx and crypt_ctx - confirm */
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(twofish)",
+	.cra_driver_name	= "ecb-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(twofish)",
+	.cra_driver_name	= "cbc-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt, /* NOTE(review): only cbc uses __ablk_encrypt; presumably intended - confirm */
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(twofish)",
+	.cra_driver_name	= "ctr-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt, /* encrypt handler reused, as __ctr reuses ctr_crypt */
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) { /* both features are required */
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); /* only reached once osxsave was seen */
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { /* SSE and YMM bits must both be set */
+		pr_info("AVX2 detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); /* whole table; result is the module init status */
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); /* same table and count as init() */
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("twofish");
+MODULE_ALIAS("twofish-asm");
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index a62ba541884e..2047a562f6b3 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -50,18 +50,26 @@
 /* 8-way parallel cipher functions */
 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
+
 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
 
 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
+
 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
 				 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_ctr_8way);
 
 asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
 asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -69,17 +77,19 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
-static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_enc_blk));
 }
+EXPORT_SYMBOL_GPL(twofish_xts_enc);
 
-static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(twofish_dec_blk));
 }
+EXPORT_SYMBOL_GPL(twofish_xts_dec);
 
 
 static const struct common_glue_ctx twofish_enc = {
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index 878c51ceebb5..e655c6029b45 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
 
+/* 8-way parallel cipher functions */
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+				 const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
@@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
 extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);
 
+/* helpers from twofish-avx module */
+extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
 #endif /* ASM_X86_TWOFISH_H */
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 6b9564f91168..1ba48ddd4da1 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1250,6 +1250,30 @@ config CRYPTO_TWOFISH_AVX_X86_64
 	  See also:
 	  <http://www.schneier.com/twofish.html>
 
+config CRYPTO_TWOFISH_AVX2_X86_64
+	tristate "Twofish cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_TWOFISH_COMMON
+	select CRYPTO_TWOFISH_X86_64
+	select CRYPTO_TWOFISH_X86_64_3WAY
+	select CRYPTO_TWOFISH_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Twofish cipher algorithm (x86_64/AVX2).
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 comment "Compression"
 
 config CRYPTO_DEFLATE
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index f3effb42531e..fea7841dd6f3 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1650,6 +1650,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__cbc-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__cbc-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-aes-aesni",
 		.test = alg_test_null,
@@ -1675,6 +1678,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-cbc-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-aes-aesni",
 		.test = alg_test_null,
@@ -1700,6 +1706,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-ecb-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__ghash-pclmulqdqni",
 		.test = alg_test_null,
@@ -1984,6 +1993,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-ecb-twofish-avx)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-twofish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-gcm-aes-aesni)",
 		.test = alg_test_null,
-- 
cgit v1.2.3-59-g8ed1b


From 56d76c96a9f3e39ab733c5643b3ce5a1d4be242a Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sat, 13 Apr 2013 13:46:55 +0300
Subject: crypto: serpent - add AVX2/x86_64 assembler implementation of serpent
 cipher

Patch adds AVX2/x86-64 implementation of Serpent cipher, requiring 16 parallel
blocks for input (256 bytes). Implementation is based on the AVX implementation
and extends to use the 256-bit wide YMM registers. Since serpent does not use
table look-ups, this implementation should be close to two times faster than
the AVX implementation.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                  |   2 +
 arch/x86/crypto/serpent-avx2-asm_64.S     | 800 ++++++++++++++++++++++++++++++
 arch/x86/crypto/serpent_avx2_glue.c       | 562 +++++++++++++++++++++
 arch/x86/crypto/serpent_avx_glue.c        |  62 ++-
 arch/x86/include/asm/crypto/serpent-avx.h |  24 +
 crypto/Kconfig                            |  23 +
 crypto/testmgr.c                          |  15 +
 7 files changed, 1468 insertions(+), 20 deletions(-)
 create mode 100644 arch/x86/crypto/serpent-avx2-asm_64.S
 create mode 100644 arch/x86/crypto/serpent_avx2_glue.c

(limited to 'crypto')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1f6e0c2e9140..a21af593ab8d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -43,6 +43,7 @@ endif
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
 	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
 
@@ -72,6 +73,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
 
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
new file mode 100644
index 000000000000..b222085cccac
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -0,0 +1,800 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ *  Copyright © 2012 Johannes Goetzfried
+ *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp  %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
+#define S0_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x4; \
+	vpxor		RNOT, x4, x4; \
+	vpxor		x1,   tp, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define S0_2(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x0, x0; \
+	vpor		x0,   x4, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x1,   x2, x2; \
+	vpxor		x2,   x3, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x2,   x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		RNOT, x3, x3; \
+	vpand		tp,   x1, x4; \
+	vpor		tp,   x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x3,   tp, x1;
+#define S1_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpor		x4,   x1, x1; \
+	vpxor		x2,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x2, x2; \
+	vpor		x0,   x1, x1; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x1,   x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, tp; \
+	vpxor		x3,   tp, tp; \
+	vpor		x0,   x3, x3; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpand		tp,   x1, x1;
+#define S2_2(x0, x1, x2, x3, x4)      \
+	vpxor		x2,   tp, tp; \
+	vpand		x3,   x2, x2; \
+	vpor		x1,   x3, x3; \
+	vpxor		RNOT, tp, tp; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x2,   tp, x0; \
+	vpor		x2,   x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x1, tp; \
+	vpor		x0,   x3, x3; \
+	vpand		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpand		x3,   tp, x1; \
+	vpxor		x3,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4;
+#define S3_2(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, x1; \
+	vpand		x3,   x0, x0; \
+	vpand		x4,   x3, x3; \
+	vpxor		x2,   x3, x3; \
+	vpor		x1,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x0, x0; \
+	vpxor		tp,   x3, x4; \
+	vpor		x0,   x2, x2; \
+	vpxor		x1,   x2, x2;
+#define S4_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpand		x2,   x4, x4; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   x4, x4; \
+	vpor		x1,   tp, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x0,   x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x1, tp; \
+	vpxor		tp,   x2, x2; \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x4,   tp, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x0,   x4, x4;
+#define S5_2(x0, x1, x2, x3, x4)      \
+	vpand		x3,   x0, x0; \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x1,   x0, x0; \
+	vpand		x4,   x2, x2; \
+	vpxor		x2,   x1, x1; \
+	vpand		x0,   x2, x2; \
+	vpxor		x2,   x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, tp; \
+	vpxor		x0,   x2, x2; \
+	vpand		x3,   x0, x0; \
+	vpor		x3,   tp, tp; \
+	vpxor		RNOT, x1, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x2,   tp, x1;
+#define S6_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x1, tp; \
+	vpxor		RNOT, x0, x0; \
+	vpand		x2,   tp, x1; \
+	vpxor		x3,   x1, x1; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x3; \
+	vpor		x1,   x0, x0;
+#define S7_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x2, x2; \
+	vpxor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpand		x0,   x3, x3; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x4, x4; \
+	vpxor		x1,   x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x1, x1; \
+	vpor		x1,   x3, tp; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   tp, x3; \
+	vpand		x1,   x0, x0; \
+	vpxor		x2,   x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4)     \
+	vpand		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x3,   x1, x1; \
+	vpand		x0,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, tp; \
+	vpxor		RNOT, x2, x2; \
+	vpor		x1,   x0, x4; \
+	vpxor		x3,   x4, x4; \
+	vpand		x1,   x3, x3; \
+	vpxor		x2,   x1, x1; \
+	vpand		x4,   x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x4, x4; \
+	vpor		x3,   x1, x1; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x2, x2; \
+	vpor		x4,   tp, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x1,   x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpxor		RNOT, x3, tp; \
+	vpor		x2,   tp, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x4; \
+	vpxor		x1,   tp, x3; \
+	vpor		x2,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4)     \
+	vpxor		x4,   x1, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpand		x2,   x1, tp; \
+	vpxor		x0,   tp, tp; \
+	vpor		x1,   x0, x0; \
+	vpxor		x3,   x1, x4; \
+	vpxor		x3,   x0, x0; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x1,   x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x0, tp; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		RNOT, x0, x4; \
+	vpxor		tp,   x1, x1; \
+	vpxor		x2,   tp, x0; \
+	vpand		x4,   x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x0, x0; \
+	vpand		x2,   x3, x3; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x1,   x3, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4)     \
+	vpor		x2,   x1, tp; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x3,   tp, tp; \
+	vpand		x1,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpor		x0,   x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4)     \
+	vpxor		tp,   x1, x4; \
+	vpxor		x4,   x2, x2; \
+	vpand		x0,   x4, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x3,   tp, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x3,   x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4)     \
+	vpxor		x2,   x0, x0; \
+	vpand		x3,   x0, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpand		tp,   x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4)     \
+	vpxor		RNOT, tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpand		x2,   x1, x1; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x2,   x4, x4; \
+	vpxor		x1,   tp, x0; \
+	vpxor		x0,   x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4)     \
+	vpand		x0,   x3, tp; \
+	vpxor		x2,   x0, x0; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpor		tp,   x1, x1; \
+	vpxor		x0,   x4, x4; \
+	vpand		x2,   x0, x0; \
+	vpxor		x1,   x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4)     \
+	vpand		x2,   x1, x1; \
+	vpxor		x2,   tp, x3; \
+	vpxor		x3,   x4, x4; \
+	vpand		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x4,   x3, x3; \
+	vpand		x0,   x4, x4; \
+	vpxor		x2,   x4, x4;
+
+#define get_key(i,j,t) \
+	vpbroadcastd (4*(i)+(j))*4(CTX), t; /* t = 32-bit word 4*i+j at CTX, copied to every lane */
+
+#define K2(x0, x1, x2, x3, x4, i) /* xor key words 4*i .. 4*i+3 into both register sets; x4 unused */ \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	vpxor RK0,	x0 ## 1, x0 ## 1; \
+	vpxor RK1,	x1 ## 1, x1 ## 1; \
+	vpxor RK2,	x2 ## 1, x2 ## 1; \
+	vpxor RK3,	x3 ## 1, x3 ## 1; \
+		vpxor RK0,	x0 ## 2, x0 ## 2; \
+		vpxor RK1,	x1 ## 2, x1 ## 2; \
+		vpxor RK2,	x2 ## 2, x2 ## 2; \
+		vpxor RK3,	x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	vpslld $13,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+		vpslld $13,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $1,		x1 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	get_key(i, 1, RK1); \
+		vpslld $1,		x1 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		get_key(i, 3, RK3); \
+	vpslld $7,		x3 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	get_key(i, 0, RK0); \
+		vpslld $7,		x3 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		get_key(i, 2, RK2); \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpslld $5,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $22,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpslld $5,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $22,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+	vpsrld $5,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpsrld $22,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;     \
+		vpsrld $5,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpsrld $22,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpsrld $1,		x1 ## 1, x4 ## 1;          \
+	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpsrld $1,		x1 ## 2, x4 ## 2;          \
+		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+	vpsrld $7,		x3 ## 1, x4 ## 1;          \
+	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+		vpsrld $7,		x3 ## 2, x4 ## 2;          \
+		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpsrld $13,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpsrld $3,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+		vpsrld $13,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpsrld $3,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) /* run SBOX_1 then SBOX_2 on register set 1, then on set 2 */ \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) /* same S-box sequence as S(), with loads of key i into RK0-RK3 interleaved */ \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 3, RK3); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq		x1, x0, t0; \
+	vpunpckhdq		x1, x0, t2; \
+	vpunpckldq		x3, x2, t1; \
+	vpunpckhdq		x3, x2, x3; \
+	/* second stage: combine the 32-bit interleaves as 64-bit pairs */ \
+	vpunpcklqdq		t1, t0, x0; \
+	vpunpckhqdq		t1, t0, x1; \
+	vpunpcklqdq		x3, t2, x2; \
+	vpunpckhqdq		x3, t2, x3; /* result in x0-x3; t0-t2 are overwritten */
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* plain alias; identical to write_blocks */
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) /* plain alias; identical to read_blocks */
+
+.align 8
+__serpent_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT; /* RNOT = all ones; the S-box macros xor with it to invert */
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); /* RK0-RK2 are only scratch here */
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 0); /* key index 0 */
+	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9); /* S0-S7 sequence repeats from here */
+	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32); /* last step is K2 with key index 32, not LK2 */
+
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_enc_blk16)
+
+.align 8
+__serpent_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
+
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_dec_blk16)
+
+ENTRY(serpent_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk16;
+
+	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_enc_16way)
+
+ENTRY(serpent_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_dec_16way)
+
+ENTRY(serpent_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+			RK0);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_cbc_dec_16way)
+
+ENTRY(serpent_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       tp);
+
+	call __serpent_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ctr_16way)
+
+ENTRY(serpent_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_enc_blk16;
+
+	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_enc_16way)
+
+ENTRY(serpent_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_dec_blk16;
+
+	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_dec_16way)
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
new file mode 100644
index 000000000000..23aabc6c20a5
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -0,0 +1,562 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <crypto/serpent.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/serpent-avx.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
+	},  {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* reuses 8-way AVX code, so the FPU threshold is 8 blocks, not 16 */
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+/* lrw crypt_fn: ECB-encrypt nbytes in place, widest routine first */
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (; nbytes >= bsize; nbytes -= bsize, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+/* lrw crypt_fn: ECB-decrypt nbytes in place, widest routine first */
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (; nbytes >= bsize; nbytes -= bsize, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg srp_algs[10] = { {
+	.cra_name		= "__ecb-serpent-avx2",
+	.cra_driver_name	= "__driver-ecb-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-serpent-avx2",
+	.cra_driver_name	= "__driver-cbc-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-serpent-avx2",
+	.cra_driver_name	= "__driver-ctr-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-serpent-avx2",
+	.cra_driver_name	= "__driver-lrw-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[3].cra_list),
+	.cra_exit		= lrw_serpent_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-serpent-avx2",
+	.cra_driver_name	= "__driver-xts-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[5].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[6].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[7].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(serpent)",
+	.cra_driver_name	= "lrw-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[8].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(serpent)",
+	.cra_driver_name	= "xts-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[9].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("serpent");
+MODULE_ALIAS("serpent-asm");
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 0f8519cf4ac2..9ae83cf8d21e 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -41,7 +41,32 @@
 #include <asm/crypto/ablk_helper.h>
 #include <asm/crypto/glue_helper.h>
 
-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+/* 8-way parallel cipher functions */
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
+
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
+
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
+
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
+
+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
+
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
+
+void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
@@ -51,18 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
+EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
 
-static void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(__serpent_encrypt));
 }
+EXPORT_SYMBOL_GPL(serpent_xts_enc);
 
-static void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(__serpent_decrypt));
 }
+EXPORT_SYMBOL_GPL(serpent_xts_dec);
+
 
 static const struct common_glue_ctx serpent_enc = {
 	.num_funcs = 2,
@@ -86,7 +115,7 @@ static const struct common_glue_ctx serpent_ctr = {
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
 	}, {
 		.num_blocks = 1,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
 	} }
 };
 
@@ -224,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
 }
 
-struct serpent_lrw_ctx {
-	struct lrw_table_ctx lrw_table;
-	struct serpent_ctx serpent_ctx;
-};
-
-static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 	int err;
@@ -243,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return lrw_init_table(&ctx->lrw_table, key + keylen -
 						SERPENT_BLOCK_SIZE);
 }
+EXPORT_SYMBOL_GPL(lrw_serpent_setkey);
 
 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -296,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
-static void lrw_exit_tfm(struct crypto_tfm *tfm)
+void lrw_serpent_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	lrw_free_table(&ctx->lrw_table);
 }
+EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm);
 
-struct serpent_xts_ctx {
-	struct serpent_ctx tweak_ctx;
-	struct serpent_ctx crypt_ctx;
-};
-
-static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *flags = &tfm->crt_flags;
@@ -331,6 +352,7 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 	/* second half of xts-key is for tweak */
 	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
 }
+EXPORT_SYMBOL_GPL(xts_serpent_setkey);
 
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -420,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_exit		= lrw_exit_tfm,
+	.cra_exit		= lrw_serpent_exit_tfm,
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= SERPENT_MIN_KEY_SIZE +
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 56e79cc57eaf..33c2b8a435da 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,6 +6,16 @@
 
 #define SERPENT_PARALLEL_BLOCKS 8
 
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
 asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src);
 asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
@@ -21,4 +31,18 @@ asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src, le128 *iv);
 
+extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+				le128 *iv);
+
+extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
+extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
 #endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 1ba48ddd4da1..9ad3d78c1075 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1131,6 +1131,29 @@ config CRYPTO_SERPENT_AVX_X86_64
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_AVX2_X86_64
+	tristate "Serpent cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_SERPENT
+	select CRYPTO_SERPENT_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides the Serpent cipher algorithm that processes
+	  16 blocks in parallel using the AVX2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index fea7841dd6f3..f5e13dea8cc9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1644,6 +1644,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__cbc-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__cbc-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__cbc-serpent-sse2",
 		.test = alg_test_null,
@@ -1672,6 +1675,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-cbc-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-serpent-sse2",
 		.test = alg_test_null,
@@ -1700,6 +1706,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-ecb-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-serpent-sse2",
 		.test = alg_test_null,
@@ -1968,6 +1977,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-cbc-serpent-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
@@ -1987,6 +1999,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-ecb-serpent-avx)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-serpent-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-serpent-sse2)",
 		.test = alg_test_null,
-- 
cgit v1.2.3-59-g8ed1b


From f3f935a76aa0eee68da2b273a08d84ba8ffc7a73 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sat, 13 Apr 2013 13:47:00 +0300
Subject: crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of
 camellia cipher

Patch adds AVX2/AES-NI/x86-64 implementation of Camellia cipher, requiring
32 parallel blocks for input (512 bytes). Compared to AVX implementation, this
version is extended to use the 256-bit wide YMM registers. For AES-NI
instructions data is split to two 128-bit registers and merged afterwards.
Even with this additional handling, performance should be higher compared
to the AES-NI/AVX implementation.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                     |    2 +
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 1368 ++++++++++++++++++++++++++
 arch/x86/crypto/camellia_aesni_avx2_glue.c   |  586 +++++++++++
 arch/x86/crypto/camellia_aesni_avx_glue.c    |   17 +-
 arch/x86/include/asm/crypto/camellia.h       |   19 +
 crypto/Kconfig                               |   23 +
 crypto/testmgr.c                             |   12 +
 7 files changed, 2024 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/crypto/camellia-aesni-avx2-asm_64.S
 create mode 100644 arch/x86/crypto/camellia_aesni_avx2_glue.c

(limited to 'crypto')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a21af593ab8d..a3a0ed80f17c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -43,6 +43,7 @@ endif
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
 	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
@@ -73,6 +74,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..91a1878fcc3e
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -0,0 +1,1368 @@
+/*
+ * x86_64/AVX2/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+	vpand x, mask4bit, tmp0; \
+	vpandn x, mask4bit, x; \
+	vpsrld $4, x, x; \
+	\
+	vpshufb tmp0, lo_t, tmp0; \
+	vpshufb x, hi_t, x; \
+	vpxor tmp0, x, x;
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/*
+ * AES-NI instructions do not support ymmX registers, so we need splitting and
+ * merging.
+ */
+#define vaesenclast256(zero, yreg, tmp) \
+	vextracti128 $1, yreg, tmp##_x; \
+	vaesenclast zero##_x, yreg##_x, yreg##_x; \
+	vaesenclast zero##_x, tmp##_x, tmp##_x; \
+	vinserti128 $1, tmp##_x, yreg, yreg;
+
+/**********************************************************************
+  32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+		  t7, mem_cd, key) \
+	/* \
+	 * S-function with AES subbytes \
+	 */ \
+	vbroadcasti128 .Linv_shift_row, t4; \
+	vpbroadcastb .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	\
+	/* AES inverse shift rows */ \
+	vpshufb t4, x0, x0; \
+	vpshufb t4, x7, x7; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x3, x3; \
+	vpshufb t4, x6, x6; \
+	\
+	/* prefilter sboxes 1, 2 and 3 */ \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x1, t0, t1, t7, t6); \
+	filter_8bit(x4, t0, t1, t7, t6); \
+	filter_8bit(x2, t0, t1, t7, t6); \
+	filter_8bit(x5, t0, t1, t7, t6); \
+	\
+	/* prefilter sbox 4 */ \
+	vpxor t4##_x, t4##_x, t4##_x; \
+	filter_8bit(x3, t2, t3, t7, t6); \
+	filter_8bit(x6, t2, t3, t7, t6); \
+	\
+	/* AES subbytes + AES shift rows */ \
+	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
+	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+	vaesenclast256(t4, x0, t5); \
+	vaesenclast256(t4, x7, t5); \
+	vaesenclast256(t4, x1, t5); \
+	vaesenclast256(t4, x4, t5); \
+	vaesenclast256(t4, x2, t5); \
+	vaesenclast256(t4, x5, t5); \
+	vaesenclast256(t4, x3, t5); \
+	vaesenclast256(t4, x6, t5); \
+	\
+	/* postfilter sboxes 1 and 4 */ \
+	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
+	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x3, t0, t1, t7, t6); \
+	filter_8bit(x6, t0, t1, t7, t6); \
+	\
+	/* postfilter sbox 3 */ \
+	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
+	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+	filter_8bit(x2, t2, t3, t7, t6); \
+	filter_8bit(x5, t2, t3, t7, t6); \
+	\
+	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+	\
+	/* postfilter sbox 2 */ \
+	filter_8bit(x1, t4, t5, t7, t2); \
+	filter_8bit(x4, t4, t5, t7, t2); \
+	\
+	vpsrldq $1, t0, t1; \
+	vpsrldq $2, t0, t2; \
+	vpsrldq $3, t0, t3; \
+	vpsrldq $4, t0, t4; \
+	vpsrldq $5, t0, t5; \
+	vpsrldq $6, t0, t6; \
+	vpsrldq $7, t0, t7; \
+	vpbroadcastb t0##_x, t0; \
+	vpbroadcastb t1##_x, t1; \
+	vpbroadcastb t2##_x, t2; \
+	vpbroadcastb t3##_x, t3; \
+	vpbroadcastb t4##_x, t4; \
+	vpbroadcastb t6##_x, t6; \
+	vpbroadcastb t5##_x, t5; \
+	vpbroadcastb t7##_x, t7; \
+	\
+	/* P-function */ \
+	vpxor x5, x0, x0; \
+	vpxor x6, x1, x1; \
+	vpxor x7, x2, x2; \
+	vpxor x4, x3, x3; \
+	\
+	vpxor x2, x4, x4; \
+	vpxor x3, x5, x5; \
+	vpxor x0, x6, x6; \
+	vpxor x1, x7, x7; \
+	\
+	vpxor x7, x0, x0; \
+	vpxor x4, x1, x1; \
+	vpxor x5, x2, x2; \
+	vpxor x6, x3, x3; \
+	\
+	vpxor x3, x4, x4; \
+	vpxor x0, x5, x5; \
+	vpxor x1, x6, x6; \
+	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+	\
+	/* Add key material and result to CD (x becomes new CD) */ \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
+	vpxor t6, x1, x1; \
+	vpxor 5 * 32(mem_cd), x1, x1; \
+	\
+	vpxor t5, x2, x2; \
+	vpxor 6 * 32(mem_cd), x2, x2; \
+	\
+	vpxor t4, x3, x3; \
+	vpxor 7 * 32(mem_cd), x3, x3; \
+	\
+	vpxor t3, x4, x4; \
+	vpxor 0 * 32(mem_cd), x4, x4; \
+	\
+	vpxor t2, x5, x5; \
+	vpxor 1 * 32(mem_cd), x5, x5; \
+	\
+	vpxor t1, x6, x6; \
+	vpxor 2 * 32(mem_cd), x6, x6; \
+	\
+	vpxor t0, x7, x7; \
+	vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * Size optimization... with inlined roundsm32, binary would be over 5 times
+ * larger and would only be marginally faster.
+ */
+.align 8
+roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+		  %rcx, (%r9));
+	ret;
+ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
+		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
+		  %rax, (%r9));
+	ret;
+ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+	leaq (key_table + (i) * 8)(CTX), %r9; \
+	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+	\
+	vmovdqu x0, 4 * 32(mem_cd); \
+	vmovdqu x1, 5 * 32(mem_cd); \
+	vmovdqu x2, 6 * 32(mem_cd); \
+	vmovdqu x3, 7 * 32(mem_cd); \
+	vmovdqu x4, 0 * 32(mem_cd); \
+	vmovdqu x5, 1 * 32(mem_cd); \
+	vmovdqu x6, 2 * 32(mem_cd); \
+	vmovdqu x7, 3 * 32(mem_cd); \
+	\
+	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+	\
+	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+	/* Store new AB state */ \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+	vpcmpgtb v0, zero, t0; \
+	vpaddb v0, v0, v0; \
+	vpabsb t0, t0; \
+	\
+	vpcmpgtb v1, zero, t1; \
+	vpaddb v1, v1, v1; \
+	vpabsb t1, t1; \
+	\
+	vpcmpgtb v2, zero, t2; \
+	vpaddb v2, v2, v2; \
+	vpabsb t2, t2; \
+	\
+	vpor t0, v1, v1; \
+	\
+	vpcmpgtb v3, zero, t0; \
+	vpaddb v3, v3, v3; \
+	vpabsb t0, t0; \
+	\
+	vpor t1, v2, v2; \
+	vpor t2, v3, v3; \
+	vpor t0, v0, v0;
+
+/*
+ * IN:
+ *   l: byte-sliced AB state in memory
+ *   r: byte-sliced CD state in memory
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+	      tt1, tt2, tt3, kll, klr, krl, krr) \
+	/* \
+	 * t0 = kll; \
+	 * t0 &= ll; \
+	 * lr ^= rol32(t0, 1); \
+	 */ \
+	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+	vpxor tt0, tt0, tt0; \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpand l0, t0, t0; \
+	vpand l1, t1, t1; \
+	vpand l2, t2, t2; \
+	vpand l3, t3, t3; \
+	\
+	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor l4, t0, l4; \
+	vmovdqu l4, 4 * 32(l); \
+	vpxor l5, t1, l5; \
+	vmovdqu l5, 5 * 32(l); \
+	vpxor l6, t2, l6; \
+	vmovdqu l6, 6 * 32(l); \
+	vpxor l7, t3, l7; \
+	vmovdqu l7, 7 * 32(l); \
+	\
+	/* \
+	 * t2 = krr; \
+	 * t2 |= rr; \
+	 * rl ^= t2; \
+	 */ \
+	\
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpor 4 * 32(r), t0, t0; \
+	vpor 5 * 32(r), t1, t1; \
+	vpor 6 * 32(r), t2, t2; \
+	vpor 7 * 32(r), t3, t3; \
+	\
+	vpxor 0 * 32(r), t0, t0; \
+	vpxor 1 * 32(r), t1, t1; \
+	vpxor 2 * 32(r), t2, t2; \
+	vpxor 3 * 32(r), t3, t3; \
+	vmovdqu t0, 0 * 32(r); \
+	vmovdqu t1, 1 * 32(r); \
+	vmovdqu t2, 2 * 32(r); \
+	vmovdqu t3, 3 * 32(r); \
+	\
+	/* \
+	 * t2 = krl; \
+	 * t2 &= rl; \
+	 * rr ^= rol32(t2, 1); \
+	 */ \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpand 0 * 32(r), t0, t0; \
+	vpand 1 * 32(r), t1, t1; \
+	vpand 2 * 32(r), t2, t2; \
+	vpand 3 * 32(r), t3, t3; \
+	\
+	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor 4 * 32(r), t0, t0; \
+	vpxor 5 * 32(r), t1, t1; \
+	vpxor 6 * 32(r), t2, t2; \
+	vpxor 7 * 32(r), t3, t3; \
+	vmovdqu t0, 4 * 32(r); \
+	vmovdqu t1, 5 * 32(r); \
+	vmovdqu t2, 6 * 32(r); \
+	vmovdqu t3, 7 * 32(r); \
+	\
+	/* \
+	 * t0 = klr; \
+	 * t0 |= lr; \
+	 * ll ^= t0; \
+	 */ \
+	\
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpor l4, t0, t0; \
+	vpor l5, t1, t1; \
+	vpor l6, t2, t2; \
+	vpor l7, t3, t3; \
+	\
+	vpxor l0, t0, l0; \
+	vmovdqu l0, 0 * 32(l); \
+	vpxor l1, t1, l1; \
+	vmovdqu l1, 1 * 32(l); \
+	vpxor l2, t2, l2; \
+	vmovdqu l2, 2 * 32(l); \
+	vpxor l3, t3, l3; \
+	vmovdqu l3, 3 * 32(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+			      a3, b3, c3, d3, st0, st1) \
+	vmovdqu d2, st0; \
+	vmovdqu d3, st1; \
+	transpose_4x4(a0, a1, a2, a3, d2, d3); \
+	transpose_4x4(b0, b1, b2, b3, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu a0, st0; \
+	vmovdqu a1, st1; \
+	transpose_4x4(c0, c1, c2, c3, a0, a1); \
+	transpose_4x4(d0, d1, d2, d3, a0, a1); \
+	\
+	vbroadcasti128 .Lshufb_16x16b, a0; \
+	vmovdqu st1, a1; \
+	vpshufb a0, a2, a2; \
+	vpshufb a0, a3, a3; \
+	vpshufb a0, b0, b0; \
+	vpshufb a0, b1, b1; \
+	vpshufb a0, b2, b2; \
+	vpshufb a0, b3, b3; \
+	vpshufb a0, a1, a1; \
+	vpshufb a0, c0, c0; \
+	vpshufb a0, c1, c1; \
+	vpshufb a0, c2, c2; \
+	vpshufb a0, c3, c3; \
+	vpshufb a0, d0, d0; \
+	vpshufb a0, d1, d1; \
+	vpshufb a0, d2, d2; \
+	vpshufb a0, d3, d3; \
+	vmovdqu d3, st1; \
+	vmovdqu st0, d3; \
+	vpshufb a0, d3, a0; \
+	vmovdqu d2, st0; \
+	\
+	transpose_4x4(a0, b0, c0, d0, d2, d3); \
+	transpose_4x4(a1, b1, c1, d1, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu b0, st0; \
+	vmovdqu b1, st1; \
+	transpose_4x4(a2, b2, c2, d2, b0, b1); \
+	transpose_4x4(a3, b3, c3, d3, b0, b1); \
+	vmovdqu st0, b0; \
+	vmovdqu st1, b1; \
+	/* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio, key) \
+	vpbroadcastq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor 0 * 32(rio), x0, y7; \
+	vpxor 1 * 32(rio), x0, y6; \
+	vpxor 2 * 32(rio), x0, y5; \
+	vpxor 3 * 32(rio), x0, y4; \
+	vpxor 4 * 32(rio), x0, y3; \
+	vpxor 5 * 32(rio), x0, y2; \
+	vpxor 6 * 32(rio), x0, y1; \
+	vpxor 7 * 32(rio), x0, y0; \
+	vpxor 8 * 32(rio), x0, x7; \
+	vpxor 9 * 32(rio), x0, x6; \
+	vpxor 10 * 32(rio), x0, x5; \
+	vpxor 11 * 32(rio), x0, x4; \
+	vpxor 12 * 32(rio), x0, x3; \
+	vpxor 13 * 32(rio), x0, x2; \
+	vpxor 14 * 32(rio), x0, x1; \
+	vpxor 15 * 32(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd) \
+	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+	\
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab); \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu y0, 0 * 32(mem_cd); \
+	vmovdqu y1, 1 * 32(mem_cd); \
+	vmovdqu y2, 2 * 32(mem_cd); \
+	vmovdqu y3, 3 * 32(mem_cd); \
+	vmovdqu y4, 4 * 32(mem_cd); \
+	vmovdqu y5, 5 * 32(mem_cd); \
+	vmovdqu y6, 6 * 32(mem_cd); \
+	vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+	\
+	vmovdqu x0, stack_tmp0; \
+	\
+	vpbroadcastq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor x0, y7, y7; \
+	vpxor x0, y6, y6; \
+	vpxor x0, y5, y5; \
+	vpxor x0, y4, y4; \
+	vpxor x0, y3, y3; \
+	vpxor x0, y2, y2; \
+	vpxor x0, y1, y1; \
+	vpxor x0, y0, y0; \
+	vpxor x0, x7, x7; \
+	vpxor x0, x6, x6; \
+	vpxor x0, x5, x5; \
+	vpxor x0, x4, x4; \
+	vpxor x0, x3, x3; \
+	vpxor x0, x2, x2; \
+	vpxor x0, x1, x1; \
+	vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio) \
+	vmovdqu x0, 0 * 32(rio); \
+	vmovdqu x1, 1 * 32(rio); \
+	vmovdqu x2, 2 * 32(rio); \
+	vmovdqu x3, 3 * 32(rio); \
+	vmovdqu x4, 4 * 32(rio); \
+	vmovdqu x5, 5 * 32(rio); \
+	vmovdqu x6, 6 * 32(rio); \
+	vmovdqu x7, 7 * 32(rio); \
+	vmovdqu y0, 8 * 32(rio); \
+	vmovdqu y1, 9 * 32(rio); \
+	vmovdqu y2, 10 * 32(rio); \
+	vmovdqu y3, 11 * 32(rio); \
+	vmovdqu y4, 12 * 32(rio); \
+	vmovdqu y5, 13 * 32(rio); \
+	vmovdqu y6, 14 * 32(rio); \
+	vmovdqu y7, 15 * 32(rio);
+
+.data
+.align 32
+
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.Lpack_bswap:
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode */
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+	.long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 512 bytes
+	 *	%ymm0..%ymm15: 32 plaintext blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 32(%rax), %rcx;
+
+	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %rcx);
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 0);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX),
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 8);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX),
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 16);
+
+	movl $24, %r8d;
+	cmpl $16, key_length(CTX);
+	jne .Lenc_max32;
+
+.Lenc_done:
+	/* load CD for output */
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
+
+	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+
+	ret;
+
+.align 8
+.Lenc_max32:
+	movl $32, %r8d;
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX),
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 24);
+
+	jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk32)
+
+.align 8
+__camellia_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 512 bytes
+	 *	%r8d: 24 for 16 byte key, 32 for larger
+	 *	%ymm0..%ymm15: 32 encrypted blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 32(%rax), %rcx;
+
+	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %rcx);
+
+	cmpl $32, %r8d;
+	je .Ldec_max32;
+
+.Ldec_max24:
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 16);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX),
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX));
+
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 8);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX),
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX));
+
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 0);
+
+	/* load CD for output */
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
+
+	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+	ret;
+
+.align 8
+.Ldec_max32:
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 24);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX),
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX));
+
+	jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk32)
+
+ENTRY(camellia_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_enc_blk32;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ecb_enc_32way)
+
+ENTRY(camellia_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_dec_blk32;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ecb_dec_32way)
+
+ENTRY(camellia_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max: index 24 if key_length == 16, else 32 */
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	movq %rsp, %r10; /* saved so the stack scratch can be dropped later */
+	cmpq %rsi, %rdx;
+	je .Lcbc_dec_use_stack;
+
+	/* dst can be used as temporary storage, src is not overwritten. */
+	movq %rsi, %rax;
+	jmp .Lcbc_dec_continue;
+
+.Lcbc_dec_use_stack:
+	/*
+	 * dst still in-use (because dst == src), so use stack for temporary
+	 * storage.
+	 */
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+.Lcbc_dec_continue:
+	call __camellia_dec_blk32;
+
+	vmovdqu %ymm7, (%rax); /* spill: %ymm7 is reused to build the xor value */
+	vpxor %ymm7, %ymm7, %ymm7;
+	vinserti128 $1, (%rdx), %ymm7, %ymm7; /* { 0, src block 0 } */
+	vpxor (%rax), %ymm7, %ymm7; /* block 0 ^= 0: IV presumably applied by caller */
+	movq %r10, %rsp;
+	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; /* +16: xor with previous src block */
+	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_cbc_dec_32way)
+
+#define inc_le128(x, minus_one, tmp) /* x += 1 as a 128-bit LE value */ \
+	vpcmpeqq minus_one, x, tmp; /* low qword all-ones: add will carry */ \
+	vpsubq minus_one, x, x; /* low qword += 1 (callers pass -1:0) */ \
+	vpslldq $8, tmp, tmp; /* carry mask: low qword -> high qword */ \
+	vpsubq tmp, x, x; /* high qword += 1 if carry */
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+	vpcmpeqq minus_one, x, tmp1; /* low qword == -1 */ \
+	vpcmpeqq minus_two, x, tmp2; /* low qword == -2 */ \
+	vpsubq minus_two, x, x; /* low qword += 2 (callers pass -2:0) */ \
+	vpor tmp2, tmp1, tmp1; /* either value wraps when 2 is added */ \
+	vpslldq $8, tmp1, tmp1; /* carry mask: low qword -> high qword */ \
+	vpsubq tmp1, x, x; /* high qword += 1 if carry */
+
+ENTRY(camellia_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (little endian, 128bit); advanced by 32 on return
+	 */
+
+	vzeroupper;
+
+	movq %rsp, %r10; /* restored after the call (stack scratch case) */
+	cmpq %rsi, %rdx;
+	je .Lctr_use_stack;
+
+	/* dst can be used as temporary storage, src is not overwritten. */
+	movq %rsi, %rax;
+	jmp .Lctr_continue;
+
+.Lctr_use_stack:
+	subq $(16 * 32), %rsp; /* src == dst: use 512 bytes of stack instead */
+	movq %rsp, %rax;
+
+.Lctr_continue:
+	vpcmpeqd %ymm15, %ymm15, %ymm15;
+	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
+	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), %xmm0;
+	vmovdqa %xmm0, %xmm1;
+	inc_le128(%xmm0, %xmm15, %xmm14); /* xmm0 = iv + 1 */
+	vbroadcasti128 .Lbswap128_mask, %ymm14;
+	vinserti128 $1, %xmm0, %ymm1, %ymm0; /* ymm0 = { iv, iv + 1 } */
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 15 * 32(%rax);
+
+	/* construct IVs */
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 14 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 13 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 12 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 11 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm10;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm9;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm8;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm7;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm6;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm5;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm4;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm3;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm2;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm1;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vextracti128 $1, %ymm0, %xmm13; /* high lane: iv + 31 */
+	vpshufb %ymm14, %ymm0, %ymm0;
+	inc_le128(%xmm13, %xmm15, %xmm14); /* iv + 31 -> iv + 32 */
+	vmovdqu %xmm13, (%rcx); /* write the next counter back to iv */
+
+	/* inpack32_pre: xor in the first key_table entry */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor 11 * 32(%rax), %ymm15, %ymm11;
+	vpxor 12 * 32(%rax), %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	movq %r10, %rsp; /* drop the stack scratch, if any */
+
+	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ctr_32way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; /* each dword -> its sign bit replicated */ \
+	vpaddq iv, iv, iv; /* each qword <<= 1 */ \
+	vpshufd $0x13, tmp, tmp; /* dword0 <- dword3, dword2 <- dword1 */ \
+	vpand mask, tmp, tmp; /* mask picks carry/reduction bits (defined elsewhere) */ \
+	vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+	vpsrad $31, iv, tmp0; /* sign masks of iv (top bit of each dword) */ \
+	vpaddq iv, iv, tmp1; /* tmp1 = iv << 1 per qword */ \
+	vpsllq $2, iv, iv; /* iv <<= 2 per qword */ \
+	vpshufd $0x13, tmp0, tmp0; /* dword0 <- dword3, dword2 <- dword1 */ \
+	vpsrad $31, tmp1, tmp1; /* sign masks of iv << 1 (second-top bits) */ \
+	vpand mask2, tmp0, tmp0; \
+	vpshufd $0x13, tmp1, tmp1; \
+	vpxor tmp0, iv, iv; \
+	vpand mask1, tmp1, tmp1; \
+	vpxor tmp1, iv, iv;
+
+.align 8
+camellia_xts_crypt_32way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to  __camellia_enc_blk32 or __camellia_dec_blk32
+	 */
+
+	vzeroupper;
+
+	subq $(16 * 32), %rsp; /* 512-byte scratch, always on the stack here */
+	movq %rsp, %rax;
+
+	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
+
+	/* load IV and construct second IV */
+	vmovdqu (%rcx), %xmm0;
+	vmovdqa %xmm0, %xmm15;
+	gf128mul_x_ble(%xmm0, %xmm12, %xmm13); /* xmm0 = second tweak */
+	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
+	vinserti128 $1, %xmm0, %ymm15, %ymm0; /* ymm0 = { tweak 0, tweak 1 } */
+	vpxor 0 * 32(%rdx), %ymm0, %ymm15; /* src ^ tweak */
+	vmovdqu %ymm15, 15 * 32(%rax);
+	vmovdqu %ymm0, 0 * 32(%rsi); /* keep tweaks in dst for the final xor */
+
+	/* construct IVs */
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 14 * 32(%rax);
+	vmovdqu %ymm0, 1 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 13 * 32(%rax);
+	vmovdqu %ymm0, 2 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 12 * 32(%rax);
+	vmovdqu %ymm0, 3 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
+	vmovdqu %ymm0, 4 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
+	vmovdqu %ymm0, 5 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
+	vmovdqu %ymm0, 6 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
+	vmovdqu %ymm0, 7 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
+	vmovdqu %ymm0, 8 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
+	vmovdqu %ymm0, 9 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
+	vmovdqu %ymm0, 10 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
+	vmovdqu %ymm0, 11 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
+	vmovdqu %ymm0, 12 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
+	vmovdqu %ymm0, 13 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
+	vmovdqu %ymm0, 14 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 0 * 32(%rax);
+	vmovdqu %ymm0, 15 * 32(%rsi);
+
+	vextracti128 $1, %ymm0, %xmm0; /* high lane: last tweak used */
+	gf128mul_x_ble(%xmm0, %xmm12, %xmm15); /* advance once more */
+	vmovdqu %xmm0, (%rcx); /* write the next tweak back to iv */
+
+	/* inpack32_pre: xor in the key_table entry selected by %r8 */
+	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+	vpshufb .Lpack_bswap, %ymm15, %ymm15;
+	vpxor 0 * 32(%rax), %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor 12 * 32(%rax), %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call *%r9; /* __camellia_enc_blk32 or __camellia_dec_blk32 */
+
+	addq $(16 * 32), %rsp; /* release the scratch */
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7; /* xor with the tweaks kept in dst */
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_xts_crypt_32way)
+
+ENTRY(camellia_xts_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk32, %r9; /* block routine the shared body calls */
+
+	jmp camellia_xts_crypt_32way; /* tail jump: its ret returns to our caller */
+ENDPROC(camellia_xts_enc_32way)
+
+ENTRY(camellia_xts_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d; /* default index */
+	movl $24, %eax; /* index used when key_length == 16 */
+	cmovel %eax, %r8d;  /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk32, %r9; /* block routine the shared body calls */
+
+	jmp camellia_xts_crypt_32way; /* tail jump: its ret returns to our caller */
+ENDPROC(camellia_xts_dec_32way)
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
new file mode 100644
index 000000000000..414fe5d7946b
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -0,0 +1,586 @@
+/*
+ * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2/AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+	/* ECB encryption routines, widest (32 blocks) first, 1-block last */
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+	/* CTR routines, widest (32 blocks) first, 1-block last */
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+	/* XTS encryption routines: 32-way, 16-way, then single block */
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+	/* ECB decryption routines, widest (32 blocks) first, 1-block last */
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+	/* CBC decryption routines, widest (32 blocks) first, 1-block last */
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+	/* XTS decryption routines: 32-way, 16-way, then single block */
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+				       dst, src, nbytes); /* 1-block routine only */
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
+			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
+			      nbytes); /* callers store the result as fpu_enabled */
+}
+
+static inline void camellia_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			   unsigned int key_len)
+{
+	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
+				 &tfm->crt_flags);
+}
+
+struct crypt_priv { /* state handed to the LRW encrypt/decrypt callbacks */
+	struct camellia_ctx *ctx; /* cipher context passed to the block routines */
+	bool fpu_enabled; /* updated by each callback via camellia_fpu_begin() */
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+	/* Encrypt in place, widest routine first; 32/16-way run at most once. */
+	if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+	}
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+	/* Remaining whole blocks one at a time; a partial tail is ignored. */
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+	/* Decrypt in place, widest routine first; 32/16-way run at most once. */
+	if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+	}
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+	/* Remaining whole blocks one at a time; a partial tail is ignored. */
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+		/* tbuf above is on-stack scratch sized for one 32-block batch */
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+	/* presumably: must not sleep while a callback holds the FPU -- confirm */
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+		/* tbuf above is on-stack scratch sized for one 32-block batch */
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+	/* presumably: must not sleep while a callback holds the FPU -- confirm */
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	/* tweak fn + tweak_ctx, then crypt_ctx for the camellia_enc_xts table */
+	return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	/* the tweak function is camellia_enc_blk even though data is decrypted */
+	return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg cmll_algs[10] = { { /* 5 sync "__" algs, then 5 async */
+	.cra_name		= "__ecb-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-ecb-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-cbc-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-ctr-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt, /* same routine both ways */
+		},
+	},
+}, {
+	.cra_name		= "__lrw-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-lrw-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_camellia_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= lrw_camellia_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-xts-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= xts_camellia_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt, /* only mode using the __ variant */
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(camellia)",
+	.cra_driver_name	= "ctr-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt, /* CTR: same op both ways */
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(camellia)",
+	.cra_driver_name	= "lrw-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(camellia)",
+	.cra_driver_name	= "xts-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init camellia_aesni_init(void)
+{
+	u64 xcr0;
+	/* Refuse to load unless AVX2, AVX, AES-NI and OSXSAVE are all present. */
+	if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
+		pr_info("AVX2 or AES-NI instructions are not detected.\n");
+		return -ENODEV;
+	}
+	/* XCR0 must also have both the SSE and YMM state bits set. */
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX2 detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS("camellia");
+MODULE_ALIAS("camellia-asm");
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 4ff7ed47b3db..37fd0c0a81ea 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -26,33 +26,44 @@
 
 #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
 
-/* 16-way AES-NI parallel cipher functions */
+/* 16-way parallel cipher functions (avx/aes-ni) */
 asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
+
 asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
 
 asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
+
 asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_ctr_16way);
 
 asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
+
 asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
 
-static void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(camellia_enc_blk));
 }
+EXPORT_SYMBOL_GPL(camellia_xts_enc);
 
-static void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 				  GLUE_FUNC_CAST(camellia_dec_blk));
 }
+EXPORT_SYMBOL_GPL(camellia_xts_dec);
 
 static const struct common_glue_ctx camellia_enc = {
 	.num_funcs = 3,
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h
index 98038add801e..bb93333d9200 100644
--- a/arch/x86/include/asm/crypto/camellia.h
+++ b/arch/x86/include/asm/crypto/camellia.h
@@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
 
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
 static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				    const u8 *src)
 {
@@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
 extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
 				    le128 *iv);
 
+extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
 #endif /* ASM_X86_CAMELLIA_H */
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 9ad3d78c1075..622d8a48cbe9 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -894,6 +894,29 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
 
+config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
+	tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)"
+	depends on X86 && 64BIT
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAMELLIA_X86_64
+	select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Camellia cipher algorithm module (x86_64/AES-NI/AVX2).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAMELLIA_SPARC64
 	tristate "Camellia cipher algorithm (SPARC64)"
 	depends on SPARC64
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index f5e13dea8cc9..5823735cf381 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1666,6 +1666,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-cbc-camellia-aesni",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-camellia-aesni-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-cast5-avx",
 		.test = alg_test_null,
@@ -1697,6 +1700,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-ecb-camellia-aesni",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-camellia-aesni-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-cast5-avx",
 		.test = alg_test_null,
@@ -1977,6 +1983,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-cbc-camellia-aesni-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-cbc-serpent-avx2)",
 		.test = alg_test_null,
@@ -1990,6 +1999,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-ecb-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-camellia-aesni-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-cast5-avx)",
 		.test = alg_test_null,
-- 
cgit v1.2.3-59-g8ed1b