From 0a6a40c2a8c184a2fb467efacfb1cd338d719e0b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 17 Oct 2018 21:37:58 -0700
Subject: crypto: aes_ti - disable interrupts while accessing S-box

In the "aes-fixed-time" AES implementation, disable interrupts while
accessing the S-box, in order to make cache-timing attacks more
difficult.  Previously it was possible for the CPU to be interrupted
while the S-box was loaded into L1 cache, potentially evicting the
cachelines and causing later table lookups to be time-variant.

In tests I did on x86 and ARM, this doesn't affect performance
significantly.  Responsiveness is potentially a concern, but interrupts
are only disabled for a single AES block.

Note that even after this change, the implementation still isn't
necessarily guaranteed to be constant-time; see
https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
of the many difficulties involved in writing truly constant-time AES
software.  But it's valuable to make such attacks more difficult.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig  |  3 ++-
 crypto/aes_ti.c | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index f7a235db56aa..752005201013 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1006,7 +1006,8 @@ config CRYPTO_AES_TI
 	  8 for decryption), this implementation only uses just two S-boxes of
 	  256 bytes each, and attempts to eliminate data dependent latencies by
 	  prefetching the entire table into the cache at the start of each
-	  block.
+	  block. Interrupts are also disabled to avoid races where cachelines
+	  are evicted when the CPU is interrupted to do something else.
 
 config CRYPTO_AES_586
 	tristate "AES cipher algorithms (i586)"
diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
index 03023b2290e8..1ff9785b30f5 100644
--- a/crypto/aes_ti.c
+++ b/crypto/aes_ti.c
@@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	const u32 *rkp = ctx->key_enc + 4;
 	int rounds = 6 + ctx->key_length / 4;
 	u32 st0[4], st1[4];
+	unsigned long flags;
 	int round;
 
 	st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
@@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
 	st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
+	/*
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
+
 	st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
 	st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
 	st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
@@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
 	put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
 	put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
+
+	local_irq_restore(flags);
 }
 
 static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
@@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	const u32 *rkp = ctx->key_dec + 4;
 	int rounds = 6 + ctx->key_length / 4;
 	u32 st0[4], st1[4];
+	unsigned long flags;
 	int round;
 
 	st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
@@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
 	st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
 
+	/*
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
+
 	st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
 	st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
 	st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
@@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
 	put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
 	put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
+
+	local_irq_restore(flags);
 }
 
 static struct crypto_alg aes_alg = {
-- 
cgit v1.2.3-59-g8ed1b


From 913a3aa07d16e5b302f408d497a4b829910de247 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 17 Oct 2018 21:37:59 -0700
Subject: crypto: arm/aes - add some hardening against cache-timing attacks

Make the ARM scalar AES implementation closer to constant-time by
disabling interrupts and prefetching the tables into L1 cache.  This is
feasible because due to ARM's "free" rotations, the main tables are only
1024 bytes instead of the usual 4096 used by most AES implementations.

On ARM Cortex-A7, the speed loss is only about 5%.  The resulting code
is still over twice as fast as aes_ti.c.  Responsiveness is potentially
a concern, but interrupts are only disabled for a single AES block.

Note that even after these changes, the implementation still isn't
necessarily guaranteed to be constant-time; see
https://cr.yp.to/antiforgery/cachetiming-20050414.pdf for a discussion
of the many difficulties involved in writing truly constant-time AES
software.  But it's valuable to make such attacks more difficult.

Much of this patch is based on patches suggested by Ard Biesheuvel.

Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Kconfig           |  9 ++++++
 arch/arm/crypto/aes-cipher-core.S | 62 ++++++++++++++++++++++++++++++++-------
 crypto/aes_generic.c              |  9 +++---
 3 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index ef0c7feea6e2..0473a8f68389 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -69,6 +69,15 @@ config CRYPTO_AES_ARM
 	help
 	  Use optimized AES assembler routines for ARM platforms.
 
+	  On ARM processors without the Crypto Extensions, this is the
+	  fastest AES implementation for single blocks.  For multiple
+	  blocks, the NEON bit-sliced implementation is usually faster.
+
+	  This implementation may be vulnerable to cache timing attacks,
+	  since it uses lookup tables.  However, as countermeasures it
+	  disables IRQs and preloads the tables; it is hoped this makes
+	  such attacks very difficult.
+
 config CRYPTO_AES_ARM_BS
 	tristate "Bit sliced AES using NEON instructions"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5..f2d67c095e59 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/cache.h>
 
 	.text
@@ -41,7 +42,7 @@
 	.endif
 	.endm
 
-	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
 	__select	\out0, \in0, 0
 	__select	t0, \in1, 1
 	__load		\out0, \out0, 0, \sz, \op
@@ -73,6 +74,14 @@
 	__load		t0, t0, 3, \sz, \op
 	__load		\t4, \t4, 3, \sz, \op
 
+	.ifnb		\oldcpsr
+	/*
+	 * This is the final round and we're done with all data-dependent table
+	 * lookups, so we can safely re-enable interrupts.
+	 */
+	restore_irqs	\oldcpsr
+	.endif
+
 	eor		\out1, \out1, t1, ror #24
 	eor		\out0, \out0, t2, ror #16
 	ldm		rk!, {t1, t2}
@@ -83,14 +92,14 @@
 	eor		\out1, \out1, t2
 	.endm
 
-	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
-	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
 	.endm
 
-	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
-	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
 	.endm
 
 	.macro		__rev, out, in
@@ -118,13 +127,14 @@
 	.macro		do_crypt, round, ttab, ltab, bsz
 	push		{r3-r11, lr}
 
+	// Load keys first, to reduce latency in case they're not cached yet.
+	ldm		rk!, {r8-r11}
+
 	ldr		r4, [in]
 	ldr		r5, [in, #4]
 	ldr		r6, [in, #8]
 	ldr		r7, [in, #12]
 
-	ldm		rk!, {r8-r11}
-
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
 	__rev		r5, r5
@@ -138,6 +148,25 @@
 	eor		r7, r7, r11
 
 	__adrl		ttab, \ttab
+	/*
+	 * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
+	 * L1 cache, assuming cacheline size >= 32.  This is a hardening measure
+	 * intended to make cache-timing attacks more difficult.  They may not
+	 * be fully prevented, however; see the paper
+	 * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+	 * ("Cache-timing attacks on AES") for a discussion of the many
+	 * difficulties involved in writing truly constant-time AES software.
+	 */
+	 save_and_disable_irqs	t0
+	.set		i, 0
+	.rept		1024 / 128
+	ldr		r8, [ttab, #i + 0]
+	ldr		r9, [ttab, #i + 32]
+	ldr		r10, [ttab, #i + 64]
+	ldr		r11, [ttab, #i + 96]
+	.set		i, i + 128
+	.endr
+	push		{t0}		// oldcpsr
 
 	tst		rounds, #2
 	bne		1f
@@ -151,8 +180,21 @@
 	\round		r4, r5, r6, r7, r8, r9, r10, r11
 	b		0b
 
-2:	__adrl		ttab, \ltab
-	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
+2:	.ifb		\ltab
+	add		ttab, ttab, #1
+	.else
+	__adrl		ttab, \ltab
+	// Prefetch inverse S-box for final round; see explanation above
+	.set		i, 0
+	.rept		256 / 64
+	ldr		t0, [ttab, #i + 0]
+	ldr		t1, [ttab, #i + 32]
+	.set		i, i + 64
+	.endr
+	.endif
+
+	pop		{rounds}	// oldcpsr
+	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
@@ -175,7 +217,7 @@
 	.endm
 
 ENTRY(__aes_arm_encrypt)
-	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+	do_crypt	fround, crypto_ft_tab,, 2
 ENDPROC(__aes_arm_encrypt)
 
 	.align		5
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index ca554d57d01e..13df33aca463 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
 
 static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
 
-__visible const u32 crypto_ft_tab[4][256] = {
+/* cacheline-aligned to facilitate prefetching into cache */
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
 	{
 		0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
 		0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
@@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
 		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
 	{
 		0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
 		0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
 		0x00000030, 0x00000036, 0x000000a5, 0x00000038,
-- 
cgit v1.2.3-59-g8ed1b


From e40fdb500b2bc92ae2e7e4ff29f6e9dedd854cc2 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Mon, 29 Oct 2018 09:50:12 +0000
Subject: crypto: ccree - add support for CryptoCell 713

Add support for Arm TrustZone CryptoCell 713.
Note that this patch just enables using a 713 in backwards compatible mode
to 712. Newer 713 specific features will follow.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/Kconfig           |  2 +-
 drivers/crypto/ccree/cc_driver.c | 23 +++++++++++++++--------
 drivers/crypto/ccree/cc_driver.h |  5 +++--
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index caa98a7fe392..75d401c32a16 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -765,7 +765,7 @@ config CRYPTO_DEV_CCREE
 	help
 	  Say 'Y' to enable a driver for the REE interface of the Arm
 	  TrustZone CryptoCell family of processors. Currently the
-	  CryptoCell 712, 710 and 630 are supported.
+	  CryptoCell 713, 712, 710 and 630 are supported.
 	  Choose this if you wish to use hardware acceleration of
 	  cryptographic operations on the system REE.
 	  If unsure say Y.
diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c
index 1ff229c2aeab..630c598af627 100644
--- a/drivers/crypto/ccree/cc_driver.c
+++ b/drivers/crypto/ccree/cc_driver.c
@@ -43,6 +43,10 @@ struct cc_hw_data {
 
 /* Hardware revisions defs. */
 
+static const struct cc_hw_data cc713_hw = {
+	.name = "713", .rev = CC_HW_REV_713
+};
+
 static const struct cc_hw_data cc712_hw = {
 	.name = "712", .rev = CC_HW_REV_712, .sig =  0xDCC71200U
 };
@@ -56,6 +60,7 @@ static const struct cc_hw_data cc630p_hw = {
 };
 
 static const struct of_device_id arm_ccree_dev_of_match[] = {
+	{ .compatible = "arm,cryptocell-713-ree", .data = &cc713_hw },
 	{ .compatible = "arm,cryptocell-712-ree", .data = &cc712_hw },
 	{ .compatible = "arm,cryptocell-710-ree", .data = &cc710_hw },
 	{ .compatible = "arm,cryptocell-630p-ree", .data = &cc630p_hw },
@@ -297,15 +302,17 @@ static int init_cc_resources(struct platform_device *plat_dev)
 		return rc;
 	}
 
-	/* Verify correct mapping */
-	signature_val = cc_ioread(new_drvdata, new_drvdata->sig_offset);
-	if (signature_val != hw_rev->sig) {
-		dev_err(dev, "Invalid CC signature: SIGNATURE=0x%08X != expected=0x%08X\n",
-			signature_val, hw_rev->sig);
-		rc = -EINVAL;
-		goto post_clk_err;
+	if (hw_rev->rev <= CC_HW_REV_712) {
+		/* Verify correct mapping */
+		signature_val = cc_ioread(new_drvdata, new_drvdata->sig_offset);
+		if (signature_val != hw_rev->sig) {
+			dev_err(dev, "Invalid CC signature: SIGNATURE=0x%08X != expected=0x%08X\n",
+				signature_val, hw_rev->sig);
+			rc = -EINVAL;
+			goto post_clk_err;
+		}
+		dev_dbg(dev, "CC SIGNATURE=0x%08X\n", signature_val);
 	}
-	dev_dbg(dev, "CC SIGNATURE=0x%08X\n", signature_val);
 
 	/* Display HW versions */
 	dev_info(dev, "ARM CryptoCell %s Driver: HW version 0x%08X, Driver version %s\n",
diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h
index d608a4faf662..a06e5c90fb68 100644
--- a/drivers/crypto/ccree/cc_driver.h
+++ b/drivers/crypto/ccree/cc_driver.h
@@ -36,12 +36,13 @@
 extern bool cc_dump_desc;
 extern bool cc_dump_bytes;
 
-#define DRV_MODULE_VERSION "4.0"
+#define DRV_MODULE_VERSION "5.0"
 
 enum cc_hw_rev {
 	CC_HW_REV_630 = 630,
 	CC_HW_REV_710 = 710,
-	CC_HW_REV_712 = 712
+	CC_HW_REV_712 = 712,
+	CC_HW_REV_713 = 713
 };
 
 #define CC_COHERENT_CACHE_PARAMS 0xEEE
-- 
cgit v1.2.3-59-g8ed1b


From d422912a8f0da30c4c6b2f482e5cfc0b85cd3119 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Mon, 29 Oct 2018 09:50:13 +0000
Subject: dt-bindings: crypto: ccree: add ccree 713

Add device tree bindings associating Arm TrustZone CryptoCell 713 with the
ccree driver.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
index 999fb2a810f6..0ac06ec05272 100644
--- a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
+++ b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
@@ -1,8 +1,11 @@
 Arm TrustZone CryptoCell cryptographic engine
 
 Required properties:
-- compatible: Should be one of: "arm,cryptocell-712-ree",
-  "arm,cryptocell-710-ree" or "arm,cryptocell-630p-ree".
+- compatible: Should be one of -
+   "arm,cryptocell-713-ree"
+   "arm,cryptocell-712-ree"
+   "arm,cryptocell-710-ree"
+   "arm,cryptocell-630p-ree"
 - reg: Base physical address of the engine and length of memory mapped region.
 - interrupts: Interrupt number for the device.
 
-- 
cgit v1.2.3-59-g8ed1b


From 9b8d51f812ce5c8bfffe440391fe85f6e5349f07 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Mon, 29 Oct 2018 09:50:14 +0000
Subject: crypto: ccree - add SM4 support

Add support for SM4 cipher in CryptoCell 713.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/Kconfig                  |  1 +
 drivers/crypto/ccree/cc_cipher.c        | 66 +++++++++++++++++++++++++++++++++
 drivers/crypto/ccree/cc_hw_queue_defs.h |  3 ++
 3 files changed, 70 insertions(+)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 75d401c32a16..1de998d5bdb5 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -762,6 +762,7 @@ config CRYPTO_DEV_CCREE
 	select CRYPTO_ECB
 	select CRYPTO_CTR
 	select CRYPTO_XTS
+	select CRYPTO_SM4
 	help
 	  Say 'Y' to enable a driver for the REE interface of the Arm
 	  TrustZone CryptoCell family of processors. Currently the
diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c
index 7623b29911af..989e70f43bd9 100644
--- a/drivers/crypto/ccree/cc_cipher.c
+++ b/drivers/crypto/ccree/cc_cipher.c
@@ -7,6 +7,7 @@
 #include <crypto/internal/skcipher.h>
 #include <crypto/des.h>
 #include <crypto/xts.h>
+#include <crypto/sm4.h>
 #include <crypto/scatterwalk.h>
 
 #include "cc_driver.h"
@@ -83,6 +84,9 @@ static int validate_keys_sizes(struct cc_cipher_ctx *ctx_p, u32 size)
 		if (size == DES3_EDE_KEY_SIZE || size == DES_KEY_SIZE)
 			return 0;
 		break;
+	case S_DIN_to_SM4:
+		if (size == SM4_KEY_SIZE)
+			return 0;
 	default:
 		break;
 	}
@@ -122,6 +126,17 @@ static int validate_data_size(struct cc_cipher_ctx *ctx_p,
 		if (IS_ALIGNED(size, DES_BLOCK_SIZE))
 			return 0;
 		break;
+	case S_DIN_to_SM4:
+		switch (ctx_p->cipher_mode) {
+		case DRV_CIPHER_CTR:
+			return 0;
+		case DRV_CIPHER_ECB:
+		case DRV_CIPHER_CBC:
+			if (IS_ALIGNED(size, SM4_BLOCK_SIZE))
+				return 0;
+		default:
+			break;
+		}
 	default:
 		break;
 	}
@@ -522,6 +537,9 @@ static void cc_setup_cipher_data(struct crypto_tfm *tfm,
 	case S_DIN_to_DES:
 		flow_mode = DIN_DES_DOUT;
 		break;
+	case S_DIN_to_SM4:
+		flow_mode = DIN_SM4_DOUT;
+		break;
 	default:
 		dev_err(dev, "invalid flow mode, flow_mode = %d\n", flow_mode);
 		return;
@@ -1324,6 +1342,54 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
 	},
+	{
+		.name = "cbc(sm4)",
+		.driver_name = "cbc-sm4-ccree",
+		.blocksize = SM4_BLOCK_SIZE,
+		.template_skcipher = {
+			.setkey = cc_cipher_setkey,
+			.encrypt = cc_cipher_encrypt,
+			.decrypt = cc_cipher_decrypt,
+			.min_keysize = SM4_KEY_SIZE,
+			.max_keysize = SM4_KEY_SIZE,
+			.ivsize = SM4_BLOCK_SIZE,
+			},
+		.cipher_mode = DRV_CIPHER_CBC,
+		.flow_mode = S_DIN_to_SM4,
+		.min_hw_rev = CC_HW_REV_713,
+	},
+	{
+		.name = "ecb(sm4)",
+		.driver_name = "ecb-sm4-ccree",
+		.blocksize = SM4_BLOCK_SIZE,
+		.template_skcipher = {
+			.setkey = cc_cipher_setkey,
+			.encrypt = cc_cipher_encrypt,
+			.decrypt = cc_cipher_decrypt,
+			.min_keysize = SM4_KEY_SIZE,
+			.max_keysize = SM4_KEY_SIZE,
+			.ivsize = 0,
+			},
+		.cipher_mode = DRV_CIPHER_ECB,
+		.flow_mode = S_DIN_to_SM4,
+		.min_hw_rev = CC_HW_REV_713,
+	},
+	{
+		.name = "ctr(sm4)",
+		.driver_name = "ctr-sm4-ccree",
+		.blocksize = SM4_BLOCK_SIZE,
+		.template_skcipher = {
+			.setkey = cc_cipher_setkey,
+			.encrypt = cc_cipher_encrypt,
+			.decrypt = cc_cipher_decrypt,
+			.min_keysize = SM4_KEY_SIZE,
+			.max_keysize = SM4_KEY_SIZE,
+			.ivsize = SM4_BLOCK_SIZE,
+			},
+		.cipher_mode = DRV_CIPHER_CTR,
+		.flow_mode = S_DIN_to_SM4,
+		.min_hw_rev = CC_HW_REV_713,
+	},
 };
 
 static struct cc_crypto_alg *cc_create_alg(const struct cc_alg_template *tmpl,
diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h
index 45985b955d2c..f719ff9c0277 100644
--- a/drivers/crypto/ccree/cc_hw_queue_defs.h
+++ b/drivers/crypto/ccree/cc_hw_queue_defs.h
@@ -107,6 +107,7 @@ enum cc_flow_mode {
 	AES_to_AES_to_HASH_and_DOUT	= 13,
 	AES_to_AES_to_HASH	= 14,
 	AES_to_HASH_and_AES	= 15,
+	DIN_SM4_DOUT		= 16,
 	DIN_AES_AESMAC		= 17,
 	HASH_to_DOUT		= 18,
 	/* setup flows */
@@ -114,9 +115,11 @@ enum cc_flow_mode {
 	S_DIN_to_AES2		= 33,
 	S_DIN_to_DES		= 34,
 	S_DIN_to_RC4		= 35,
+	S_DIN_to_SM4		= 36,
 	S_DIN_to_HASH		= 37,
 	S_AES_to_DOUT		= 38,
 	S_AES2_to_DOUT		= 39,
+	S_SM4_to_DOUT		= 40,
 	S_RC4_to_DOUT		= 41,
 	S_DES_to_DOUT		= 42,
 	S_HASH_to_DOUT		= 43,
-- 
cgit v1.2.3-59-g8ed1b


From f1e52fd0fbd67fbf342932506cd7a6e27c090c5f Mon Sep 17 00:00:00 2001
From: Yael Chemla <yael.chemla@foss.arm.com>
Date: Thu, 18 Oct 2018 13:59:57 +0100
Subject: crypto: ccree - adjust hash length to suit certain context specifics

Adjust hash length such that it will not be fixed and general for all algs.
Instead make it suitable for certain context information.
This is preparation for SM3 support.

Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccree/cc_aead.c   | 19 ++++++++++++++-----
 drivers/crypto/ccree/cc_driver.c | 10 ++++++++--
 drivers/crypto/ccree/cc_driver.h |  2 +-
 drivers/crypto/ccree/cc_hash.c   | 40 ++++++++++++++++++++++++----------------
 4 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c
index 01b82b82f8b8..e0ac376ceb74 100644
--- a/drivers/crypto/ccree/cc_aead.c
+++ b/drivers/crypto/ccree/cc_aead.c
@@ -58,6 +58,7 @@ struct cc_aead_ctx {
 	unsigned int enc_keylen;
 	unsigned int auth_keylen;
 	unsigned int authsize; /* Actual (reduced?) size of the MAC/ICv */
+	unsigned int hash_len;
 	enum drv_cipher_mode cipher_mode;
 	enum cc_flow_mode flow_mode;
 	enum drv_hash_mode auth_mode;
@@ -122,6 +123,13 @@ static void cc_aead_exit(struct crypto_aead *tfm)
 	}
 }
 
+static unsigned int cc_get_aead_hash_len(struct crypto_aead *tfm)
+{
+	struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm);
+
+	return cc_get_default_hash_len(ctx->drvdata);
+}
+
 static int cc_aead_init(struct crypto_aead *tfm)
 {
 	struct aead_alg *alg = crypto_aead_alg(tfm);
@@ -196,6 +204,7 @@ static int cc_aead_init(struct crypto_aead *tfm)
 		ctx->auth_state.hmac.ipad_opad = NULL;
 		ctx->auth_state.hmac.padded_authkey = NULL;
 	}
+	ctx->hash_len = cc_get_aead_hash_len(tfm);
 
 	return 0;
 
@@ -327,7 +336,7 @@ static int hmac_setkey(struct cc_hw_desc *desc, struct cc_aead_ctx *ctx)
 		/* Load the hash current length*/
 		hw_desc_init(&desc[idx]);
 		set_cipher_mode(&desc[idx], hash_mode);
-		set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+		set_din_const(&desc[idx], 0, ctx->hash_len);
 		set_flow_mode(&desc[idx], S_DIN_to_HASH);
 		set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 		idx++;
@@ -465,7 +474,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *key,
 			/* Load the hash current length*/
 			hw_desc_init(&desc[idx]);
 			set_cipher_mode(&desc[idx], hashmode);
-			set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+			set_din_const(&desc[idx], 0, ctx->hash_len);
 			set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 			set_flow_mode(&desc[idx], S_DIN_to_HASH);
 			set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -1001,7 +1010,7 @@ static void cc_set_hmac_desc(struct aead_request *req, struct cc_hw_desc desc[],
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], hash_mode);
 	set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, hash_mode),
-		     ctx->drvdata->hash_len_sz);
+		     ctx->hash_len);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 	idx++;
@@ -1098,7 +1107,7 @@ static void cc_proc_scheme_desc(struct aead_request *req,
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], hash_mode);
 	set_dout_sram(&desc[idx], aead_handle->sram_workspace_addr,
-		      ctx->drvdata->hash_len_sz);
+		      ctx->hash_len);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
 	set_cipher_do(&desc[idx], DO_PAD);
@@ -1128,7 +1137,7 @@ static void cc_proc_scheme_desc(struct aead_request *req,
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], hash_mode);
 	set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, hash_mode),
-		     ctx->drvdata->hash_len_sz);
+		     ctx->hash_len);
 	set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c
index 630c598af627..09a6ada0b547 100644
--- a/drivers/crypto/ccree/cc_driver.c
+++ b/drivers/crypto/ccree/cc_driver.c
@@ -211,12 +211,10 @@ static int init_cc_resources(struct platform_device *plat_dev)
 	new_drvdata->hw_rev = hw_rev->rev;
 
 	if (hw_rev->rev >= CC_HW_REV_712) {
-		new_drvdata->hash_len_sz = HASH_LEN_SIZE_712;
 		new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP);
 		new_drvdata->sig_offset = CC_REG(HOST_SIGNATURE_712);
 		new_drvdata->ver_offset = CC_REG(HOST_VERSION_712);
 	} else {
-		new_drvdata->hash_len_sz = HASH_LEN_SIZE_630;
 		new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP8);
 		new_drvdata->sig_offset = CC_REG(HOST_SIGNATURE_630);
 		new_drvdata->ver_offset = CC_REG(HOST_VERSION_630);
@@ -468,6 +466,14 @@ int cc_clk_on(struct cc_drvdata *drvdata)
 	return 0;
 }
 
+unsigned int cc_get_default_hash_len(struct cc_drvdata *drvdata)
+{
+	if (drvdata->hw_rev >= CC_HW_REV_712)
+		return HASH_LEN_SIZE_712;
+	else
+		return HASH_LEN_SIZE_630;
+}
+
 void cc_clk_off(struct cc_drvdata *drvdata)
 {
 	struct clk *clk = drvdata->clk;
diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h
index a06e5c90fb68..170790967854 100644
--- a/drivers/crypto/ccree/cc_driver.h
+++ b/drivers/crypto/ccree/cc_driver.h
@@ -128,7 +128,6 @@ struct cc_drvdata {
 	bool coherent;
 	char *hw_rev_name;
 	enum cc_hw_rev hw_rev;
-	u32 hash_len_sz;
 	u32 axim_mon_offset;
 	u32 sig_offset;
 	u32 ver_offset;
@@ -183,6 +182,7 @@ int init_cc_regs(struct cc_drvdata *drvdata, bool is_probe);
 void fini_cc_regs(struct cc_drvdata *drvdata);
 int cc_clk_on(struct cc_drvdata *drvdata);
 void cc_clk_off(struct cc_drvdata *drvdata);
+unsigned int cc_get_default_hash_len(struct cc_drvdata *drvdata);
 
 static inline void cc_iowrite(struct cc_drvdata *drvdata, u32 reg, u32 val)
 {
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index b9313306c36f..7af5b61f3f66 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -82,6 +82,7 @@ struct cc_hash_ctx {
 	int hash_mode;
 	int hw_mode;
 	int inter_digestsize;
+	unsigned int hash_len;
 	struct completion setkey_comp;
 	bool is_hmac;
 };
@@ -138,10 +139,10 @@ static void cc_init_req(struct device *dev, struct ahash_req_ctx *state,
 			    ctx->hash_mode == DRV_HASH_SHA384)
 				memcpy(state->digest_bytes_len,
 				       digest_len_sha512_init,
-				       ctx->drvdata->hash_len_sz);
+				       ctx->hash_len);
 			else
 				memcpy(state->digest_bytes_len, digest_len_init,
-				       ctx->drvdata->hash_len_sz);
+				       ctx->hash_len);
 		}
 
 		if (ctx->hash_mode != DRV_HASH_NULL) {
@@ -367,7 +368,7 @@ static int cc_fin_hmac(struct cc_hw_desc *desc, struct ahash_request *req,
 	set_cipher_mode(&desc[idx], ctx->hw_mode);
 	set_din_sram(&desc[idx],
 		     cc_digest_len_addr(ctx->drvdata, ctx->hash_mode),
-		     ctx->drvdata->hash_len_sz);
+		     ctx->hash_len);
 	set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -459,9 +460,9 @@ static int cc_hash_digest(struct ahash_request *req)
 	if (is_hmac) {
 		set_din_type(&desc[idx], DMA_DLLI,
 			     state->digest_bytes_len_dma_addr,
-			     ctx->drvdata->hash_len_sz, NS_BIT);
+			     ctx->hash_len, NS_BIT);
 	} else {
-		set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+		set_din_const(&desc[idx], 0, ctx->hash_len);
 		if (nbytes)
 			set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 		else
@@ -478,7 +479,7 @@ static int cc_hash_digest(struct ahash_request *req)
 		hw_desc_init(&desc[idx]);
 		set_cipher_mode(&desc[idx], ctx->hw_mode);
 		set_dout_dlli(&desc[idx], state->digest_buff_dma_addr,
-			      ctx->drvdata->hash_len_sz, NS_BIT, 0);
+			      ctx->hash_len, NS_BIT, 0);
 		set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 		set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
 		set_cipher_do(&desc[idx], DO_PAD);
@@ -516,7 +517,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx,
 	set_cipher_mode(&desc[idx], ctx->hw_mode);
 	set_cipher_config1(&desc[idx], HASH_PADDING_DISABLED);
 	set_din_type(&desc[idx], DMA_DLLI, state->digest_bytes_len_dma_addr,
-		     ctx->drvdata->hash_len_sz, NS_BIT);
+		     ctx->hash_len, NS_BIT);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 	idx++;
@@ -587,7 +588,7 @@ static int cc_hash_update(struct ahash_request *req)
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], ctx->hw_mode);
 	set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr,
-		      ctx->drvdata->hash_len_sz, NS_BIT, 1);
+		      ctx->hash_len, NS_BIT, 1);
 	set_queue_last_ind(ctx->drvdata, &desc[idx]);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
@@ -651,7 +652,7 @@ static int cc_do_finup(struct ahash_request *req, bool update)
 	set_cipher_do(&desc[idx], DO_PAD);
 	set_cipher_mode(&desc[idx], ctx->hw_mode);
 	set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr,
-		      ctx->drvdata->hash_len_sz, NS_BIT, 0);
+		      ctx->hash_len, NS_BIT, 0);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 	idx++;
@@ -749,7 +750,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key,
 			/* Load the hash current length*/
 			hw_desc_init(&desc[idx]);
 			set_cipher_mode(&desc[idx], ctx->hw_mode);
-			set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+			set_din_const(&desc[idx], 0, ctx->hash_len);
 			set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 			set_flow_mode(&desc[idx], S_DIN_to_HASH);
 			set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -831,7 +832,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key,
 		/* Load the hash current length*/
 		hw_desc_init(&desc[idx]);
 		set_cipher_mode(&desc[idx], ctx->hw_mode);
-		set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+		set_din_const(&desc[idx], 0, ctx->hash_len);
 		set_flow_mode(&desc[idx], S_DIN_to_HASH);
 		set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 		idx++;
@@ -1069,6 +1070,13 @@ fail:
 	return -ENOMEM;
 }
 
+static int cc_get_hash_len(struct crypto_tfm *tfm)
+{
+	struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	return cc_get_default_hash_len(ctx->drvdata);
+}
+
 static int cc_cra_init(struct crypto_tfm *tfm)
 {
 	struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -1086,7 +1094,7 @@ static int cc_cra_init(struct crypto_tfm *tfm)
 	ctx->hw_mode = cc_alg->hw_mode;
 	ctx->inter_digestsize = cc_alg->inter_digestsize;
 	ctx->drvdata = cc_alg->drvdata;
-
+	ctx->hash_len = cc_get_hash_len(tfm);
 	return cc_alloc_ctx(ctx);
 }
 
@@ -1465,8 +1473,8 @@ static int cc_hash_export(struct ahash_request *req, void *out)
 	memcpy(out, state->digest_buff, ctx->inter_digestsize);
 	out += ctx->inter_digestsize;
 
-	memcpy(out, state->digest_bytes_len, ctx->drvdata->hash_len_sz);
-	out += ctx->drvdata->hash_len_sz;
+	memcpy(out, state->digest_bytes_len, ctx->hash_len);
+	out += ctx->hash_len;
 
 	memcpy(out, &curr_buff_cnt, sizeof(u32));
 	out += sizeof(u32);
@@ -1494,8 +1502,8 @@ static int cc_hash_import(struct ahash_request *req, const void *in)
 	memcpy(state->digest_buff, in, ctx->inter_digestsize);
 	in += ctx->inter_digestsize;
 
-	memcpy(state->digest_bytes_len, in, ctx->drvdata->hash_len_sz);
-	in += ctx->drvdata->hash_len_sz;
+	memcpy(state->digest_bytes_len, in, ctx->hash_len);
+	in += ctx->hash_len;
 
 	/* Sanity check the data as much as possible */
 	memcpy(&tmp, in, sizeof(u32));
-- 
cgit v1.2.3-59-g8ed1b


From 18a1dc1fd56b4a5b468ad5bf984a2b0ea5ee8c69 Mon Sep 17 00:00:00 2001
From: Yael Chemla <yael.chemla@foss.arm.com>
Date: Thu, 18 Oct 2018 13:59:58 +0100
Subject: crypto: ccree - modify set_cipher_mode usage from cc_hash

encapsulate set_cipher_mode call with another api,
preparation for specific hash behavior as needed in later patches
when SM3 introduced.

Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ccree/cc_hash.c          | 18 +++++++++---------
 drivers/crypto/ccree/cc_hw_queue_defs.h | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index 7af5b61f3f66..adcd9df2046e 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -322,7 +322,7 @@ static int cc_fin_result(struct cc_hw_desc *desc, struct ahash_request *req,
 
 	/* Get final MAC result */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	/* TODO */
 	set_dout_dlli(&desc[idx], state->digest_result_dma_addr, digestsize,
 		      NS_BIT, 1);
@@ -441,7 +441,7 @@ static int cc_hash_digest(struct ahash_request *req)
 	 * digest
 	 */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	if (is_hmac) {
 		set_din_type(&desc[idx], DMA_DLLI, state->digest_buff_dma_addr,
 			     ctx->inter_digestsize, NS_BIT);
@@ -455,7 +455,7 @@ static int cc_hash_digest(struct ahash_request *req)
 
 	/* Load the hash current length */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 
 	if (is_hmac) {
 		set_din_type(&desc[idx], DMA_DLLI,
@@ -505,7 +505,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx,
 {
 	/* Restore hash digest */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_din_type(&desc[idx], DMA_DLLI, state->digest_buff_dma_addr,
 		     ctx->inter_digestsize, NS_BIT);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
@@ -514,7 +514,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx,
 
 	/* Restore hash current length */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_cipher_config1(&desc[idx], HASH_PADDING_DISABLED);
 	set_din_type(&desc[idx], DMA_DLLI, state->digest_bytes_len_dma_addr,
 		     ctx->hash_len, NS_BIT);
@@ -577,7 +577,7 @@ static int cc_hash_update(struct ahash_request *req)
 
 	/* store the hash digest result in context */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_dout_dlli(&desc[idx], state->digest_buff_dma_addr,
 		      ctx->inter_digestsize, NS_BIT, 0);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
@@ -586,7 +586,7 @@ static int cc_hash_update(struct ahash_request *req)
 
 	/* store current hash length in context */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr,
 		      ctx->hash_len, NS_BIT, 1);
 	set_queue_last_ind(ctx->drvdata, &desc[idx]);
@@ -650,7 +650,7 @@ static int cc_do_finup(struct ahash_request *req, bool update)
 	/* Pad the hash */
 	hw_desc_init(&desc[idx]);
 	set_cipher_do(&desc[idx], DO_PAD);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr,
 		      ctx->hash_len, NS_BIT, 0);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
@@ -2035,7 +2035,7 @@ static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[],
 					    XCBC_MAC_K1_OFFSET),
 		     CC_AES_128_BIT_KEY_SIZE, NS_BIT);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
-	set_cipher_mode(&desc[idx], DRV_CIPHER_XCBC_MAC);
+	set_hash_cipher_mode(&desc[idx], DRV_CIPHER_XCBC_MAC, ctx->hash_mode);
 	set_cipher_config0(&desc[idx], DESC_DIRECTION_ENCRYPT_ENCRYPT);
 	set_key_size_aes(&desc[idx], CC_AES_128_BIT_KEY_SIZE);
 	set_flow_mode(&desc[idx], S_DIN_to_AES);
diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h
index f719ff9c0277..6e8438a71478 100644
--- a/drivers/crypto/ccree/cc_hw_queue_defs.h
+++ b/drivers/crypto/ccree/cc_hw_queue_defs.h
@@ -457,6 +457,20 @@ static inline void set_cipher_mode(struct cc_hw_desc *pdesc, int mode)
 	pdesc->word[4] |= FIELD_PREP(WORD4_CIPHER_MODE, mode);
 }
 
+/*
+ * Set the cipher mode for hash algorithms.
+ *
+ * @pdesc: pointer HW descriptor struct
+ * @cipher_mode:  Any one of the modes defined in [CC7x-DESC]
+ * @hash_mode: specifies which hash is being handled
+ */
+static inline void set_hash_cipher_mode(struct cc_hw_desc *pdesc,
+					enum drv_cipher_mode cipher_mode,
+					enum drv_hash_mode hash_mode)
+{
+	set_cipher_mode(pdesc, cipher_mode);
+}
+
 /*
  * Set the cipher configuration fields.
  *
-- 
cgit v1.2.3-59-g8ed1b


From 927574e0e85da61f84dcda15d5b6a2baa06cda46 Mon Sep 17 00:00:00 2001
From: Yael Chemla <yael.chemla@foss.arm.com>
Date: Thu, 18 Oct 2018 13:59:59 +0100
Subject: crypto: ccree - add SM3 support

Add support for SM3 cipher in CryptoCell 713.

Signed-off-by: Yael Chemla <yael.chemla@foss.arm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/Kconfig                  |   1 +
 drivers/crypto/ccree/cc_crypto_ctx.h    |   4 +-
 drivers/crypto/ccree/cc_hash.c          | 119 ++++++++++++++++++++++++++------
 drivers/crypto/ccree/cc_hw_queue_defs.h |  13 ++++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 1de998d5bdb5..9f1baaec385f 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -763,6 +763,7 @@ config CRYPTO_DEV_CCREE
 	select CRYPTO_CTR
 	select CRYPTO_XTS
 	select CRYPTO_SM4
+	select CRYPTO_SM3
 	help
 	  Say 'Y' to enable a driver for the REE interface of the Arm
 	  TrustZone CryptoCell family of processors. Currently the
diff --git a/drivers/crypto/ccree/cc_crypto_ctx.h b/drivers/crypto/ccree/cc_crypto_ctx.h
index e032544f4e31..c8dac273c563 100644
--- a/drivers/crypto/ccree/cc_crypto_ctx.h
+++ b/drivers/crypto/ccree/cc_crypto_ctx.h
@@ -115,7 +115,8 @@ enum drv_hash_mode {
 	DRV_HASH_CBC_MAC = 6,
 	DRV_HASH_XCBC_MAC = 7,
 	DRV_HASH_CMAC = 8,
-	DRV_HASH_MODE_NUM = 9,
+	DRV_HASH_SM3 = 9,
+	DRV_HASH_MODE_NUM = 10,
 	DRV_HASH_RESERVE32B = S32_MAX
 };
 
@@ -127,6 +128,7 @@ enum drv_hash_hw_mode {
 	DRV_HASH_HW_SHA512 = 4,
 	DRV_HASH_HW_SHA384 = 12,
 	DRV_HASH_HW_GHASH = 6,
+	DRV_HASH_HW_SM3 = 14,
 	DRV_HASH_HW_RESERVE32B = S32_MAX
 };
 
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index adcd9df2046e..c80c9ae555d0 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -6,6 +6,7 @@
 #include <crypto/algapi.h>
 #include <crypto/hash.h>
 #include <crypto/md5.h>
+#include <crypto/sm3.h>
 #include <crypto/internal/hash.h>
 
 #include "cc_driver.h"
@@ -16,6 +17,7 @@
 
 #define CC_MAX_HASH_SEQ_LEN 12
 #define CC_MAX_OPAD_KEYS_SIZE CC_MAX_HASH_BLCK_SIZE
+#define CC_SM3_HASH_LEN_SIZE 8
 
 struct cc_hash_handle {
 	cc_sram_addr_t digest_len_sram_addr; /* const value in SRAM*/
@@ -43,6 +45,9 @@ static u64 sha384_init[] = {
 static u64 sha512_init[] = {
 	SHA512_H7, SHA512_H6, SHA512_H5, SHA512_H4,
 	SHA512_H3, SHA512_H2, SHA512_H1, SHA512_H0 };
+static const u32 sm3_init[] = {
+	SM3_IVH, SM3_IVG, SM3_IVF, SM3_IVE,
+	SM3_IVD, SM3_IVC, SM3_IVB, SM3_IVA };
 
 static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[],
 			  unsigned int *seq_size);
@@ -1074,7 +1079,10 @@ static int cc_get_hash_len(struct crypto_tfm *tfm)
 {
 	struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	return cc_get_default_hash_len(ctx->drvdata);
+	if (ctx->hash_mode == DRV_HASH_SM3)
+		return CC_SM3_HASH_LEN_SIZE;
+	else
+		return cc_get_default_hash_len(ctx->drvdata);
 }
 
 static int cc_cra_init(struct crypto_tfm *tfm)
@@ -1523,6 +1531,7 @@ struct cc_hash_template {
 	char mac_name[CRYPTO_MAX_ALG_NAME];
 	char mac_driver_name[CRYPTO_MAX_ALG_NAME];
 	unsigned int blocksize;
+	bool is_mac;
 	bool synchronize;
 	struct ahash_alg template_ahash;
 	int hash_mode;
@@ -1544,6 +1553,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha1)",
 		.mac_driver_name = "hmac-sha1-ccree",
 		.blocksize = SHA1_BLOCK_SIZE,
+		.is_mac = true,
 		.synchronize = false,
 		.template_ahash = {
 			.init = cc_hash_init,
@@ -1570,6 +1580,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha256)",
 		.mac_driver_name = "hmac-sha256-ccree",
 		.blocksize = SHA256_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1595,6 +1606,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha224)",
 		.mac_driver_name = "hmac-sha224-ccree",
 		.blocksize = SHA224_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1620,6 +1632,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha384)",
 		.mac_driver_name = "hmac-sha384-ccree",
 		.blocksize = SHA384_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1645,6 +1658,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha512)",
 		.mac_driver_name = "hmac-sha512-ccree",
 		.blocksize = SHA512_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1670,6 +1684,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(md5)",
 		.mac_driver_name = "hmac-md5-ccree",
 		.blocksize = MD5_HMAC_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1689,10 +1704,35 @@ static struct cc_hash_template driver_hash[] = {
 		.inter_digestsize = MD5_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
 	},
+	{
+		.name = "sm3",
+		.driver_name = "sm3-ccree",
+		.blocksize = SM3_BLOCK_SIZE,
+		.is_mac = false,
+		.template_ahash = {
+			.init = cc_hash_init,
+			.update = cc_hash_update,
+			.final = cc_hash_final,
+			.finup = cc_hash_finup,
+			.digest = cc_hash_digest,
+			.export = cc_hash_export,
+			.import = cc_hash_import,
+			.setkey = cc_hash_setkey,
+			.halg = {
+				.digestsize = SM3_DIGEST_SIZE,
+				.statesize = CC_STATE_SIZE(SM3_DIGEST_SIZE),
+			},
+		},
+		.hash_mode = DRV_HASH_SM3,
+		.hw_mode = DRV_HASH_HW_SM3,
+		.inter_digestsize = SM3_DIGEST_SIZE,
+		.min_hw_rev = CC_HW_REV_713,
+	},
 	{
 		.mac_name = "xcbc(aes)",
 		.mac_driver_name = "xcbc-aes-ccree",
 		.blocksize = AES_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_mac_update,
@@ -1716,6 +1756,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "cmac(aes)",
 		.mac_driver_name = "cmac-aes-ccree",
 		.blocksize = AES_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_mac_update,
@@ -1788,6 +1829,7 @@ int cc_init_hash_sram(struct cc_drvdata *drvdata)
 	unsigned int larval_seq_len = 0;
 	struct cc_hw_desc larval_seq[CC_DIGEST_SIZE_MAX / sizeof(u32)];
 	bool large_sha_supported = (drvdata->hw_rev >= CC_HW_REV_712);
+	bool sm3_supported = (drvdata->hw_rev >= CC_HW_REV_713);
 	int rc = 0;
 
 	/* Copy-to-sram digest-len */
@@ -1853,6 +1895,17 @@ int cc_init_hash_sram(struct cc_drvdata *drvdata)
 	sram_buff_ofs += sizeof(sha256_init);
 	larval_seq_len = 0;
 
+	if (sm3_supported) {
+		cc_set_sram_desc(sm3_init, sram_buff_ofs,
+				 ARRAY_SIZE(sm3_init), larval_seq,
+				 &larval_seq_len);
+		rc = send_request_init(drvdata, larval_seq, larval_seq_len);
+		if (rc)
+			goto init_digest_const_err;
+		sram_buff_ofs += sizeof(sm3_init);
+		larval_seq_len = 0;
+	}
+
 	if (large_sha_supported) {
 		cc_set_sram_desc((u32 *)sha384_init, sram_buff_ofs,
 				 (ARRAY_SIZE(sha384_init) * 2), larval_seq,
@@ -1919,6 +1972,9 @@ int cc_hash_alloc(struct cc_drvdata *drvdata)
 			sizeof(sha224_init) +
 			sizeof(sha256_init);
 
+	if (drvdata->hw_rev >= CC_HW_REV_713)
+		sram_size_to_alloc += sizeof(sm3_init);
+
 	if (drvdata->hw_rev >= CC_HW_REV_712)
 		sram_size_to_alloc += sizeof(digest_len_sha512_init) +
 			sizeof(sha384_init) + sizeof(sha512_init);
@@ -1948,27 +2004,28 @@ int cc_hash_alloc(struct cc_drvdata *drvdata)
 		/* We either support both HASH and MAC or none */
 		if (driver_hash[alg].min_hw_rev > drvdata->hw_rev)
 			continue;
-
-		/* register hmac version */
-		t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true);
-		if (IS_ERR(t_alg)) {
-			rc = PTR_ERR(t_alg);
-			dev_err(dev, "%s alg allocation failed\n",
-				driver_hash[alg].driver_name);
-			goto fail;
-		}
-		t_alg->drvdata = drvdata;
-
-		rc = crypto_register_ahash(&t_alg->ahash_alg);
-		if (rc) {
-			dev_err(dev, "%s alg registration failed\n",
-				driver_hash[alg].driver_name);
-			kfree(t_alg);
-			goto fail;
-		} else {
-			list_add_tail(&t_alg->entry, &hash_handle->hash_list);
+		if (driver_hash[alg].is_mac) {
+			/* register hmac version */
+			t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true);
+			if (IS_ERR(t_alg)) {
+				rc = PTR_ERR(t_alg);
+				dev_err(dev, "%s alg allocation failed\n",
+					driver_hash[alg].driver_name);
+				goto fail;
+			}
+			t_alg->drvdata = drvdata;
+
+			rc = crypto_register_ahash(&t_alg->ahash_alg);
+			if (rc) {
+				dev_err(dev, "%s alg registration failed\n",
+					driver_hash[alg].driver_name);
+				kfree(t_alg);
+				goto fail;
+			} else {
+				list_add_tail(&t_alg->entry,
+					      &hash_handle->hash_list);
+			}
 		}
-
 		if (hw_mode == DRV_CIPHER_XCBC_MAC ||
 		    hw_mode == DRV_CIPHER_CMAC)
 			continue;
@@ -2170,6 +2227,8 @@ static const void *cc_larval_digest(struct device *dev, u32 mode)
 		return sha384_init;
 	case DRV_HASH_SHA512:
 		return sha512_init;
+	case DRV_HASH_SM3:
+		return sm3_init;
 	default:
 		dev_err(dev, "Invalid hash mode (%d)\n", mode);
 		return md5_init;
@@ -2190,6 +2249,8 @@ cc_sram_addr_t cc_larval_digest_addr(void *drvdata, u32 mode)
 	struct cc_drvdata *_drvdata = (struct cc_drvdata *)drvdata;
 	struct cc_hash_handle *hash_handle = _drvdata->hash_handle;
 	struct device *dev = drvdata_to_dev(_drvdata);
+	bool sm3_supported = (_drvdata->hw_rev >= CC_HW_REV_713);
+	cc_sram_addr_t addr;
 
 	switch (mode) {
 	case DRV_HASH_NULL:
@@ -2208,19 +2269,31 @@ cc_sram_addr_t cc_larval_digest_addr(void *drvdata, u32 mode)
 			sizeof(md5_init) +
 			sizeof(sha1_init) +
 			sizeof(sha224_init));
-	case DRV_HASH_SHA384:
+	case DRV_HASH_SM3:
 		return (hash_handle->larval_digest_sram_addr +
 			sizeof(md5_init) +
 			sizeof(sha1_init) +
 			sizeof(sha224_init) +
 			sizeof(sha256_init));
+	case DRV_HASH_SHA384:
+		addr = (hash_handle->larval_digest_sram_addr +
+			sizeof(md5_init) +
+			sizeof(sha1_init) +
+			sizeof(sha224_init) +
+			sizeof(sha256_init));
+		if (sm3_supported)
+			addr += sizeof(sm3_init);
+		return addr;
 	case DRV_HASH_SHA512:
-		return (hash_handle->larval_digest_sram_addr +
+		addr = (hash_handle->larval_digest_sram_addr +
 			sizeof(md5_init) +
 			sizeof(sha1_init) +
 			sizeof(sha224_init) +
 			sizeof(sha256_init) +
 			sizeof(sha384_init));
+		if (sm3_supported)
+			addr += sizeof(sm3_init);
+		return addr;
 	default:
 		dev_err(dev, "Invalid hash mode (%d)\n", mode);
 	}
diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h
index 6e8438a71478..7a9b90db7db7 100644
--- a/drivers/crypto/ccree/cc_hw_queue_defs.h
+++ b/drivers/crypto/ccree/cc_hw_queue_defs.h
@@ -42,6 +42,7 @@
 #define WORD3_QUEUE_LAST_IND	CC_GENMASK(3, QUEUE_LAST_IND)
 #define WORD4_ACK_NEEDED	CC_GENMASK(4, ACK_NEEDED)
 #define WORD4_AES_SEL_N_HASH	CC_GENMASK(4, AES_SEL_N_HASH)
+#define WORD4_AES_XOR_CRYPTO_KEY CC_GENMASK(4, AES_XOR_CRYPTO_KEY)
 #define WORD4_BYTES_SWAP	CC_GENMASK(4, BYTES_SWAP)
 #define WORD4_CIPHER_CONF0	CC_GENMASK(4, CIPHER_CONF0)
 #define WORD4_CIPHER_CONF1	CC_GENMASK(4, CIPHER_CONF1)
@@ -396,6 +397,16 @@ static inline void set_aes_not_hash_mode(struct cc_hw_desc *pdesc)
 	pdesc->word[4] |= FIELD_PREP(WORD4_AES_SEL_N_HASH, 1);
 }
 
+/*
+ * Set aes xor crypto key, this in some secenrios select SM3 engine
+ *
+ * @pdesc: pointer HW descriptor struct
+ */
+static inline void set_aes_xor_crypto_key(struct cc_hw_desc *pdesc)
+{
+	pdesc->word[4] |= FIELD_PREP(WORD4_AES_XOR_CRYPTO_KEY, 1);
+}
+
 /*
  * Set the DOUT field of a HW descriptors to SRAM mode
  * Note: No need to check SRAM alignment since host requests do not use SRAM and
@@ -469,6 +480,8 @@ static inline void set_hash_cipher_mode(struct cc_hw_desc *pdesc,
 					enum drv_hash_mode hash_mode)
 {
 	set_cipher_mode(pdesc, cipher_mode);
+	if (hash_mode == DRV_HASH_SM3)
+		set_aes_xor_crypto_key(pdesc);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 7172122be6a4712d699da4d261f92aa5ab3a78b8 Mon Sep 17 00:00:00 2001
From: Wenwen Wang <wang6495@umn.edu>
Date: Thu, 18 Oct 2018 19:50:43 -0500
Subject: crypto: cavium/nitrox - fix a DMA pool free failure

In crypto_alloc_context(), a DMA pool is allocated through dma_pool_alloc()
to hold the crypto context. The meta data of the DMA pool, including the
pool used for the allocation 'ndev->ctx_pool' and the base address of the
DMA pool used by the device 'dma', are then stored to the beginning of the
pool. These meta data are eventually used in crypto_free_context() to free
the DMA pool through dma_pool_free(). However, given that the DMA pool can
also be accessed by the device, a malicious device can modify these meta
data, especially when the device is controlled to deploy an attack. This
can cause an unexpected DMA pool free failure.

To avoid the above issue, this patch introduces a new structure
crypto_ctx_hdr and a new field chdr in the structure nitrox_crypto_ctx hold
the meta data information of the DMA pool after the allocation. Note that
the original structure ctx_hdr is not changed to ensure the compatibility.

Cc: <stable@vger.kernel.org>
Signed-off-by: Wenwen Wang <wang6495@umn.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/nitrox_algs.c | 12 +++++++-----
 drivers/crypto/cavium/nitrox/nitrox_lib.c  | 22 +++++++++++++++++-----
 drivers/crypto/cavium/nitrox/nitrox_req.h  |  7 +++++++
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c
index 2ae6124e5da6..5d54ebc20cb3 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_algs.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c
@@ -73,7 +73,7 @@ static int flexi_aes_keylen(int keylen)
 static int nitrox_skcipher_init(struct crypto_skcipher *tfm)
 {
 	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
-	void *fctx;
+	struct crypto_ctx_hdr *chdr;
 
 	/* get the first device */
 	nctx->ndev = nitrox_get_first_device();
@@ -81,12 +81,14 @@ static int nitrox_skcipher_init(struct crypto_skcipher *tfm)
 		return -ENODEV;
 
 	/* allocate nitrox crypto context */
-	fctx = crypto_alloc_context(nctx->ndev);
-	if (!fctx) {
+	chdr = crypto_alloc_context(nctx->ndev);
+	if (!chdr) {
 		nitrox_put_device(nctx->ndev);
 		return -ENOMEM;
 	}
-	nctx->u.ctx_handle = (uintptr_t)fctx;
+	nctx->chdr = chdr;
+	nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr +
+					 sizeof(struct ctx_hdr));
 	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) +
 				    sizeof(struct nitrox_kcrypt_request));
 	return 0;
@@ -102,7 +104,7 @@ static void nitrox_skcipher_exit(struct crypto_skcipher *tfm)
 
 		memset(&fctx->crypto, 0, sizeof(struct crypto_keys));
 		memset(&fctx->auth, 0, sizeof(struct auth_keys));
-		crypto_free_context((void *)fctx);
+		crypto_free_context((void *)nctx->chdr);
 	}
 	nitrox_put_device(nctx->ndev);
 
diff --git a/drivers/crypto/cavium/nitrox/nitrox_lib.c b/drivers/crypto/cavium/nitrox/nitrox_lib.c
index 2260efa42308..9138bae12521 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_lib.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_lib.c
@@ -158,12 +158,19 @@ static void destroy_crypto_dma_pool(struct nitrox_device *ndev)
 void *crypto_alloc_context(struct nitrox_device *ndev)
 {
 	struct ctx_hdr *ctx;
+	struct crypto_ctx_hdr *chdr;
 	void *vaddr;
 	dma_addr_t dma;
 
+	chdr = kmalloc(sizeof(*chdr), GFP_KERNEL);
+	if (!chdr)
+		return NULL;
+
 	vaddr = dma_pool_zalloc(ndev->ctx_pool, GFP_KERNEL, &dma);
-	if (!vaddr)
+	if (!vaddr) {
+		kfree(chdr);
 		return NULL;
+	}
 
 	/* fill meta data */
 	ctx = vaddr;
@@ -171,7 +178,11 @@ void *crypto_alloc_context(struct nitrox_device *ndev)
 	ctx->dma = dma;
 	ctx->ctx_dma = dma + sizeof(struct ctx_hdr);
 
-	return ((u8 *)vaddr + sizeof(struct ctx_hdr));
+	chdr->pool = ndev->ctx_pool;
+	chdr->dma = dma;
+	chdr->vaddr = vaddr;
+
+	return chdr;
 }
 
 /**
@@ -180,13 +191,14 @@ void *crypto_alloc_context(struct nitrox_device *ndev)
  */
 void crypto_free_context(void *ctx)
 {
-	struct ctx_hdr *ctxp;
+	struct crypto_ctx_hdr *ctxp;
 
 	if (!ctx)
 		return;
 
-	ctxp = (struct ctx_hdr *)((u8 *)ctx - sizeof(struct ctx_hdr));
-	dma_pool_free(ctxp->pool, ctxp, ctxp->dma);
+	ctxp = ctx;
+	dma_pool_free(ctxp->pool, ctxp->vaddr, ctxp->dma);
+	kfree(ctxp);
 }
 
 /**
diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h
index d091b6f5f5dd..19f0a20e3bb3 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_req.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_req.h
@@ -181,12 +181,19 @@ struct flexi_crypto_context {
 	struct auth_keys auth;
 };
 
+struct crypto_ctx_hdr {
+	struct dma_pool *pool;
+	dma_addr_t dma;
+	void *vaddr;
+};
+
 struct nitrox_crypto_ctx {
 	struct nitrox_device *ndev;
 	union {
 		u64 ctx_handle;
 		struct flexi_crypto_context *fctx;
 	} u;
+	struct crypto_ctx_hdr *chdr;
 };
 
 struct nitrox_kcrypt_request {
-- 
cgit v1.2.3-59-g8ed1b


From fa4600734b74f74d9169c3015946d4722f8bcf79 Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Sat, 20 Oct 2018 02:01:52 +0300
Subject: crypto: cfb - fix decryption

crypto_cfb_decrypt_segment() incorrectly XOR'ed generated keystream with
IV, rather than with data stream, resulting in incorrect decryption.
Test vectors will be added in the next patch.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/cfb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/cfb.c b/crypto/cfb.c
index a0d68c09e1b9..fd4e8500e121 100644
--- a/crypto/cfb.c
+++ b/crypto/cfb.c
@@ -144,7 +144,7 @@ static int crypto_cfb_decrypt_segment(struct skcipher_walk *walk,
 
 	do {
 		crypto_cfb_encrypt_one(tfm, iv, dst);
-		crypto_xor(dst, iv, bsize);
+		crypto_xor(dst, src, bsize);
 		iv = src;
 
 		src += bsize;
-- 
cgit v1.2.3-59-g8ed1b


From 7da66670775d201f633577f5b15a4bbeebaaa2b0 Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Sat, 20 Oct 2018 02:01:53 +0300
Subject: crypto: testmgr - add AES-CFB tests

Add AES128/192/256-CFB testvectors from NIST SP800-38A.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c  |  5 ++++
 crypto/testmgr.c |  7 ++++++
 crypto/testmgr.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index c20c9f5c18f2..1026173d721a 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1736,6 +1736,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("ctr(aes)");
 		ret += tcrypt_test("rfc3686(ctr(aes))");
 		ret += tcrypt_test("ofb(aes)");
+		ret += tcrypt_test("cfb(aes)");
 		break;
 
 	case 11:
@@ -2060,6 +2061,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 				speed_template_16_24_32);
 		test_cipher_speed("ctr(aes)", DECRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
+		test_cipher_speed("cfb(aes)", ENCRYPT, sec, NULL, 0,
+				speed_template_16_24_32);
+		test_cipher_speed("cfb(aes)", DECRYPT, sec, NULL, 0,
+				speed_template_16_24_32);
 		break;
 
 	case 201:
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index b1f79c6bf409..84937ceb4bd8 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2690,6 +2690,13 @@ static const struct alg_test_desc alg_test_descs[] = {
 				.dec = __VECS(aes_ccm_dec_tv_template)
 			}
 		}
+	}, {
+		.alg = "cfb(aes)",
+		.test = alg_test_skcipher,
+		.fips_allowed = 1,
+		.suite = {
+			.cipher = __VECS(aes_cfb_tv_template)
+		},
 	}, {
 		.alg = "chacha20",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 1fe7b97ba03f..b5b0d29761ce 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -11449,6 +11449,82 @@ static const struct cipher_testvec aes_cbc_tv_template[] = {
 	},
 };
 
+static const struct cipher_testvec aes_cfb_tv_template[] = {
+	{ /* From NIST SP800-38A */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+			  "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+			  "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f"
+			  "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b"
+			  "\x26\x75\x1f\x67\xa3\xcb\xb1\x40"
+			  "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf"
+			  "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e"
+			  "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6",
+		.len	= 64,
+	}, {
+		.key	= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+			  "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+			  "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+		.klen	= 24,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab"
+			  "\x34\xc2\x59\x09\xc9\x9a\x41\x74"
+			  "\x67\xce\x7f\x7f\x81\x17\x36\x21"
+			  "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a"
+			  "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1"
+			  "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9"
+			  "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0"
+			  "\x42\xae\x8f\xba\x58\x4b\x09\xff",
+		.len	= 64,
+	}, {
+		.key	= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+			  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+			  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+			  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.klen	= 32,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xdc\x7e\x84\xbf\xda\x79\x16\x4b"
+			  "\x7e\xcd\x84\x86\x98\x5d\x38\x60"
+			  "\x39\xff\xed\x14\x3b\x28\xb1\xc8"
+			  "\x32\x11\x3c\x63\x31\xe5\x40\x7b"
+			  "\xdf\x10\x13\x24\x15\xe5\x4b\x92"
+			  "\xa1\x3e\xd0\xa8\x26\x7a\xe2\xf9"
+			  "\x75\xa3\x85\x74\x1a\xb9\xce\xf8"
+			  "\x20\x31\x62\x3d\x55\xb1\xe4\x71",
+		.len	= 64,
+	},
+};
+
 static const struct aead_testvec hmac_md5_ecb_cipher_null_enc_tv_template[] = {
 	{ /* Input data from RFC 2410 Case 1 */
 #ifdef __LITTLE_ENDIAN
-- 
cgit v1.2.3-59-g8ed1b


From b1e3874c75ab15288f573b3532e507c37e8e7656 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sat, 27 Oct 2018 15:49:26 +0100
Subject: pcrypt: use format specifier in kobject_add

Passing string 'name' as the format specifier is potentially hazardous
because name could (although very unlikely to) have a format specifier
embedded in it causing issues when parsing the non-existent arguments
to these.  Follow best practice by using the "%s" format string for
the string 'name'.

Cleans up clang warning:
crypto/pcrypt.c:397:40: warning: format string is not a string literal
(potentially insecure) [-Wformat-security]

Fixes: a3fb1e330dd2 ("pcrypt: Added sysfs interface to pcrypt")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index f8ec3d4ba4a8..a5718c0a3dc4 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -394,7 +394,7 @@ static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name)
 	int ret;
 
 	pinst->kobj.kset = pcrypt_kset;
-	ret = kobject_add(&pinst->kobj, NULL, name);
+	ret = kobject_add(&pinst->kobj, NULL, "%s", name);
 	if (!ret)
 		kobject_uevent(&pinst->kobj, KOBJ_ADD);
 
-- 
cgit v1.2.3-59-g8ed1b


From fc6176a240ae93850be445f355c1dba769fe8467 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 30 Oct 2018 12:01:58 +0000
Subject: crypto: chelsio - clean up various indentation issues

Trivial fix to clean up varous indentation issue

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_algo.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index db203f8be429..497c57803de7 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -1311,8 +1311,8 @@ static int chcr_aes_decrypt(struct ablkcipher_request *req)
 			return -ENOSPC;
 	}
 
-	 err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx],
-			      &skb, CHCR_DECRYPT_OP);
+	err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx],
+			     &skb, CHCR_DECRYPT_OP);
 	if (err || !skb)
 		return err;
 	skb->dev = u_ctx->lldi.ports[0];
@@ -2008,7 +2008,7 @@ static int chcr_ahash_export(struct ahash_request *areq, void *out)
 	memcpy(state->partial_hash, req_ctx->partial_hash,
 	       CHCR_HASH_MAX_DIGEST_SIZE);
 	chcr_init_hctx_per_wr(state);
-		return 0;
+	return 0;
 }
 
 static int chcr_ahash_import(struct ahash_request *areq, const void *in)
@@ -2249,7 +2249,7 @@ static int chcr_aead_fallback(struct aead_request *req, unsigned short op_type)
 				  req->base.complete, req->base.data);
 	aead_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
 				 req->iv);
-	 aead_request_set_ad(subreq, req->assoclen);
+	aead_request_set_ad(subreq, req->assoclen);
 	return op_type ? crypto_aead_decrypt(subreq) :
 		crypto_aead_encrypt(subreq);
 }
@@ -3118,12 +3118,12 @@ static int chcr_gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 		aeadctx->mayverify = VERIFY_HW;
 		break;
 	case ICV_12:
-		 aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_IPSEC_96BIT;
-		 aeadctx->mayverify = VERIFY_HW;
+		aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_IPSEC_96BIT;
+		aeadctx->mayverify = VERIFY_HW;
 		break;
 	case ICV_14:
-		 aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_PL3;
-		 aeadctx->mayverify = VERIFY_HW;
+		aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_PL3;
+		aeadctx->mayverify = VERIFY_HW;
 		break;
 	case ICV_16:
 		aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_NO_TRUNC;
-- 
cgit v1.2.3-59-g8ed1b


From ed848b652cc6a7a3e7f5e539edc3c2009366093e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Nov 2018 14:56:02 -0700
Subject: crypto: user - remove redundant reporting functions

The acomp, akcipher, and kpp algorithm types already have .report
methods defined, so there's no need to duplicate this functionality in
crypto_user itself; the duplicate functions are actually never executed.
Remove the unused code.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_base.c | 59 -----------------------------------------------
 1 file changed, 59 deletions(-)

diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c
index 784748dbb19f..1ecd9b08e38f 100644
--- a/crypto/crypto_user_base.c
+++ b/crypto/crypto_user_base.c
@@ -113,51 +113,6 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_acomp racomp;
-
-	strncpy(racomp.type, "acomp", sizeof(racomp.type));
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP,
-		    sizeof(struct crypto_report_acomp), &racomp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_akcipher rakcipher;
-
-	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
-		    sizeof(struct crypto_report_akcipher), &rakcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_kpp rkpp;
-
-	strncpy(rkpp.type, "kpp", sizeof(rkpp.type));
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_KPP,
-		    sizeof(struct crypto_report_kpp), &rkpp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
 static int crypto_report_one(struct crypto_alg *alg,
 			     struct crypto_user_alg *ualg, struct sk_buff *skb)
 {
@@ -202,20 +157,6 @@ static int crypto_report_one(struct crypto_alg *alg,
 			goto nla_put_failure;
 
 		break;
-	case CRYPTO_ALG_TYPE_ACOMPRESS:
-		if (crypto_report_acomp(skb, alg))
-			goto nla_put_failure;
-
-		break;
-	case CRYPTO_ALG_TYPE_AKCIPHER:
-		if (crypto_report_akcipher(skb, alg))
-			goto nla_put_failure;
-
-		break;
-	case CRYPTO_ALG_TYPE_KPP:
-		if (crypto_report_kpp(skb, alg))
-			goto nla_put_failure;
-		break;
 	}
 
 out:
-- 
cgit v1.2.3-59-g8ed1b


From 37db69e0b4923bff331820ee6969681937d8b065 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Nov 2018 14:56:03 -0700
Subject: crypto: user - clean up report structure copying

There have been a pretty ridiculous number of issues with initializing
the report structures that are copied to userspace by NETLINK_CRYPTO.
Commit 4473710df1f8 ("crypto: user - Prepare for CRYPTO_MAX_ALG_NAME
expansion") replaced some strncpy()s with strlcpy()s, thereby
introducing information leaks.  Later two other people tried to replace
other strncpy()s with strlcpy() too, which would have introduced even
more information leaks:

    - https://lore.kernel.org/patchwork/patch/954991/
    - https://patchwork.kernel.org/patch/10434351/

Commit cac5818c25d0 ("crypto: user - Implement a generic crypto
statistics") also uses the buggy strlcpy() approach and therefore leaks
uninitialized memory to userspace.  A fix was proposed, but it was
originally incomplete.

Seeing as how apparently no one can get this right with the current
approach, change all the reporting functions to:

- Start by memsetting the report structure to 0.  This guarantees it's
  always initialized, regardless of what happens later.
- Initialize all strings using strscpy().  This is safe after the
  memset, ensures null termination of long strings, avoids unnecessary
  work, and avoids the -Wstringop-truncation warnings from gcc.
- Use sizeof(var) instead of sizeof(type).  This is more robust against
  copy+paste errors.

For simplicity, also reuse the -EMSGSIZE return value from nla_put().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ablkcipher.c       |  32 ++++++---------
 crypto/acompress.c        |  10 ++---
 crypto/aead.c             |  14 +++----
 crypto/ahash.c            |  12 ++----
 crypto/akcipher.c         |  11 ++---
 crypto/blkcipher.c        |  16 +++-----
 crypto/crypto_user_base.c |  38 ++++++++---------
 crypto/crypto_user_stat.c | 102 +++++++++++-----------------------------------
 crypto/kpp.c              |  10 ++---
 crypto/rng.c              |  12 ++----
 crypto/scompress.c        |  11 ++---
 crypto/shash.c            |  12 ++----
 crypto/skcipher.c         |  15 +++----
 13 files changed, 96 insertions(+), 199 deletions(-)

diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index 8882e90e868e..b5e9ce19d324 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -365,23 +365,19 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_blkcipher rblkcipher;
 
-	strncpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<default>",
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<default>",
 		sizeof(rblkcipher.geniv));
-	rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0';
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize;
 	rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize;
 	rblkcipher.ivsize = alg->cra_ablkcipher.ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
@@ -440,23 +436,19 @@ static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_blkcipher rblkcipher;
 
-	strncpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<built-in>",
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<built-in>",
 		sizeof(rblkcipher.geniv));
-	rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0';
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize;
 	rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize;
 	rblkcipher.ivsize = alg->cra_ablkcipher.ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/acompress.c b/crypto/acompress.c
index 1544b7c057fb..0c5bedd06e70 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -33,15 +33,11 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_acomp racomp;
 
-	strncpy(racomp.type, "acomp", sizeof(racomp.type));
+	memset(&racomp, 0, sizeof(racomp));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP,
-		    sizeof(struct crypto_report_acomp), &racomp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(racomp.type, "acomp", sizeof(racomp.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, sizeof(racomp), &racomp);
 }
 #else
 static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/aead.c b/crypto/aead.c
index 60b3bbe973e7..189c52d1f63a 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -119,20 +119,16 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 	struct crypto_report_aead raead;
 	struct aead_alg *aead = container_of(alg, struct aead_alg, base);
 
-	strncpy(raead.type, "aead", sizeof(raead.type));
-	strncpy(raead.geniv, "<none>", sizeof(raead.geniv));
+	memset(&raead, 0, sizeof(raead));
+
+	strscpy(raead.type, "aead", sizeof(raead.type));
+	strscpy(raead.geniv, "<none>", sizeof(raead.geniv));
 
 	raead.blocksize = alg->cra_blocksize;
 	raead.maxauthsize = aead->maxauthsize;
 	raead.ivsize = aead->ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AEAD,
-		    sizeof(struct crypto_report_aead), &raead))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_AEAD, sizeof(raead), &raead);
 }
 #else
 static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/ahash.c b/crypto/ahash.c
index e21667b4e10a..3a348fbcf8f9 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -498,18 +498,14 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_hash rhash;
 
-	strncpy(rhash.type, "ahash", sizeof(rhash.type));
+	memset(&rhash, 0, sizeof(rhash));
+
+	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
 	rhash.blocksize = alg->cra_blocksize;
 	rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_HASH,
-		    sizeof(struct crypto_report_hash), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
 }
 #else
 static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index cfbdb06d8ca8..0cbeae137e0a 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -30,15 +30,12 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_akcipher rakcipher;
 
-	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+	memset(&rakcipher, 0, sizeof(rakcipher));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
-		    sizeof(struct crypto_report_akcipher), &rakcipher))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
+		       sizeof(rakcipher), &rakcipher);
 }
 #else
 static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index f93abf13b5d4..193237514e90 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -507,23 +507,19 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_blkcipher rblkcipher;
 
-	strncpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "<default>",
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "<default>",
 		sizeof(rblkcipher.geniv));
-	rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0';
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
 	rblkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
 	rblkcipher.ivsize = alg->cra_blkcipher.ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c
index 1ecd9b08e38f..7021efbb35a1 100644
--- a/crypto/crypto_user_base.c
+++ b/crypto/crypto_user_base.c
@@ -84,42 +84,38 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_cipher rcipher;
 
-	strncpy(rcipher.type, "cipher", sizeof(rcipher.type));
+	memset(&rcipher, 0, sizeof(rcipher));
+
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
 	rcipher.blocksize = alg->cra_blocksize;
 	rcipher.min_keysize = alg->cra_cipher.cia_min_keysize;
 	rcipher.max_keysize = alg->cra_cipher.cia_max_keysize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_CIPHER,
-		    sizeof(struct crypto_report_cipher), &rcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER,
+		       sizeof(rcipher), &rcipher);
 }
 
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_comp rcomp;
 
-	strncpy(rcomp.type, "compression", sizeof(rcomp.type));
-	if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
-		    sizeof(struct crypto_report_comp), &rcomp))
-		goto nla_put_failure;
-	return 0;
+	memset(&rcomp, 0, sizeof(rcomp));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
+
+	return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
 static int crypto_report_one(struct crypto_alg *alg,
 			     struct crypto_user_alg *ualg, struct sk_buff *skb)
 {
-	strncpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
-	strncpy(ualg->cru_driver_name, alg->cra_driver_name,
+	memset(ualg, 0, sizeof(*ualg));
+
+	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
+	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
 		sizeof(ualg->cru_driver_name));
-	strncpy(ualg->cru_module_name, module_name(alg->cra_module),
+	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
 		sizeof(ualg->cru_module_name));
 
 	ualg->cru_type = 0;
@@ -132,9 +128,9 @@ static int crypto_report_one(struct crypto_alg *alg,
 	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
 		struct crypto_report_larval rl;
 
-		strncpy(rl.type, "larval", sizeof(rl.type));
-		if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL,
-			    sizeof(struct crypto_report_larval), &rl))
+		memset(&rl, 0, sizeof(rl));
+		strscpy(rl.type, "larval", sizeof(rl.type));
+		if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl))
 			goto nla_put_failure;
 		goto out;
 	}
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 1dfaa0ccd555..a6fb2e6f618d 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -39,7 +39,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&raead, 0, sizeof(raead));
 
-	strncpy(raead.type, "aead", sizeof(raead.type));
+	strscpy(raead.type, "aead", sizeof(raead.type));
 
 	v32 = atomic_read(&alg->encrypt_cnt);
 	raead.stat_encrypt_cnt = v32;
@@ -52,13 +52,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->aead_err_cnt);
 	raead.stat_aead_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_AEAD,
-		    sizeof(struct crypto_stat), &raead))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
 
 static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
@@ -69,7 +63,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rcipher, 0, sizeof(rcipher));
 
-	strlcpy(rcipher.type, "cipher", sizeof(rcipher.type));
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
 	v32 = atomic_read(&alg->encrypt_cnt);
 	rcipher.stat_encrypt_cnt = v32;
@@ -82,13 +76,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->cipher_err_cnt);
 	rcipher.stat_cipher_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_CIPHER,
-		    sizeof(struct crypto_stat), &rcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
 
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
@@ -99,7 +87,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rcomp, 0, sizeof(rcomp));
 
-	strlcpy(rcomp.type, "compression", sizeof(rcomp.type));
+	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
 	v32 = atomic_read(&alg->compress_cnt);
 	rcomp.stat_compress_cnt = v32;
 	v64 = atomic64_read(&alg->compress_tlen);
@@ -111,13 +99,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->cipher_err_cnt);
 	rcomp.stat_compress_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_COMPRESS,
-		    sizeof(struct crypto_stat), &rcomp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
 static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
@@ -128,7 +110,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&racomp, 0, sizeof(racomp));
 
-	strlcpy(racomp.type, "acomp", sizeof(racomp.type));
+	strscpy(racomp.type, "acomp", sizeof(racomp.type));
 	v32 = atomic_read(&alg->compress_cnt);
 	racomp.stat_compress_cnt = v32;
 	v64 = atomic64_read(&alg->compress_tlen);
@@ -140,13 +122,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->cipher_err_cnt);
 	racomp.stat_compress_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_ACOMP,
-		    sizeof(struct crypto_stat), &racomp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
 
 static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
@@ -157,7 +133,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
 
-	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
 	v32 = atomic_read(&alg->encrypt_cnt);
 	rakcipher.stat_encrypt_cnt = v32;
 	v64 = atomic64_read(&alg->encrypt_tlen);
@@ -173,13 +149,8 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->akcipher_err_cnt);
 	rakcipher.stat_akcipher_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
-		    sizeof(struct crypto_stat), &rakcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
+		       sizeof(rakcipher), &rakcipher);
 }
 
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
@@ -189,7 +160,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rkpp, 0, sizeof(rkpp));
 
-	strlcpy(rkpp.type, "kpp", sizeof(rkpp.type));
+	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
 	v = atomic_read(&alg->setsecret_cnt);
 	rkpp.stat_setsecret_cnt = v;
@@ -200,13 +171,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 	v = atomic_read(&alg->kpp_err_cnt);
 	rkpp.stat_kpp_err_cnt = v;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_KPP,
-		    sizeof(struct crypto_stat), &rkpp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
 
 static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
@@ -217,7 +182,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rhash, 0, sizeof(rhash));
 
-	strncpy(rhash.type, "ahash", sizeof(rhash.type));
+	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
 	v32 = atomic_read(&alg->hash_cnt);
 	rhash.stat_hash_cnt = v32;
@@ -226,13 +191,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->hash_err_cnt);
 	rhash.stat_hash_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_HASH,
-		    sizeof(struct crypto_stat), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
 
 static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
@@ -243,7 +202,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rhash, 0, sizeof(rhash));
 
-	strncpy(rhash.type, "shash", sizeof(rhash.type));
+	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
 	v32 = atomic_read(&alg->hash_cnt);
 	rhash.stat_hash_cnt = v32;
@@ -252,13 +211,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->hash_err_cnt);
 	rhash.stat_hash_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_HASH,
-		    sizeof(struct crypto_stat), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
 
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
@@ -269,7 +222,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 
 	memset(&rrng, 0, sizeof(rrng));
 
-	strncpy(rrng.type, "rng", sizeof(rrng.type));
+	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
 	v32 = atomic_read(&alg->generate_cnt);
 	rrng.stat_generate_cnt = v32;
@@ -280,13 +233,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 	v32 = atomic_read(&alg->hash_err_cnt);
 	rrng.stat_rng_err_cnt = v32;
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_RNG,
-		    sizeof(struct crypto_stat), &rrng))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
 
 static int crypto_reportstat_one(struct crypto_alg *alg,
@@ -295,10 +242,10 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 {
 	memset(ualg, 0, sizeof(*ualg));
 
-	strlcpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
-	strlcpy(ualg->cru_driver_name, alg->cra_driver_name,
+	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
+	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
 		sizeof(ualg->cru_driver_name));
-	strlcpy(ualg->cru_module_name, module_name(alg->cra_module),
+	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
 		sizeof(ualg->cru_module_name));
 
 	ualg->cru_type = 0;
@@ -312,9 +259,8 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 		struct crypto_stat rl;
 
 		memset(&rl, 0, sizeof(rl));
-		strlcpy(rl.type, "larval", sizeof(rl.type));
-		if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL,
-			    sizeof(struct crypto_stat), &rl))
+		strscpy(rl.type, "larval", sizeof(rl.type));
+		if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL, sizeof(rl), &rl))
 			goto nla_put_failure;
 		goto out;
 	}
diff --git a/crypto/kpp.c b/crypto/kpp.c
index a90edc27af77..bc2f1006a2f7 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -30,15 +30,11 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_kpp rkpp;
 
-	strncpy(rkpp.type, "kpp", sizeof(rkpp.type));
+	memset(&rkpp, 0, sizeof(rkpp));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_KPP,
-		    sizeof(struct crypto_report_kpp), &rkpp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(rkpp), &rkpp);
 }
 #else
 static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/rng.c b/crypto/rng.c
index 547f16ecbfb0..2406501b90b7 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -74,17 +74,13 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_rng rrng;
 
-	strncpy(rrng.type, "rng", sizeof(rrng.type));
+	memset(&rrng, 0, sizeof(rrng));
 
-	rrng.seedsize = seedsize(alg);
+	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_RNG,
-		    sizeof(struct crypto_report_rng), &rrng))
-		goto nla_put_failure;
-	return 0;
+	rrng.seedsize = seedsize(alg);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(rrng), &rrng);
 }
 #else
 static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 968bbcf65c94..6f8305f8c300 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -40,15 +40,12 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_comp rscomp;
 
-	strncpy(rscomp.type, "scomp", sizeof(rscomp.type));
+	memset(&rscomp, 0, sizeof(rscomp));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
-		    sizeof(struct crypto_report_comp), &rscomp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rscomp.type, "scomp", sizeof(rscomp.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
+		       sizeof(rscomp), &rscomp);
 }
 #else
 static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/shash.c b/crypto/shash.c
index d21f04d70dce..44d297b82a8f 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -408,18 +408,14 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
 	struct crypto_report_hash rhash;
 	struct shash_alg *salg = __crypto_shash_alg(alg);
 
-	strncpy(rhash.type, "shash", sizeof(rhash.type));
+	memset(&rhash, 0, sizeof(rhash));
+
+	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
 	rhash.blocksize = alg->cra_blocksize;
 	rhash.digestsize = salg->digestsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_HASH,
-		    sizeof(struct crypto_report_hash), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
 }
 #else
 static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 4caab81d2d02..17be8d9c714e 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -897,21 +897,18 @@ static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
 						     base);
 
-	strncpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv));
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv));
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = skcipher->min_keysize;
 	rblkcipher.max_keysize = skcipher->max_keysize;
 	rblkcipher.ivsize = skcipher->ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
-- 
cgit v1.2.3-59-g8ed1b


From 196ad6043e9fe93c4ae3dac02b5c8fd337f58c2d Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Sun, 4 Nov 2018 10:05:24 +0000
Subject: crypto: testmgr - mark cts(cbc(aes)) as FIPS allowed

As per Sp800-38A addendum from Oct 2010[1], cts(cbc(aes)) is
allowed as a FIPS mode algorithm. Mark it as such.

[1] https://csrc.nist.gov/publications/detail/sp/800-38a/addendum/final

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Reviewed-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 84937ceb4bd8..512ebf697f7e 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2812,6 +2812,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cts(cbc(aes))",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = __VECS(cts_mode_tv_template)
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 2eb4942b6609d35a4e835644a33203b0aef7443d Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Mon, 5 Nov 2018 11:36:18 +0300
Subject: crypto: ecc - check for invalid values in the key verification test

Currently used scalar multiplication algorithm (Matthieu Rivain, 2011)
have invalid values for scalar == 1, n-1, and for regularized version
n-2, which was previously not checked. Verify that they are not used as
private keys.

Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/crypto/ecc.c b/crypto/ecc.c
index 8facafd67802..9d24e6522730 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -904,30 +904,43 @@ static inline void ecc_swap_digits(const u64 *in, u64 *out,
 		out[i] = __swab64(in[ndigits - 1 - i]);
 }
 
-int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits,
-		     const u64 *private_key, unsigned int private_key_len)
+static int __ecc_is_key_valid(const struct ecc_curve *curve,
+			      const u64 *private_key, unsigned int ndigits)
 {
-	int nbytes;
-	const struct ecc_curve *curve = ecc_get_curve(curve_id);
+	u64 one[ECC_MAX_DIGITS] = { 1, };
+	u64 res[ECC_MAX_DIGITS];
 
 	if (!private_key)
 		return -EINVAL;
 
-	nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT;
-
-	if (private_key_len != nbytes)
+	if (curve->g.ndigits != ndigits)
 		return -EINVAL;
 
-	if (vli_is_zero(private_key, ndigits))
+	/* Make sure the private key is in the range [2, n-3]. */
+	if (vli_cmp(one, private_key, ndigits) != -1)
 		return -EINVAL;
-
-	/* Make sure the private key is in the range [1, n-1]. */
-	if (vli_cmp(curve->n, private_key, ndigits) != 1)
+	vli_sub(res, curve->n, one, ndigits);
+	vli_sub(res, res, one, ndigits);
+	if (vli_cmp(res, private_key, ndigits) != 1)
 		return -EINVAL;
 
 	return 0;
 }
 
+int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits,
+		     const u64 *private_key, unsigned int private_key_len)
+{
+	int nbytes;
+	const struct ecc_curve *curve = ecc_get_curve(curve_id);
+
+	nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT;
+
+	if (private_key_len != nbytes)
+		return -EINVAL;
+
+	return __ecc_is_key_valid(curve, private_key, ndigits);
+}
+
 /*
  * ECC private keys are generated using the method of extra random bits,
  * equivalent to that described in FIPS 186-4, Appendix B.4.1.
@@ -971,11 +984,8 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey)
 	if (err)
 		return err;
 
-	if (vli_is_zero(priv, ndigits))
-		return -EINVAL;
-
-	/* Make sure the private key is in the range [1, n-1]. */
-	if (vli_cmp(curve->n, priv, ndigits) != 1)
+	/* Make sure the private key is in the valid range. */
+	if (__ecc_is_key_valid(curve, priv, ndigits))
 		return -EINVAL;
 
 	ecc_swap_digits(priv, privkey, ndigits);
-- 
cgit v1.2.3-59-g8ed1b


From ecd6d5c9cba5fc6053ba21e3f8a4c536f65ea27a Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Mon, 5 Nov 2018 12:05:01 +0000
Subject: crypto: cts - document NIST standard status

cts(cbc(aes)) as used in the kernel has been added to NIST
standard as CBC-CS3. Document it as such.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Suggested-by: Stephan Mueller <smueller@chronox.de>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 752005201013..06eb23cade43 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -430,11 +430,14 @@ config CRYPTO_CTS
 	help
 	  CTS: Cipher Text Stealing
 	  This is the Cipher Text Stealing mode as described by
-	  Section 8 of rfc2040 and referenced by rfc3962.
-	  (rfc3962 includes errata information in its Appendix A)
+	  Section 8 of rfc2040 and referenced by rfc3962
+	  (rfc3962 includes errata information in its Appendix A) or
+	  CBC-CS3 as defined by NIST in Sp800-38A addendum from Oct 2010.
 	  This mode is required for Kerberos gss mechanism support
 	  for AES encryption.
 
+	  See: https://csrc.nist.gov/publications/detail/sp/800-38a/addendum/final
+
 config CRYPTO_ECB
 	tristate "ECB support"
 	select CRYPTO_BLKCIPHER
-- 
cgit v1.2.3-59-g8ed1b


From 4f0129d13e69bad0363fd75553fb22897b32c379 Mon Sep 17 00:00:00 2001
From: Raveendra Padasalagi <raveendra.padasalagi@broadcom.com>
Date: Tue, 6 Nov 2018 13:58:58 +0530
Subject: crypto: bcm - fix normal/non key hash algorithm failure

Remove setkey() callback handler for normal/non key
hash algorithms and keep it for AES-CBC/CMAC which needs key.

Fixes: 9d12ba86f818 ("crypto: brcm - Add Broadcom SPU driver")
Signed-off-by: Raveendra Padasalagi <raveendra.padasalagi@broadcom.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/bcm/cipher.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c
index 2d1f1db9f807..4d67e22a4664 100644
--- a/drivers/crypto/bcm/cipher.c
+++ b/drivers/crypto/bcm/cipher.c
@@ -4652,12 +4652,16 @@ static int spu_register_ahash(struct iproc_alg_s *driver_alg)
 	hash->halg.statesize = sizeof(struct spu_hash_export_s);
 
 	if (driver_alg->auth_info.mode != HASH_MODE_HMAC) {
-		hash->setkey = ahash_setkey;
 		hash->init = ahash_init;
 		hash->update = ahash_update;
 		hash->final = ahash_final;
 		hash->finup = ahash_finup;
 		hash->digest = ahash_digest;
+		if ((driver_alg->auth_info.alg == HASH_ALG_AES) &&
+		    ((driver_alg->auth_info.mode == HASH_MODE_XCBC) ||
+		    (driver_alg->auth_info.mode == HASH_MODE_CMAC))) {
+			hash->setkey = ahash_setkey;
+		}
 	} else {
 		hash->setkey = ahash_hmac_setkey;
 		hash->init = ahash_hmac_init;
-- 
cgit v1.2.3-59-g8ed1b


From d65ddecbea3c07f9f93af9d32680e650f20aa102 Mon Sep 17 00:00:00 2001
From: Brajeswar Ghosh <brajeswar.linux@gmail.com>
Date: Tue, 6 Nov 2018 17:46:14 +0530
Subject: crypto: aes-ce - Remove duplicate header

Remove asm/hwcap.h which is included more than once

Signed-off-by: Brajeswar Ghosh <brajeswar.linux@gmail.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/aes-ce-glue.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c
index d0a9cec73707..5affb8482379 100644
--- a/arch/arm/crypto/aes-ce-glue.c
+++ b/arch/arm/crypto/aes-ce-glue.c
@@ -10,7 +10,6 @@
 
 #include <asm/hwcap.h>
 #include <asm/neon.h>
-#include <asm/hwcap.h>
 #include <crypto/aes.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
-- 
cgit v1.2.3-59-g8ed1b


From fe18957e8e87403a9d4be8e8a62352ef107def99 Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Wed, 7 Nov 2018 00:00:01 +0300
Subject: crypto: streebog - add Streebog hash function

Add GOST/IETF Streebog hash function (GOST R 34.11-2012, RFC 6986)
generic hash transformation.

Cc: linux-integrity@vger.kernel.org
Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig            |   12 +
 crypto/Makefile           |    1 +
 crypto/streebog_generic.c | 1140 +++++++++++++++++++++++++++++++++++++++++++++
 include/crypto/streebog.h |   34 ++
 4 files changed, 1187 insertions(+)
 create mode 100644 crypto/streebog_generic.c
 create mode 100644 include/crypto/streebog.h

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 06eb23cade43..62dbd1a99fa3 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -939,6 +939,18 @@ config CRYPTO_SM3
 	  http://www.oscca.gov.cn/UpFile/20101222141857786.pdf
 	  https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash
 
+config CRYPTO_STREEBOG
+	tristate "Streebog Hash Function"
+	select CRYPTO_HASH
+	help
+	  Streebog Hash Function (GOST R 34.11-2012, RFC 6986) is one of the Russian
+	  cryptographic standard algorithms (called GOST algorithms).
+	  This setting enables two hash algorithms with 256 and 512 bits output.
+
+	  References:
+	  https://tc26.ru/upload/iblock/fed/feddbb4d26b685903faa2ba11aea43f6.pdf
+	  https://tools.ietf.org/html/rfc6986
+
 config CRYPTO_TGR192
 	tristate "Tiger digest algorithms"
 	select CRYPTO_HASH
diff --git a/crypto/Makefile b/crypto/Makefile
index 5c207c76abf7..abbd86fdbad2 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
 obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
 obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o
+obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
 CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
diff --git a/crypto/streebog_generic.c b/crypto/streebog_generic.c
new file mode 100644
index 000000000000..03272a22afce
--- /dev/null
+++ b/crypto/streebog_generic.c
@@ -0,0 +1,1140 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause
+/*
+ * Streebog hash function as specified by GOST R 34.11-2012 and
+ * described at https://tools.ietf.org/html/rfc6986
+ *
+ * Copyright (c) 2013 Alexey Degtyarev <alexey@renatasystems.org>
+ * Copyright (c) 2018 Vitaly Chikunov <vt@altlinux.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <crypto/streebog.h>
+
+static const struct streebog_uint512 buffer0 = { {
+	0, 0, 0, 0, 0, 0, 0, 0
+} };
+
+static const struct streebog_uint512 buffer512 = { {
+	cpu_to_le64(0x200), 0, 0, 0, 0, 0, 0, 0
+} };
+
+static const struct streebog_uint512 C[12] = {
+	{ {
+		 cpu_to_le64(0xdd806559f2a64507ULL),
+		 cpu_to_le64(0x05767436cc744d23ULL),
+		 cpu_to_le64(0xa2422a08a460d315ULL),
+		 cpu_to_le64(0x4b7ce09192676901ULL),
+		 cpu_to_le64(0x714eb88d7585c4fcULL),
+		 cpu_to_le64(0x2f6a76432e45d016ULL),
+		 cpu_to_le64(0xebcb2f81c0657c1fULL),
+		 cpu_to_le64(0xb1085bda1ecadae9ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0xe679047021b19bb7ULL),
+		 cpu_to_le64(0x55dda21bd7cbcd56ULL),
+		 cpu_to_le64(0x5cb561c2db0aa7caULL),
+		 cpu_to_le64(0x9ab5176b12d69958ULL),
+		 cpu_to_le64(0x61d55e0f16b50131ULL),
+		 cpu_to_le64(0xf3feea720a232b98ULL),
+		 cpu_to_le64(0x4fe39d460f70b5d7ULL),
+		 cpu_to_le64(0x6fa3b58aa99d2f1aULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x991e96f50aba0ab2ULL),
+		 cpu_to_le64(0xc2b6f443867adb31ULL),
+		 cpu_to_le64(0xc1c93a376062db09ULL),
+		 cpu_to_le64(0xd3e20fe490359eb1ULL),
+		 cpu_to_le64(0xf2ea7514b1297b7bULL),
+		 cpu_to_le64(0x06f15e5f529c1f8bULL),
+		 cpu_to_le64(0x0a39fc286a3d8435ULL),
+		 cpu_to_le64(0xf574dcac2bce2fc7ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x220cbebc84e3d12eULL),
+		 cpu_to_le64(0x3453eaa193e837f1ULL),
+		 cpu_to_le64(0xd8b71333935203beULL),
+		 cpu_to_le64(0xa9d72c82ed03d675ULL),
+		 cpu_to_le64(0x9d721cad685e353fULL),
+		 cpu_to_le64(0x488e857e335c3c7dULL),
+		 cpu_to_le64(0xf948e1a05d71e4ddULL),
+		 cpu_to_le64(0xef1fdfb3e81566d2ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x601758fd7c6cfe57ULL),
+		 cpu_to_le64(0x7a56a27ea9ea63f5ULL),
+		 cpu_to_le64(0xdfff00b723271a16ULL),
+		 cpu_to_le64(0xbfcd1747253af5a3ULL),
+		 cpu_to_le64(0x359e35d7800fffbdULL),
+		 cpu_to_le64(0x7f151c1f1686104aULL),
+		 cpu_to_le64(0x9a3f410c6ca92363ULL),
+		 cpu_to_le64(0x4bea6bacad474799ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0xfa68407a46647d6eULL),
+		 cpu_to_le64(0xbf71c57236904f35ULL),
+		 cpu_to_le64(0x0af21f66c2bec6b6ULL),
+		 cpu_to_le64(0xcffaa6b71c9ab7b4ULL),
+		 cpu_to_le64(0x187f9ab49af08ec6ULL),
+		 cpu_to_le64(0x2d66c4f95142a46cULL),
+		 cpu_to_le64(0x6fa4c33b7a3039c0ULL),
+		 cpu_to_le64(0xae4faeae1d3ad3d9ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x8886564d3a14d493ULL),
+		 cpu_to_le64(0x3517454ca23c4af3ULL),
+		 cpu_to_le64(0x06476983284a0504ULL),
+		 cpu_to_le64(0x0992abc52d822c37ULL),
+		 cpu_to_le64(0xd3473e33197a93c9ULL),
+		 cpu_to_le64(0x399ec6c7e6bf87c9ULL),
+		 cpu_to_le64(0x51ac86febf240954ULL),
+		 cpu_to_le64(0xf4c70e16eeaac5ecULL)
+	 } },
+	{ {
+		 cpu_to_le64(0xa47f0dd4bf02e71eULL),
+		 cpu_to_le64(0x36acc2355951a8d9ULL),
+		 cpu_to_le64(0x69d18d2bd1a5c42fULL),
+		 cpu_to_le64(0xf4892bcb929b0690ULL),
+		 cpu_to_le64(0x89b4443b4ddbc49aULL),
+		 cpu_to_le64(0x4eb7f8719c36de1eULL),
+		 cpu_to_le64(0x03e7aa020c6e4141ULL),
+		 cpu_to_le64(0x9b1f5b424d93c9a7ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x7261445183235adbULL),
+		 cpu_to_le64(0x0e38dc92cb1f2a60ULL),
+		 cpu_to_le64(0x7b2b8a9aa6079c54ULL),
+		 cpu_to_le64(0x800a440bdbb2ceb1ULL),
+		 cpu_to_le64(0x3cd955b7e00d0984ULL),
+		 cpu_to_le64(0x3a7d3a1b25894224ULL),
+		 cpu_to_le64(0x944c9ad8ec165fdeULL),
+		 cpu_to_le64(0x378f5a541631229bULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x74b4c7fb98459cedULL),
+		 cpu_to_le64(0x3698fad1153bb6c3ULL),
+		 cpu_to_le64(0x7a1e6c303b7652f4ULL),
+		 cpu_to_le64(0x9fe76702af69334bULL),
+		 cpu_to_le64(0x1fffe18a1b336103ULL),
+		 cpu_to_le64(0x8941e71cff8a78dbULL),
+		 cpu_to_le64(0x382ae548b2e4f3f3ULL),
+		 cpu_to_le64(0xabbedea680056f52ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x6bcaa4cd81f32d1bULL),
+		 cpu_to_le64(0xdea2594ac06fd85dULL),
+		 cpu_to_le64(0xefbacd1d7d476e98ULL),
+		 cpu_to_le64(0x8a1d71efea48b9caULL),
+		 cpu_to_le64(0x2001802114846679ULL),
+		 cpu_to_le64(0xd8fa6bbbebab0761ULL),
+		 cpu_to_le64(0x3002c6cd635afe94ULL),
+		 cpu_to_le64(0x7bcd9ed0efc889fbULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x48bc924af11bd720ULL),
+		 cpu_to_le64(0xfaf417d5d9b21b99ULL),
+		 cpu_to_le64(0xe71da4aa88e12852ULL),
+		 cpu_to_le64(0x5d80ef9d1891cc86ULL),
+		 cpu_to_le64(0xf82012d430219f9bULL),
+		 cpu_to_le64(0xcda43c32bcdf1d77ULL),
+		 cpu_to_le64(0xd21380b00449b17aULL),
+		 cpu_to_le64(0x378ee767f11631baULL)
+	 } }
+};
+
+static const u8 Tau[64] = {
+	0,   8,  16,  24,  32,  40,  48,  56,
+	1,   9,  17,  25,  33,  41,  49,  57,
+	2,  10,  18,  26,  34,  42,  50,  58,
+	3,  11,  19,  27,  35,  43,  51,  59,
+	4,  12,  20,  28,  36,  44,  52,  60,
+	5,  13,  21,  29,  37,  45,  53,  61,
+	6,  14,  22,  30,  38,  46,  54,  62,
+	7,  15,  23,  31,  39,  47,  55,  63
+};
+
+static const u8 Pi[256] = {
+	252, 238, 221,  17, 207, 110,  49,  22,
+	251, 196, 250, 218,  35, 197,   4,  77,
+	233, 119, 240, 219, 147,  46, 153, 186,
+	 23,  54, 241, 187,  20, 205,  95, 193,
+	249,  24, 101,  90, 226,  92, 239,  33,
+	129,  28,  60,  66, 139,   1, 142,  79,
+	  5, 132,   2, 174, 227, 106, 143, 160,
+	  6,  11, 237, 152, 127, 212, 211,  31,
+	235,  52,  44,  81, 234, 200,  72, 171,
+	242,  42, 104, 162, 253,  58, 206, 204,
+	181, 112,  14,  86,   8,  12, 118,  18,
+	191, 114,  19,  71, 156, 183,  93, 135,
+	 21, 161, 150,  41,  16, 123, 154, 199,
+	243, 145, 120, 111, 157, 158, 178, 177,
+	 50, 117,  25,  61, 255,  53, 138, 126,
+	109,  84, 198, 128, 195, 189,  13,  87,
+	223, 245,  36, 169,  62, 168,  67, 201,
+	215, 121, 214, 246, 124,  34, 185,   3,
+	224,  15, 236, 222, 122, 148, 176, 188,
+	220, 232,  40,  80,  78,  51,  10,  74,
+	167, 151,  96, 115,  30,   0,  98,  68,
+	 26, 184,  56, 130, 100, 159,  38,  65,
+	173,  69,  70, 146,  39,  94,  85,  47,
+	140, 163, 165, 125, 105, 213, 149,  59,
+	  7,  88, 179,  64, 134, 172,  29, 247,
+	 48,  55, 107, 228, 136, 217, 231, 137,
+	225,  27, 131,  73,  76,  63, 248, 254,
+	141,  83, 170, 144, 202, 216, 133,  97,
+	 32, 113, 103, 164,  45,  43,   9,  91,
+	203, 155,  37, 208, 190, 229, 108,  82,
+	 89, 166, 116, 210, 230, 244, 180, 192,
+	209, 102, 175, 194,  57,  75,  99, 182
+};
+
+static const unsigned long long Ax[8][256] = {
+	{
+	0xd01f715b5c7ef8e6ULL, 0x16fa240980778325ULL, 0xa8a42e857ee049c8ULL,
+	0x6ac1068fa186465bULL, 0x6e417bd7a2e9320bULL, 0x665c8167a437daabULL,
+	0x7666681aa89617f6ULL, 0x4b959163700bdcf5ULL, 0xf14be6b78df36248ULL,
+	0xc585bd689a625cffULL, 0x9557d7fca67d82cbULL, 0x89f0b969af6dd366ULL,
+	0xb0833d48749f6c35ULL, 0xa1998c23b1ecbc7cULL, 0x8d70c431ac02a736ULL,
+	0xd6dfbc2fd0a8b69eULL, 0x37aeb3e551fa198bULL, 0x0b7d128a40b5cf9cULL,
+	0x5a8f2008b5780cbcULL, 0xedec882284e333e5ULL, 0xd25fc177d3c7c2ceULL,
+	0x5e0f5d50b61778ecULL, 0x1d873683c0c24cb9ULL, 0xad040bcbb45d208cULL,
+	0x2f89a0285b853c76ULL, 0x5732fff6791b8d58ULL, 0x3e9311439ef6ec3fULL,
+	0xc9183a809fd3c00fULL, 0x83adf3f5260a01eeULL, 0xa6791941f4e8ef10ULL,
+	0x103ae97d0ca1cd5dULL, 0x2ce948121dee1b4aULL, 0x39738421dbf2bf53ULL,
+	0x093da2a6cf0cf5b4ULL, 0xcd9847d89cbcb45fULL, 0xf9561c078b2d8ae8ULL,
+	0x9c6a755a6971777fULL, 0xbc1ebaa0712ef0c5ULL, 0x72e61542abf963a6ULL,
+	0x78bb5fde229eb12eULL, 0x14ba94250fceb90dULL, 0x844d6697630e5282ULL,
+	0x98ea08026a1e032fULL, 0xf06bbea144217f5cULL, 0xdb6263d11ccb377aULL,
+	0x641c314b2b8ee083ULL, 0x320e96ab9b4770cfULL, 0x1ee7deb986a96b85ULL,
+	0xe96cf57a878c47b5ULL, 0xfdd6615f8842feb8ULL, 0xc83862965601dd1bULL,
+	0x2ea9f83e92572162ULL, 0xf876441142ff97fcULL, 0xeb2c455608357d9dULL,
+	0x5612a7e0b0c9904cULL, 0x6c01cbfb2d500823ULL, 0x4548a6a7fa037a2dULL,
+	0xabc4c6bf388b6ef4ULL, 0xbade77d4fdf8bebdULL, 0x799b07c8eb4cac3aULL,
+	0x0c9d87e805b19cf0ULL, 0xcb588aac106afa27ULL, 0xea0c1d40c1e76089ULL,
+	0x2869354a1e816f1aULL, 0xff96d17307fbc490ULL, 0x9f0a9d602f1a5043ULL,
+	0x96373fc6e016a5f7ULL, 0x5292dab8b3a6e41cULL, 0x9b8ae0382c752413ULL,
+	0x4f15ec3b7364a8a5ULL, 0x3fb349555724f12bULL, 0xc7c50d4415db66d7ULL,
+	0x92b7429ee379d1a7ULL, 0xd37f99611a15dfdaULL, 0x231427c05e34a086ULL,
+	0xa439a96d7b51d538ULL, 0xb403401077f01865ULL, 0xdda2aea5901d7902ULL,
+	0x0a5d4a9c8967d288ULL, 0xc265280adf660f93ULL, 0x8bb0094520d4e94eULL,
+	0x2a29856691385532ULL, 0x42a833c5bf072941ULL, 0x73c64d54622b7eb2ULL,
+	0x07e095624504536cULL, 0x8a905153e906f45aULL, 0x6f6123c16b3b2f1fULL,
+	0xc6e55552dc097bc3ULL, 0x4468feb133d16739ULL, 0xe211e7f0c7398829ULL,
+	0xa2f96419f7879b40ULL, 0x19074bdbc3ad38e9ULL, 0xf4ebc3f9474e0b0cULL,
+	0x43886bd376d53455ULL, 0xd8028beb5aa01046ULL, 0x51f23282f5cdc320ULL,
+	0xe7b1c2be0d84e16dULL, 0x081dfab006dee8a0ULL, 0x3b33340d544b857bULL,
+	0x7f5bcabc679ae242ULL, 0x0edd37c48a08a6d8ULL, 0x81ed43d9a9b33bc6ULL,
+	0xb1a3655ebd4d7121ULL, 0x69a1eeb5e7ed6167ULL, 0xf6ab73d5c8f73124ULL,
+	0x1a67a3e185c61fd5ULL, 0x2dc91004d43c065eULL, 0x0240b02c8fb93a28ULL,
+	0x90f7f2b26cc0eb8fULL, 0x3cd3a16f114fd617ULL, 0xaae49ea9f15973e0ULL,
+	0x06c0cd748cd64e78ULL, 0xda423bc7d5192a6eULL, 0xc345701c16b41287ULL,
+	0x6d2193ede4821537ULL, 0xfcf639494190e3acULL, 0x7c3b228621f1c57eULL,
+	0xfb16ac2b0494b0c0ULL, 0xbf7e529a3745d7f9ULL, 0x6881b6a32e3f7c73ULL,
+	0xca78d2bad9b8e733ULL, 0xbbfe2fc2342aa3a9ULL, 0x0dbddffecc6381e4ULL,
+	0x70a6a56e2440598eULL, 0xe4d12a844befc651ULL, 0x8c509c2765d0ba22ULL,
+	0xee8c6018c28814d9ULL, 0x17da7c1f49a59e31ULL, 0x609c4c1328e194d3ULL,
+	0xb3e3d57232f44b09ULL, 0x91d7aaa4a512f69bULL, 0x0ffd6fd243dabbccULL,
+	0x50d26a943c1fde34ULL, 0x6be15e9968545b4fULL, 0x94778fea6faf9fdfULL,
+	0x2b09dd7058ea4826ULL, 0x677cd9716de5c7bfULL, 0x49d5214fffb2e6ddULL,
+	0x0360e83a466b273cULL, 0x1fc786af4f7b7691ULL, 0xa0b9d435783ea168ULL,
+	0xd49f0c035f118cb6ULL, 0x01205816c9d21d14ULL, 0xac2453dd7d8f3d98ULL,
+	0x545217cc3f70aa64ULL, 0x26b4028e9489c9c2ULL, 0xdec2469fd6765e3eULL,
+	0x04807d58036f7450ULL, 0xe5f17292823ddb45ULL, 0xf30b569b024a5860ULL,
+	0x62dcfc3fa758aefbULL, 0xe84cad6c4e5e5aa1ULL, 0xccb81fce556ea94bULL,
+	0x53b282ae7a74f908ULL, 0x1b47fbf74c1402c1ULL, 0x368eebf39828049fULL,
+	0x7afbeff2ad278b06ULL, 0xbe5e0a8cfe97caedULL, 0xcfd8f7f413058e77ULL,
+	0xf78b2bc301252c30ULL, 0x4d555c17fcdd928dULL, 0x5f2f05467fc565f8ULL,
+	0x24f4b2a21b30f3eaULL, 0x860dd6bbecb768aaULL, 0x4c750401350f8f99ULL,
+	0x0000000000000000ULL, 0xecccd0344d312ef1ULL, 0xb5231806be220571ULL,
+	0xc105c030990d28afULL, 0x653c695de25cfd97ULL, 0x159acc33c61ca419ULL,
+	0xb89ec7f872418495ULL, 0xa9847693b73254dcULL, 0x58cf90243ac13694ULL,
+	0x59efc832f3132b80ULL, 0x5c4fed7c39ae42c4ULL, 0x828dabe3efd81cfaULL,
+	0xd13f294d95ace5f2ULL, 0x7d1b7a90e823d86aULL, 0xb643f03cf849224dULL,
+	0x3df3f979d89dcb03ULL, 0x7426d836272f2ddeULL, 0xdfe21e891fa4432aULL,
+	0x3a136c1b9d99986fULL, 0xfa36f43dcd46add4ULL, 0xc025982650df35bbULL,
+	0x856d3e81aadc4f96ULL, 0xc4a5e57e53b041ebULL, 0x4708168b75ba4005ULL,
+	0xaf44bbe73be41aa4ULL, 0x971767d029c4b8e3ULL, 0xb9be9feebb939981ULL,
+	0x215497ecd18d9aaeULL, 0x316e7e91dd2c57f3ULL, 0xcef8afe2dad79363ULL,
+	0x3853dc371220a247ULL, 0x35ee03c9de4323a3ULL, 0xe6919aa8c456fc79ULL,
+	0xe05157dc4880b201ULL, 0x7bdbb7e464f59612ULL, 0x127a59518318f775ULL,
+	0x332ecebd52956ddbULL, 0x8f30741d23bb9d1eULL, 0xd922d3fd93720d52ULL,
+	0x7746300c61440ae2ULL, 0x25d4eab4d2e2eefeULL, 0x75068020eefd30caULL,
+	0x135a01474acaea61ULL, 0x304e268714fe4ae7ULL, 0xa519f17bb283c82cULL,
+	0xdc82f6b359cf6416ULL, 0x5baf781e7caa11a8ULL, 0xb2c38d64fb26561dULL,
+	0x34ce5bdf17913eb7ULL, 0x5d6fb56af07c5fd0ULL, 0x182713cd0a7f25fdULL,
+	0x9e2ac576e6c84d57ULL, 0x9aaab82ee5a73907ULL, 0xa3d93c0f3e558654ULL,
+	0x7e7b92aaae48ff56ULL, 0x872d8ead256575beULL, 0x41c8dbfff96c0e7dULL,
+	0x99ca5014a3cc1e3bULL, 0x40e883e930be1369ULL, 0x1ca76e95091051adULL,
+	0x4e35b42dbab6b5b1ULL, 0x05a0254ecabd6944ULL, 0xe1710fca8152af15ULL,
+	0xf22b0e8dcb984574ULL, 0xb763a82a319b3f59ULL, 0x63fca4296e8ab3efULL,
+	0x9d4a2d4ca0a36a6bULL, 0xe331bfe60eeb953dULL, 0xd5bf541596c391a2ULL,
+	0xf5cb9bef8e9c1618ULL, 0x46284e9dbc685d11ULL, 0x2074cffa185f87baULL,
+	0xbd3ee2b6b8fcedd1ULL, 0xae64e3f1f23607b0ULL, 0xfeb68965ce29d984ULL,
+	0x55724fdaf6a2b770ULL, 0x29496d5cd753720eULL, 0xa75941573d3af204ULL,
+	0x8e102c0bea69800aULL, 0x111ab16bc573d049ULL, 0xd7ffe439197aab8aULL,
+	0xefac380e0b5a09cdULL, 0x48f579593660fbc9ULL, 0x22347fd697e6bd92ULL,
+	0x61bc1405e13389c7ULL, 0x4ab5c975b9d9c1e1ULL, 0x80cd1bcf606126d2ULL,
+	0x7186fd78ed92449aULL, 0x93971a882aabccb3ULL, 0x88d0e17f66bfce72ULL,
+	0x27945a985d5bd4d6ULL
+	}, {
+	0xde553f8c05a811c8ULL, 0x1906b59631b4f565ULL, 0x436e70d6b1964ff7ULL,
+	0x36d343cb8b1e9d85ULL, 0x843dfacc858aab5aULL, 0xfdfc95c299bfc7f9ULL,
+	0x0f634bdea1d51fa2ULL, 0x6d458b3b76efb3cdULL, 0x85c3f77cf8593f80ULL,
+	0x3c91315fbe737cb2ULL, 0x2148b03366ace398ULL, 0x18f8b8264c6761bfULL,
+	0xc830c1c495c9fb0fULL, 0x981a76102086a0aaULL, 0xaa16012142f35760ULL,
+	0x35cc54060c763cf6ULL, 0x42907d66cc45db2dULL, 0x8203d44b965af4bcULL,
+	0x3d6f3cefc3a0e868ULL, 0xbc73ff69d292bda7ULL, 0x8722ed0102e20a29ULL,
+	0x8f8185e8cd34deb7ULL, 0x9b0561dda7ee01d9ULL, 0x5335a0193227fad6ULL,
+	0xc9cecc74e81a6fd5ULL, 0x54f5832e5c2431eaULL, 0x99e47ba05d553470ULL,
+	0xf7bee756acd226ceULL, 0x384e05a5571816fdULL, 0xd1367452a47d0e6aULL,
+	0xf29fde1c386ad85bULL, 0x320c77316275f7caULL, 0xd0c879e2d9ae9ab0ULL,
+	0xdb7406c69110ef5dULL, 0x45505e51a2461011ULL, 0xfc029872e46c5323ULL,
+	0xfa3cb6f5f7bc0cc5ULL, 0x031f17cd8768a173ULL, 0xbd8df2d9af41297dULL,
+	0x9d3b4f5ab43e5e3fULL, 0x4071671b36feee84ULL, 0x716207e7d3e3b83dULL,
+	0x48d20ff2f9283a1aULL, 0x27769eb4757cbc7eULL, 0x5c56ebc793f2e574ULL,
+	0xa48b474f9ef5dc18ULL, 0x52cbada94ff46e0cULL, 0x60c7da982d8199c6ULL,
+	0x0e9d466edc068b78ULL, 0x4eec2175eaf865fcULL, 0x550b8e9e21f7a530ULL,
+	0x6b7ba5bc653fec2bULL, 0x5eb7f1ba6949d0ddULL, 0x57ea94e3db4c9099ULL,
+	0xf640eae6d101b214ULL, 0xdd4a284182c0b0bbULL, 0xff1d8fbf6304f250ULL,
+	0xb8accb933bf9d7e8ULL, 0xe8867c478eb68c4dULL, 0x3f8e2692391bddc1ULL,
+	0xcb2fd60912a15a7cULL, 0xaec935dbab983d2fULL, 0xf55ffd2b56691367ULL,
+	0x80e2ce366ce1c115ULL, 0x179bf3f8edb27e1dULL, 0x01fe0db07dd394daULL,
+	0xda8a0b76ecc37b87ULL, 0x44ae53e1df9584cbULL, 0xb310b4b77347a205ULL,
+	0xdfab323c787b8512ULL, 0x3b511268d070b78eULL, 0x65e6e3d2b9396753ULL,
+	0x6864b271e2574d58ULL, 0x259784c98fc789d7ULL, 0x02e11a7dfabb35a9ULL,
+	0x8841a6dfa337158bULL, 0x7ade78c39b5dcdd0ULL, 0xb7cf804d9a2cc84aULL,
+	0x20b6bd831b7f7742ULL, 0x75bd331d3a88d272ULL, 0x418f6aab4b2d7a5eULL,
+	0xd9951cbb6babdaf4ULL, 0xb6318dfde7ff5c90ULL, 0x1f389b112264aa83ULL,
+	0x492c024284fbaec0ULL, 0xe33a0363c608f9a0ULL, 0x2688930408af28a4ULL,
+	0xc7538a1a341ce4adULL, 0x5da8e677ee2171aeULL, 0x8c9e92254a5c7fc4ULL,
+	0x63d8cd55aae938b5ULL, 0x29ebd8daa97a3706ULL, 0x959827b37be88aa1ULL,
+	0x1484e4356adadf6eULL, 0xa7945082199d7d6bULL, 0xbf6ce8a455fa1cd4ULL,
+	0x9cc542eac9edcae5ULL, 0x79c16f0e1c356ca3ULL, 0x89bfab6fdee48151ULL,
+	0xd4174d1830c5f0ffULL, 0x9258048415eb419dULL, 0x6139d72850520d1cULL,
+	0x6a85a80c18ec78f1ULL, 0xcd11f88e0171059aULL, 0xcceff53e7ca29140ULL,
+	0xd229639f2315af19ULL, 0x90b91ef9ef507434ULL, 0x5977d28d074a1be1ULL,
+	0x311360fce51d56b9ULL, 0xc093a92d5a1f2f91ULL, 0x1a19a25bb6dc5416ULL,
+	0xeb996b8a09de2d3eULL, 0xfee3820f1ed7668aULL, 0xd7085ad5b7ad518cULL,
+	0x7fff41890fe53345ULL, 0xec5948bd67dde602ULL, 0x2fd5f65dbaaa68e0ULL,
+	0xa5754affe32648c2ULL, 0xf8ddac880d07396cULL, 0x6fa491468c548664ULL,
+	0x0c7c5c1326bdbed1ULL, 0x4a33158f03930fb3ULL, 0x699abfc19f84d982ULL,
+	0xe4fa2054a80b329cULL, 0x6707f9af438252faULL, 0x08a368e9cfd6d49eULL,
+	0x47b1442c58fd25b8ULL, 0xbbb3dc5ebc91769bULL, 0x1665fe489061eac7ULL,
+	0x33f27a811fa66310ULL, 0x93a609346838d547ULL, 0x30ed6d4c98cec263ULL,
+	0x1dd9816cd8df9f2aULL, 0x94662a03063b1e7bULL, 0x83fdd9fbeb896066ULL,
+	0x7b207573e68e590aULL, 0x5f49fc0a149a4407ULL, 0x343259b671a5a82cULL,
+	0xfbc2bb458a6f981fULL, 0xc272b350a0a41a38ULL, 0x3aaf1fd8ada32354ULL,
+	0x6cbb868b0b3c2717ULL, 0xa2b569c88d2583feULL, 0xf180c9d1bf027928ULL,
+	0xaf37386bd64ba9f5ULL, 0x12bacab2790a8088ULL, 0x4c0d3b0810435055ULL,
+	0xb2eeb9070e9436dfULL, 0xc5b29067cea7d104ULL, 0xdcb425f1ff132461ULL,
+	0x4f122cc5972bf126ULL, 0xac282fa651230886ULL, 0xe7e537992f6393efULL,
+	0xe61b3a2952b00735ULL, 0x709c0a57ae302ce7ULL, 0xe02514ae416058d3ULL,
+	0xc44c9dd7b37445deULL, 0x5a68c5408022ba92ULL, 0x1c278cdca50c0bf0ULL,
+	0x6e5a9cf6f18712beULL, 0x86dce0b17f319ef3ULL, 0x2d34ec2040115d49ULL,
+	0x4bcd183f7e409b69ULL, 0x2815d56ad4a9a3dcULL, 0x24698979f2141d0dULL,
+	0x0000000000000000ULL, 0x1ec696a15fb73e59ULL, 0xd86b110b16784e2eULL,
+	0x8e7f8858b0e74a6dULL, 0x063e2e8713d05fe6ULL, 0xe2c40ed3bbdb6d7aULL,
+	0xb1f1aeca89fc97acULL, 0xe1db191e3cb3cc09ULL, 0x6418ee62c4eaf389ULL,
+	0xc6ad87aa49cf7077ULL, 0xd6f65765ca7ec556ULL, 0x9afb6c6dda3d9503ULL,
+	0x7ce05644888d9236ULL, 0x8d609f95378feb1eULL, 0x23a9aa4e9c17d631ULL,
+	0x6226c0e5d73aac6fULL, 0x56149953a69f0443ULL, 0xeeb852c09d66d3abULL,
+	0x2b0ac2a753c102afULL, 0x07c023376e03cb3cULL, 0x2ccae1903dc2c993ULL,
+	0xd3d76e2f5ec63bc3ULL, 0x9e2458973356ff4cULL, 0xa66a5d32644ee9b1ULL,
+	0x0a427294356de137ULL, 0x783f62be61e6f879ULL, 0x1344c70204d91452ULL,
+	0x5b96c8f0fdf12e48ULL, 0xa90916ecc59bf613ULL, 0xbe92e5142829880eULL,
+	0x727d102a548b194eULL, 0x1be7afebcb0fc0ccULL, 0x3e702b2244c8491bULL,
+	0xd5e940a84d166425ULL, 0x66f9f41f3e51c620ULL, 0xabe80c913f20c3baULL,
+	0xf07ec461c2d1edf2ULL, 0xf361d3ac45b94c81ULL, 0x0521394a94b8fe95ULL,
+	0xadd622162cf09c5cULL, 0xe97871f7f3651897ULL, 0xf4a1f09b2bba87bdULL,
+	0x095d6559b2054044ULL, 0x0bbc7f2448be75edULL, 0x2af4cf172e129675ULL,
+	0x157ae98517094bb4ULL, 0x9fda55274e856b96ULL, 0x914713499283e0eeULL,
+	0xb952c623462a4332ULL, 0x74433ead475b46a8ULL, 0x8b5eb112245fb4f8ULL,
+	0xa34b6478f0f61724ULL, 0x11a5dd7ffe6221fbULL, 0xc16da49d27ccbb4bULL,
+	0x76a224d0bde07301ULL, 0x8aa0bca2598c2022ULL, 0x4df336b86d90c48fULL,
+	0xea67663a740db9e4ULL, 0xef465f70e0b54771ULL, 0x39b008152acb8227ULL,
+	0x7d1e5bf4f55e06ecULL, 0x105bd0cf83b1b521ULL, 0x775c2960c033e7dbULL,
+	0x7e014c397236a79fULL, 0x811cc386113255cfULL, 0xeda7450d1a0e72d8ULL,
+	0x5889df3d7a998f3bULL, 0x2e2bfbedc779fc3aULL, 0xce0eef438619a4e9ULL,
+	0x372d4e7bf6cd095fULL, 0x04df34fae96b6a4fULL, 0xf923a13870d4adb6ULL,
+	0xa1aa7e050a4d228dULL, 0xa8f71b5cb84862c9ULL, 0xb52e9a306097fde3ULL,
+	0x0d8251a35b6e2a0bULL, 0x2257a7fee1c442ebULL, 0x73831d9a29588d94ULL,
+	0x51d4ba64c89ccf7fULL, 0x502ab7d4b54f5ba5ULL, 0x97793dce8153bf08ULL,
+	0xe5042de4d5d8a646ULL, 0x9687307efc802bd2ULL, 0xa05473b5779eb657ULL,
+	0xb4d097801d446939ULL, 0xcff0e2f3fbca3033ULL, 0xc38cbee0dd778ee2ULL,
+	0x464f499c252eb162ULL, 0xcad1dbb96f72cea6ULL, 0xba4dd1eec142e241ULL,
+	0xb00fa37af42f0376ULL
+	}, {
+	0xcce4cd3aa968b245ULL, 0x089d5484e80b7fafULL, 0x638246c1b3548304ULL,
+	0xd2fe0ec8c2355492ULL, 0xa7fbdf7ff2374eeeULL, 0x4df1600c92337a16ULL,
+	0x84e503ea523b12fbULL, 0x0790bbfd53ab0c4aULL, 0x198a780f38f6ea9dULL,
+	0x2ab30c8f55ec48cbULL, 0xe0f7fed6b2c49db5ULL, 0xb6ecf3f422cadbdcULL,
+	0x409c9a541358df11ULL, 0xd3ce8a56dfde3fe3ULL, 0xc3e9224312c8c1a0ULL,
+	0x0d6dfa58816ba507ULL, 0xddf3e1b179952777ULL, 0x04c02a42748bb1d9ULL,
+	0x94c2abff9f2decb8ULL, 0x4f91752da8f8acf4ULL, 0x78682befb169bf7bULL,
+	0xe1c77a48af2ff6c4ULL, 0x0c5d7ec69c80ce76ULL, 0x4cc1e4928fd81167ULL,
+	0xfeed3d24d9997b62ULL, 0x518bb6dfc3a54a23ULL, 0x6dbf2d26151f9b90ULL,
+	0xb5bc624b05ea664fULL, 0xe86aaa525acfe21aULL, 0x4801ced0fb53a0beULL,
+	0xc91463e6c00868edULL, 0x1027a815cd16fe43ULL, 0xf67069a0319204cdULL,
+	0xb04ccc976c8abce7ULL, 0xc0b9b3fc35e87c33ULL, 0xf380c77c58f2de65ULL,
+	0x50bb3241de4e2152ULL, 0xdf93f490435ef195ULL, 0xf1e0d25d62390887ULL,
+	0xaf668bfb1a3c3141ULL, 0xbc11b251f00a7291ULL, 0x73a5eed47e427d47ULL,
+	0x25bee3f6ee4c3b2eULL, 0x43cc0beb34786282ULL, 0xc824e778dde3039cULL,
+	0xf97d86d98a327728ULL, 0xf2b043e24519b514ULL, 0xe297ebf7880f4b57ULL,
+	0x3a94a49a98fab688ULL, 0x868516cb68f0c419ULL, 0xeffa11af0964ee50ULL,
+	0xa4ab4ec0d517f37dULL, 0xa9c6b498547c567aULL, 0x8e18424f80fbbbb6ULL,
+	0x0bcdc53bcf2bc23cULL, 0x137739aaea3643d0ULL, 0x2c1333ec1bac2ff0ULL,
+	0x8d48d3f0a7db0625ULL, 0x1e1ac3f26b5de6d7ULL, 0xf520f81f16b2b95eULL,
+	0x9f0f6ec450062e84ULL, 0x0130849e1deb6b71ULL, 0xd45e31ab8c7533a9ULL,
+	0x652279a2fd14e43fULL, 0x3209f01e70f1c927ULL, 0xbe71a770cac1a473ULL,
+	0x0e3d6be7a64b1894ULL, 0x7ec8148cff29d840ULL, 0xcb7476c7fac3be0fULL,
+	0x72956a4a63a91636ULL, 0x37f95ec21991138fULL, 0x9e3fea5a4ded45f5ULL,
+	0x7b38ba50964902e8ULL, 0x222e580bbde73764ULL, 0x61e253e0899f55e6ULL,
+	0xfc8d2805e352ad80ULL, 0x35994be3235ac56dULL, 0x09add01af5e014deULL,
+	0x5e8659a6780539c6ULL, 0xb17c48097161d796ULL, 0x026015213acbd6e2ULL,
+	0xd1ae9f77e515e901ULL, 0xb7dc776a3f21b0adULL, 0xaba6a1b96eb78098ULL,
+	0x9bcf4486248d9f5dULL, 0x582666c536455efdULL, 0xfdbdac9bfeb9c6f1ULL,
+	0xc47999be4163cdeaULL, 0x765540081722a7efULL, 0x3e548ed8ec710751ULL,
+	0x3d041f67cb51bac2ULL, 0x7958af71ac82d40aULL, 0x36c9da5c047a78feULL,
+	0xed9a048e33af38b2ULL, 0x26ee7249c96c86bdULL, 0x900281bdeba65d61ULL,
+	0x11172c8bd0fd9532ULL, 0xea0abf73600434f8ULL, 0x42fc8f75299309f3ULL,
+	0x34a9cf7d3eb1ae1cULL, 0x2b838811480723baULL, 0x5ce64c8742ceef24ULL,
+	0x1adae9b01fd6570eULL, 0x3c349bf9d6bad1b3ULL, 0x82453c891c7b75c0ULL,
+	0x97923a40b80d512bULL, 0x4a61dbf1c198765cULL, 0xb48ce6d518010d3eULL,
+	0xcfb45c858e480fd6ULL, 0xd933cbf30d1e96aeULL, 0xd70ea014ab558e3aULL,
+	0xc189376228031742ULL, 0x9262949cd16d8b83ULL, 0xeb3a3bed7def5f89ULL,
+	0x49314a4ee6b8cbcfULL, 0xdcc3652f647e4c06ULL, 0xda635a4c2a3e2b3dULL,
+	0x470c21a940f3d35bULL, 0x315961a157d174b4ULL, 0x6672e81dda3459acULL,
+	0x5b76f77a1165e36eULL, 0x445cb01667d36ec8ULL, 0xc5491d205c88a69bULL,
+	0x456c34887a3805b9ULL, 0xffddb9bac4721013ULL, 0x99af51a71e4649bfULL,
+	0xa15be01cbc7729d5ULL, 0x52db2760e485f7b0ULL, 0x8c78576eba306d54ULL,
+	0xae560f6507d75a30ULL, 0x95f22f6182c687c9ULL, 0x71c5fbf54489aba5ULL,
+	0xca44f259e728d57eULL, 0x88b87d2ccebbdc8dULL, 0xbab18d32be4a15aaULL,
+	0x8be8ec93e99b611eULL, 0x17b713e89ebdf209ULL, 0xb31c5d284baa0174ULL,
+	0xeeca9531148f8521ULL, 0xb8d198138481c348ULL, 0x8988f9b2d350b7fcULL,
+	0xb9e11c8d996aa839ULL, 0x5a4673e40c8e881fULL, 0x1687977683569978ULL,
+	0xbf4123eed72acf02ULL, 0x4ea1f1b3b513c785ULL, 0xe767452be16f91ffULL,
+	0x7505d1b730021a7cULL, 0xa59bca5ec8fc980cULL, 0xad069eda20f7e7a3ULL,
+	0x38f4b1bba231606aULL, 0x60d2d77e94743e97ULL, 0x9affc0183966f42cULL,
+	0x248e6768f3a7505fULL, 0xcdd449a4b483d934ULL, 0x87b59255751baf68ULL,
+	0x1bea6d2e023d3c7fULL, 0x6b1f12455b5ffcabULL, 0x743555292de9710dULL,
+	0xd8034f6d10f5fddfULL, 0xc6198c9f7ba81b08ULL, 0xbb8109aca3a17edbULL,
+	0xfa2d1766ad12cabbULL, 0xc729080166437079ULL, 0x9c5fff7b77269317ULL,
+	0x0000000000000000ULL, 0x15d706c9a47624ebULL, 0x6fdf38072fd44d72ULL,
+	0x5fb6dd3865ee52b7ULL, 0xa33bf53d86bcff37ULL, 0xe657c1b5fc84fa8eULL,
+	0xaa962527735cebe9ULL, 0x39c43525bfda0b1bULL, 0x204e4d2a872ce186ULL,
+	0x7a083ece8ba26999ULL, 0x554b9c9db72efbfaULL, 0xb22cd9b656416a05ULL,
+	0x96a2bedea5e63a5aULL, 0x802529a826b0a322ULL, 0x8115ad363b5bc853ULL,
+	0x8375b81701901eb1ULL, 0x3069e53f4a3a1fc5ULL, 0xbd2136cfede119e0ULL,
+	0x18bafc91251d81ecULL, 0x1d4a524d4c7d5b44ULL, 0x05f0aedc6960daa8ULL,
+	0x29e39d3072ccf558ULL, 0x70f57f6b5962c0d4ULL, 0x989fd53903ad22ceULL,
+	0xf84d024797d91c59ULL, 0x547b1803aac5908bULL, 0xf0d056c37fd263f6ULL,
+	0xd56eb535919e58d8ULL, 0x1c7ad6d351963035ULL, 0x2e7326cd2167f912ULL,
+	0xac361a443d1c8cd2ULL, 0x697f076461942a49ULL, 0x4b515f6fdc731d2dULL,
+	0x8ad8680df4700a6fULL, 0x41ac1eca0eb3b460ULL, 0x7d988533d80965d3ULL,
+	0xa8f6300649973d0bULL, 0x7765c4960ac9cc9eULL, 0x7ca801adc5e20ea2ULL,
+	0xdea3700e5eb59ae4ULL, 0xa06b6482a19c42a4ULL, 0x6a2f96db46b497daULL,
+	0x27def6d7d487edccULL, 0x463ca5375d18b82aULL, 0xa6cb5be1efdc259fULL,
+	0x53eba3fef96e9cc1ULL, 0xce84d81b93a364a7ULL, 0xf4107c810b59d22fULL,
+	0x333974806d1aa256ULL, 0x0f0def79bba073e5ULL, 0x231edc95a00c5c15ULL,
+	0xe437d494c64f2c6cULL, 0x91320523f64d3610ULL, 0x67426c83c7df32ddULL,
+	0x6eefbc99323f2603ULL, 0x9d6f7be56acdf866ULL, 0x5916e25b2bae358cULL,
+	0x7ff89012e2c2b331ULL, 0x035091bf2720bd93ULL, 0x561b0d22900e4669ULL,
+	0x28d319ae6f279e29ULL, 0x2f43a2533c8c9263ULL, 0xd09e1be9f8fe8270ULL,
+	0xf740ed3e2c796fbcULL, 0xdb53ded237d5404cULL, 0x62b2c25faebfe875ULL,
+	0x0afd41a5d2c0a94dULL, 0x6412fd3ce0ff8f4eULL, 0xe3a76f6995e42026ULL,
+	0x6c8fa9b808f4f0e1ULL, 0xc2d9a6dd0f23aad1ULL, 0x8f28c6d19d10d0c7ULL,
+	0x85d587744fd0798aULL, 0xa20b71a39b579446ULL, 0x684f83fa7c7f4138ULL,
+	0xe507500adba4471dULL, 0x3f640a46f19a6c20ULL, 0x1247bd34f7dd28a1ULL,
+	0x2d23b77206474481ULL, 0x93521002cc86e0f2ULL, 0x572b89bc8de52d18ULL,
+	0xfb1d93f8b0f9a1caULL, 0xe95a2ecc4724896bULL, 0x3ba420048511ddf9ULL,
+	0xd63e248ab6bee54bULL, 0x5dd6c8195f258455ULL, 0x06a03f634e40673bULL,
+	0x1f2a476c76b68da6ULL, 0x217ec9b49ac78af7ULL, 0xecaa80102e4453c3ULL,
+	0x14e78257b99d4f9aULL
+	}, {
+	0x20329b2cc87bba05ULL, 0x4f5eb6f86546a531ULL, 0xd4f44775f751b6b1ULL,
+	0x8266a47b850dfa8bULL, 0xbb986aa15a6ca985ULL, 0xc979eb08f9ae0f99ULL,
+	0x2da6f447a2375ea1ULL, 0x1e74275dcd7d8576ULL, 0xbc20180a800bc5f8ULL,
+	0xb4a2f701b2dc65beULL, 0xe726946f981b6d66ULL, 0x48e6c453bf21c94cULL,
+	0x42cad9930f0a4195ULL, 0xefa47b64aacccd20ULL, 0x71180a8960409a42ULL,
+	0x8bb3329bf6a44e0cULL, 0xd34c35de2d36daccULL, 0xa92f5b7cbc23dc96ULL,
+	0xb31a85aa68bb09c3ULL, 0x13e04836a73161d2ULL, 0xb24dfc4129c51d02ULL,
+	0x8ae44b70b7da5acdULL, 0xe671ed84d96579a7ULL, 0xa4bb3417d66f3832ULL,
+	0x4572ab38d56d2de8ULL, 0xb1b47761ea47215cULL, 0xe81c09cf70aba15dULL,
+	0xffbdb872ce7f90acULL, 0xa8782297fd5dc857ULL, 0x0d946f6b6a4ce4a4ULL,
+	0xe4df1f4f5b995138ULL, 0x9ebc71edca8c5762ULL, 0x0a2c1dc0b02b88d9ULL,
+	0x3b503c115d9d7b91ULL, 0xc64376a8111ec3a2ULL, 0xcec199a323c963e4ULL,
+	0xdc76a87ec58616f7ULL, 0x09d596e073a9b487ULL, 0x14583a9d7d560dafULL,
+	0xf4c6dc593f2a0cb4ULL, 0xdd21d19584f80236ULL, 0x4a4836983ddde1d3ULL,
+	0xe58866a41ae745f9ULL, 0xf591a5b27e541875ULL, 0x891dc05074586693ULL,
+	0x5b068c651810a89eULL, 0xa30346bc0c08544fULL, 0x3dbf3751c684032dULL,
+	0x2a1e86ec785032dcULL, 0xf73f5779fca830eaULL, 0xb60c05ca30204d21ULL,
+	0x0cc316802b32f065ULL, 0x8770241bdd96be69ULL, 0xb861e18199ee95dbULL,
+	0xf805cad91418fcd1ULL, 0x29e70dccbbd20e82ULL, 0xc7140f435060d763ULL,
+	0x0f3a9da0e8b0cc3bULL, 0xa2543f574d76408eULL, 0xbd7761e1c175d139ULL,
+	0x4b1f4f737ca3f512ULL, 0x6dc2df1f2fc137abULL, 0xf1d05c3967b14856ULL,
+	0xa742bf3715ed046cULL, 0x654030141d1697edULL, 0x07b872abda676c7dULL,
+	0x3ce84eba87fa17ecULL, 0xc1fb0403cb79afdfULL, 0x3e46bc7105063f73ULL,
+	0x278ae987121cd678ULL, 0xa1adb4778ef47cd0ULL, 0x26dd906c5362c2b9ULL,
+	0x05168060589b44e2ULL, 0xfbfc41f9d79ac08fULL, 0x0e6de44ba9ced8faULL,
+	0x9feb08068bf243a3ULL, 0x7b341749d06b129bULL, 0x229c69e74a87929aULL,
+	0xe09ee6c4427c011bULL, 0x5692e30e725c4c3aULL, 0xda99a33e5e9f6e4bULL,
+	0x353dd85af453a36bULL, 0x25241b4c90e0fee7ULL, 0x5de987258309d022ULL,
+	0xe230140fc0802984ULL, 0x93281e86a0c0b3c6ULL, 0xf229d719a4337408ULL,
+	0x6f6c2dd4ad3d1f34ULL, 0x8ea5b2fbae3f0aeeULL, 0x8331dd90c473ee4aULL,
+	0x346aa1b1b52db7aaULL, 0xdf8f235e06042aa9ULL, 0xcc6f6b68a1354b7bULL,
+	0x6c95a6f46ebf236aULL, 0x52d31a856bb91c19ULL, 0x1a35ded6d498d555ULL,
+	0xf37eaef2e54d60c9ULL, 0x72e181a9a3c2a61cULL, 0x98537aad51952fdeULL,
+	0x16f6c856ffaa2530ULL, 0xd960281e9d1d5215ULL, 0x3a0745fa1ce36f50ULL,
+	0x0b7b642bf1559c18ULL, 0x59a87eae9aec8001ULL, 0x5e100c05408bec7cULL,
+	0x0441f98b19e55023ULL, 0xd70dcc5534d38aefULL, 0x927f676de1bea707ULL,
+	0x9769e70db925e3e5ULL, 0x7a636ea29115065aULL, 0x468b201816ef11b6ULL,
+	0xab81a9b73edff409ULL, 0xc0ac7de88a07bb1eULL, 0x1f235eb68c0391b7ULL,
+	0x6056b074458dd30fULL, 0xbe8eeac102f7ed67ULL, 0xcd381283e04b5fbaULL,
+	0x5cbefecec277c4e3ULL, 0xd21b4c356c48ce0dULL, 0x1019c31664b35d8cULL,
+	0x247362a7d19eea26ULL, 0xebe582efb3299d03ULL, 0x02aef2cb82fc289fULL,
+	0x86275df09ce8aaa8ULL, 0x28b07427faac1a43ULL, 0x38a9b7319e1f47cfULL,
+	0xc82e92e3b8d01b58ULL, 0x06ef0b409b1978bcULL, 0x62f842bfc771fb90ULL,
+	0x9904034610eb3b1fULL, 0xded85ab5477a3e68ULL, 0x90d195a663428f98ULL,
+	0x5384636e2ac708d8ULL, 0xcbd719c37b522706ULL, 0xae9729d76644b0ebULL,
+	0x7c8c65e20a0c7ee6ULL, 0x80c856b007f1d214ULL, 0x8c0b40302cc32271ULL,
+	0xdbcedad51fe17a8aULL, 0x740e8ae938dbdea0ULL, 0xa615c6dc549310adULL,
+	0x19cc55f6171ae90bULL, 0x49b1bdb8fe5fdd8dULL, 0xed0a89af2830e5bfULL,
+	0x6a7aadb4f5a65bd6ULL, 0x7e22972988f05679ULL, 0xf952b3325566e810ULL,
+	0x39fecedadf61530eULL, 0x6101c99f04f3c7ceULL, 0x2e5f7f6761b562ffULL,
+	0xf08725d226cf5c97ULL, 0x63af3b54860fef51ULL, 0x8ff2cb10ef411e2fULL,
+	0x884ab9bb35267252ULL, 0x4df04433e7ba8daeULL, 0x9afd8866d3690741ULL,
+	0x66b9bb34de94abb3ULL, 0x9baaf18d92171380ULL, 0x543c11c5f0a064a5ULL,
+	0x17a1b1bdbed431f1ULL, 0xb5f58eeaf3a2717fULL, 0xc355f6c849858740ULL,
+	0xec5df044694ef17eULL, 0xd83751f5dc6346d4ULL, 0xfc4433520dfdacf2ULL,
+	0x0000000000000000ULL, 0x5a51f58e596ebc5fULL, 0x3285aaf12e34cf16ULL,
+	0x8d5c39db6dbd36b0ULL, 0x12b731dde64f7513ULL, 0x94906c2d7aa7dfbbULL,
+	0x302b583aacc8e789ULL, 0x9d45facd090e6b3cULL, 0x2165e2c78905aec4ULL,
+	0x68d45f7f775a7349ULL, 0x189b2c1d5664fdcaULL, 0xe1c99f2f030215daULL,
+	0x6983269436246788ULL, 0x8489af3b1e148237ULL, 0xe94b702431d5b59cULL,
+	0x33d2d31a6f4adbd7ULL, 0xbfd9932a4389f9a6ULL, 0xb0e30e8aab39359dULL,
+	0xd1e2c715afcaf253ULL, 0x150f43763c28196eULL, 0xc4ed846393e2eb3dULL,
+	0x03f98b20c3823c5eULL, 0xfd134ab94c83b833ULL, 0x556b682eb1de7064ULL,
+	0x36c4537a37d19f35ULL, 0x7559f30279a5ca61ULL, 0x799ae58252973a04ULL,
+	0x9c12832648707ffdULL, 0x78cd9c6913e92ec5ULL, 0x1d8dac7d0effb928ULL,
+	0x439da0784e745554ULL, 0x413352b3cc887dcbULL, 0xbacf134a1b12bd44ULL,
+	0x114ebafd25cd494dULL, 0x2f08068c20cb763eULL, 0x76a07822ba27f63fULL,
+	0xeab2fb04f25789c2ULL, 0xe3676de481fe3d45ULL, 0x1b62a73d95e6c194ULL,
+	0x641749ff5c68832cULL, 0xa5ec4dfc97112cf3ULL, 0xf6682e92bdd6242bULL,
+	0x3f11c59a44782bb2ULL, 0x317c21d1edb6f348ULL, 0xd65ab5be75ad9e2eULL,
+	0x6b2dd45fb4d84f17ULL, 0xfaab381296e4d44eULL, 0xd0b5befeeeb4e692ULL,
+	0x0882ef0b32d7a046ULL, 0x512a91a5a83b2047ULL, 0x963e9ee6f85bf724ULL,
+	0x4e09cf132438b1f0ULL, 0x77f701c9fb59e2feULL, 0x7ddb1c094b726a27ULL,
+	0x5f4775ee01f5f8bdULL, 0x9186ec4d223c9b59ULL, 0xfeeac1998f01846dULL,
+	0xac39db1ce4b89874ULL, 0xb75b7c21715e59e0ULL, 0xafc0503c273aa42aULL,
+	0x6e3b543fec430bf5ULL, 0x704f7362213e8e83ULL, 0x58ff0745db9294c0ULL,
+	0x67eec2df9feabf72ULL, 0xa0facd9ccf8a6811ULL, 0xb936986ad890811aULL,
+	0x95c715c63bd9cb7aULL, 0xca8060283a2c33c7ULL, 0x507de84ee9453486ULL,
+	0x85ded6d05f6a96f6ULL, 0x1cdad5964f81ade9ULL, 0xd5a33e9eb62fa270ULL,
+	0x40642b588df6690aULL, 0x7f75eec2c98e42b8ULL, 0x2cf18dace3494a60ULL,
+	0x23cb100c0bf9865bULL, 0xeef3028febb2d9e1ULL, 0x4425d2d394133929ULL,
+	0xaad6d05c7fa1e0c8ULL, 0xad6ea2f7a5c68cb5ULL, 0xc2028f2308fb9381ULL,
+	0x819f2f5b468fc6d5ULL, 0xc5bafd88d29cfffcULL, 0x47dc59f357910577ULL,
+	0x2b49ff07392e261dULL, 0x57c59ae5332258fbULL, 0x73b6f842e2bcb2ddULL,
+	0xcf96e04862b77725ULL, 0x4ca73dd8a6c4996fULL, 0x015779eb417e14c1ULL,
+	0x37932a9176af8bf4ULL
+	}, {
+	0x190a2c9b249df23eULL, 0x2f62f8b62263e1e9ULL, 0x7a7f754740993655ULL,
+	0x330b7ba4d5564d9fULL, 0x4c17a16a46672582ULL, 0xb22f08eb7d05f5b8ULL,
+	0x535f47f40bc148ccULL, 0x3aec5d27d4883037ULL, 0x10ed0a1825438f96ULL,
+	0x516101f72c233d17ULL, 0x13cc6f949fd04eaeULL, 0x739853c441474bfdULL,
+	0x653793d90d3f5b1bULL, 0x5240647b96b0fc2fULL, 0x0c84890ad27623e0ULL,
+	0xd7189b32703aaea3ULL, 0x2685de3523bd9c41ULL, 0x99317c5b11bffefaULL,
+	0x0d9baa854f079703ULL, 0x70b93648fbd48ac5ULL, 0xa80441fce30bc6beULL,
+	0x7287704bdc36ff1eULL, 0xb65384ed33dc1f13ULL, 0xd36417343ee34408ULL,
+	0x39cd38ab6e1bf10fULL, 0x5ab861770a1f3564ULL, 0x0ebacf09f594563bULL,
+	0xd04572b884708530ULL, 0x3cae9722bdb3af47ULL, 0x4a556b6f2f5cbaf2ULL,
+	0xe1704f1f76c4bd74ULL, 0x5ec4ed7144c6dfcfULL, 0x16afc01d4c7810e6ULL,
+	0x283f113cd629ca7aULL, 0xaf59a8761741ed2dULL, 0xeed5a3991e215facULL,
+	0x3bf37ea849f984d4ULL, 0xe413e096a56ce33cULL, 0x2c439d3a98f020d1ULL,
+	0x637559dc6404c46bULL, 0x9e6c95d1e5f5d569ULL, 0x24bb9836045fe99aULL,
+	0x44efa466dac8ecc9ULL, 0xc6eab2a5c80895d6ULL, 0x803b50c035220cc4ULL,
+	0x0321658cba93c138ULL, 0x8f9ebc465dc7ee1cULL, 0xd15a5137190131d3ULL,
+	0x0fa5ec8668e5e2d8ULL, 0x91c979578d1037b1ULL, 0x0642ca05693b9f70ULL,
+	0xefca80168350eb4fULL, 0x38d21b24f36a45ecULL, 0xbeab81e1af73d658ULL,
+	0x8cbfd9cae7542f24ULL, 0xfd19cc0d81f11102ULL, 0x0ac6430fbb4dbc90ULL,
+	0x1d76a09d6a441895ULL, 0x2a01573ff1cbbfa1ULL, 0xb572e161894fde2bULL,
+	0x8124734fa853b827ULL, 0x614b1fdf43e6b1b0ULL, 0x68ac395c4238cc18ULL,
+	0x21d837bfd7f7b7d2ULL, 0x20c714304a860331ULL, 0x5cfaab726324aa14ULL,
+	0x74c5ba4eb50d606eULL, 0xf3a3030474654739ULL, 0x23e671bcf015c209ULL,
+	0x45f087e947b9582aULL, 0xd8bd77b418df4c7bULL, 0xe06f6c90ebb50997ULL,
+	0x0bd96080263c0873ULL, 0x7e03f9410e40dcfeULL, 0xb8e94be4c6484928ULL,
+	0xfb5b0608e8ca8e72ULL, 0x1a2b49179e0e3306ULL, 0x4e29e76961855059ULL,
+	0x4f36c4e6fcf4e4baULL, 0x49740ee395cf7bcaULL, 0xc2963ea386d17f7dULL,
+	0x90d65ad810618352ULL, 0x12d34c1b02a1fa4dULL, 0xfa44258775bb3a91ULL,
+	0x18150f14b9ec46ddULL, 0x1491861e6b9a653dULL, 0x9a1019d7ab2c3fc2ULL,
+	0x3668d42d06fe13d7ULL, 0xdcc1fbb25606a6d0ULL, 0x969490dd795a1c22ULL,
+	0x3549b1a1bc6dd2efULL, 0xc94f5e23a0ed770eULL, 0xb9f6686b5b39fdcbULL,
+	0xc4d4f4a6efeae00dULL, 0xe732851a1fff2204ULL, 0x94aad6de5eb869f9ULL,
+	0x3f8ff2ae07206e7fULL, 0xfe38a9813b62d03aULL, 0xa7a1ad7a8bee2466ULL,
+	0x7b6056c8dde882b6ULL, 0x302a1e286fc58ca7ULL, 0x8da0fa457a259bc7ULL,
+	0xb3302b64e074415bULL, 0x5402ae7eff8b635fULL, 0x08f8050c9cafc94bULL,
+	0xae468bf98a3059ceULL, 0x88c355cca98dc58fULL, 0xb10e6d67c7963480ULL,
+	0xbad70de7e1aa3cf3ULL, 0xbfb4a26e320262bbULL, 0xcb711820870f02d5ULL,
+	0xce12b7a954a75c9dULL, 0x563ce87dd8691684ULL, 0x9f73b65e7884618aULL,
+	0x2b1e74b06cba0b42ULL, 0x47cec1ea605b2df1ULL, 0x1c698312f735ac76ULL,
+	0x5fdbcefed9b76b2cULL, 0x831a354c8fb1cdfcULL, 0x820516c312c0791fULL,
+	0xb74ca762aeadabf0ULL, 0xfc06ef821c80a5e1ULL, 0x5723cbf24518a267ULL,
+	0x9d4df05d5f661451ULL, 0x588627742dfd40bfULL, 0xda8331b73f3d39a0ULL,
+	0x17b0e392d109a405ULL, 0xf965400bcf28fba9ULL, 0x7c3dbf4229a2a925ULL,
+	0x023e460327e275dbULL, 0x6cd0b55a0ce126b3ULL, 0xe62da695828e96e7ULL,
+	0x42ad6e63b3f373b9ULL, 0xe50cc319381d57dfULL, 0xc5cbd729729b54eeULL,
+	0x46d1e265fd2a9912ULL, 0x6428b056904eeff8ULL, 0x8be23040131e04b7ULL,
+	0x6709d5da2add2ec0ULL, 0x075de98af44a2b93ULL, 0x8447dcc67bfbe66fULL,
+	0x6616f655b7ac9a23ULL, 0xd607b8bded4b1a40ULL, 0x0563af89d3a85e48ULL,
+	0x3db1b4ad20c21ba4ULL, 0x11f22997b8323b75ULL, 0x292032b34b587e99ULL,
+	0x7f1cdace9331681dULL, 0x8e819fc9c0b65affULL, 0xa1e3677fe2d5bb16ULL,
+	0xcd33d225ee349da5ULL, 0xd9a2543b85aef898ULL, 0x795e10cbfa0af76dULL,
+	0x25a4bbb9992e5d79ULL, 0x78413344677b438eULL, 0xf0826688cef68601ULL,
+	0xd27b34bba392f0ebULL, 0x551d8df162fad7bcULL, 0x1e57c511d0d7d9adULL,
+	0xdeffbdb171e4d30bULL, 0xf4feea8e802f6caaULL, 0xa480c8f6317de55eULL,
+	0xa0fc44f07fa40ff5ULL, 0x95b5f551c3c9dd1aULL, 0x22f952336d6476eaULL,
+	0x0000000000000000ULL, 0xa6be8ef5169f9085ULL, 0xcc2cf1aa73452946ULL,
+	0x2e7ddb39bf12550aULL, 0xd526dd3157d8db78ULL, 0x486b2d6c08becf29ULL,
+	0x9b0f3a58365d8b21ULL, 0xac78cdfaadd22c15ULL, 0xbc95c7e28891a383ULL,
+	0x6a927f5f65dab9c3ULL, 0xc3891d2c1ba0cb9eULL, 0xeaa92f9f50f8b507ULL,
+	0xcf0d9426c9d6e87eULL, 0xca6e3baf1a7eb636ULL, 0xab25247059980786ULL,
+	0x69b31ad3df4978fbULL, 0xe2512a93cc577c4cULL, 0xff278a0ea61364d9ULL,
+	0x71a615c766a53e26ULL, 0x89dc764334fc716cULL, 0xf87a638452594f4aULL,
+	0xf2bc208be914f3daULL, 0x8766b94ac1682757ULL, 0xbbc82e687cdb8810ULL,
+	0x626a7a53f9757088ULL, 0xa2c202f358467a2eULL, 0x4d0882e5db169161ULL,
+	0x09e7268301de7da8ULL, 0xe897699c771ac0dcULL, 0xc8507dac3d9cc3edULL,
+	0xc0a878a0a1330aa6ULL, 0x978bb352e42ba8c1ULL, 0xe9884a13ea6b743fULL,
+	0x279afdbabecc28a2ULL, 0x047c8c064ed9eaabULL, 0x507e2278b15289f4ULL,
+	0x599904fbb08cf45cULL, 0xbd8ae46d15e01760ULL, 0x31353da7f2b43844ULL,
+	0x8558ff49e68a528cULL, 0x76fbfc4d92ef15b5ULL, 0x3456922e211c660cULL,
+	0x86799ac55c1993b4ULL, 0x3e90d1219a51da9cULL, 0x2d5cbeb505819432ULL,
+	0x982e5fd48cce4a19ULL, 0xdb9c1238a24c8d43ULL, 0xd439febecaa96f9bULL,
+	0x418c0bef0960b281ULL, 0x158ea591f6ebd1deULL, 0x1f48e69e4da66d4eULL,
+	0x8afd13cf8e6fb054ULL, 0xf5e1c9011d5ed849ULL, 0xe34e091c5126c8afULL,
+	0xad67ee7530a398f6ULL, 0x43b24dec2e82c75aULL, 0x75da99c1287cd48dULL,
+	0x92e81cdb3783f689ULL, 0xa3dd217cc537cecdULL, 0x60543c50de970553ULL,
+	0x93f73f54aaf2426aULL, 0xa91b62737e7a725dULL, 0xf19d4507538732e2ULL,
+	0x77e4dfc20f9ea156ULL, 0x7d229ccdb4d31dc6ULL, 0x1b346a98037f87e5ULL,
+	0xedf4c615a4b29e94ULL, 0x4093286094110662ULL, 0xb0114ee85ae78063ULL,
+	0x6ff1d0d6b672e78bULL, 0x6dcf96d591909250ULL, 0xdfe09e3eec9567e8ULL,
+	0x3214582b4827f97cULL, 0xb46dc2ee143e6ac8ULL, 0xf6c0ac8da7cd1971ULL,
+	0xebb60c10cd8901e4ULL, 0xf7df8f023abcad92ULL, 0x9c52d3d2c217a0b2ULL,
+	0x6b8d5cd0f8ab0d20ULL, 0x3777f7a29b8fa734ULL, 0x011f238f9d71b4e3ULL,
+	0xc1b75b2f3c42be45ULL, 0x5de588fdfe551ef7ULL, 0x6eeef3592b035368ULL,
+	0xaa3a07ffc4e9b365ULL, 0xecebe59a39c32a77ULL, 0x5ba742f8976e8187ULL,
+	0x4b4a48e0b22d0e11ULL, 0xddded83dcb771233ULL, 0xa59feb79ac0c51bdULL,
+	0xc7f5912a55792135ULL
+	}, {
+	0x6d6ae04668a9b08aULL, 0x3ab3f04b0be8c743ULL, 0xe51e166b54b3c908ULL,
+	0xbe90a9eb35c2f139ULL, 0xb2c7066637f2bec1ULL, 0xaa6945613392202cULL,
+	0x9a28c36f3b5201ebULL, 0xddce5a93ab536994ULL, 0x0e34133ef6382827ULL,
+	0x52a02ba1ec55048bULL, 0xa2f88f97c4b2a177ULL, 0x8640e513ca2251a5ULL,
+	0xcdf1d36258137622ULL, 0xfe6cb708dedf8ddbULL, 0x8a174a9ec8121e5dULL,
+	0x679896036b81560eULL, 0x59ed033395795feeULL, 0x1dd778ab8b74edafULL,
+	0xee533ef92d9f926dULL, 0x2a8c79baf8a8d8f5ULL, 0x6bcf398e69b119f6ULL,
+	0xe20491742fafdd95ULL, 0x276488e0809c2aecULL, 0xea955b82d88f5cceULL,
+	0x7102c63a99d9e0c4ULL, 0xf9763017a5c39946ULL, 0x429fa2501f151b3dULL,
+	0x4659c72bea05d59eULL, 0x984b7fdccf5a6634ULL, 0xf742232953fbb161ULL,
+	0x3041860e08c021c7ULL, 0x747bfd9616cd9386ULL, 0x4bb1367192312787ULL,
+	0x1b72a1638a6c44d3ULL, 0x4a0e68a6e8359a66ULL, 0x169a5039f258b6caULL,
+	0xb98a2ef44edee5a4ULL, 0xd9083fe85e43a737ULL, 0x967f6ce239624e13ULL,
+	0x8874f62d3c1a7982ULL, 0x3c1629830af06e3fULL, 0x9165ebfd427e5a8eULL,
+	0xb5dd81794ceeaa5cULL, 0x0de8f15a7834f219ULL, 0x70bd98ede3dd5d25ULL,
+	0xaccc9ca9328a8950ULL, 0x56664eda1945ca28ULL, 0x221db34c0f8859aeULL,
+	0x26dbd637fa98970dULL, 0x1acdffb4f068f932ULL, 0x4585254f64090fa0ULL,
+	0x72de245e17d53afaULL, 0x1546b25d7c546cf4ULL, 0x207e0ffffb803e71ULL,
+	0xfaaad2732bcf4378ULL, 0xb462dfae36ea17bdULL, 0xcf926fd1ac1b11fdULL,
+	0xe0672dc7dba7ba4aULL, 0xd3fa49ad5d6b41b3ULL, 0x8ba81449b216a3bcULL,
+	0x14f9ec8a0650d115ULL, 0x40fc1ee3eb1d7ce2ULL, 0x23a2ed9b758ce44fULL,
+	0x782c521b14fddc7eULL, 0x1c68267cf170504eULL, 0xbcf31558c1ca96e6ULL,
+	0xa781b43b4ba6d235ULL, 0xf6fd7dfe29ff0c80ULL, 0xb0a4bad5c3fad91eULL,
+	0xd199f51ea963266cULL, 0x414340349119c103ULL, 0x5405f269ed4dadf7ULL,
+	0xabd61bb649969dcdULL, 0x6813dbeae7bdc3c8ULL, 0x65fb2ab09f8931d1ULL,
+	0xf1e7fae152e3181dULL, 0xc1a67cef5a2339daULL, 0x7a4feea8e0f5bba1ULL,
+	0x1e0b9acf05783791ULL, 0x5b8ebf8061713831ULL, 0x80e53cdbcb3af8d9ULL,
+	0x7e898bd315e57502ULL, 0xc6bcfbf0213f2d47ULL, 0x95a38e86b76e942dULL,
+	0x092e94218d243cbaULL, 0x8339debf453622e7ULL, 0xb11be402b9fe64ffULL,
+	0x57d9100d634177c9ULL, 0xcc4e8db52217cbc3ULL, 0x3b0cae9c71ec7aa2ULL,
+	0xfb158ca451cbfe99ULL, 0x2b33276d82ac6514ULL, 0x01bf5ed77a04bde1ULL,
+	0xc5601994af33f779ULL, 0x75c4a3416cc92e67ULL, 0xf3844652a6eb7fc2ULL,
+	0x3487e375fdd0ef64ULL, 0x18ae430704609eedULL, 0x4d14efb993298efbULL,
+	0x815a620cb13e4538ULL, 0x125c354207487869ULL, 0x9eeea614ce42cf48ULL,
+	0xce2d3106d61fac1cULL, 0xbbe99247bad6827bULL, 0x071a871f7b1c149dULL,
+	0x2e4a1cc10db81656ULL, 0x77a71ff298c149b8ULL, 0x06a5d9c80118a97cULL,
+	0xad73c27e488e34b1ULL, 0x443a7b981e0db241ULL, 0xe3bbcfa355ab6074ULL,
+	0x0af276450328e684ULL, 0x73617a896dd1871bULL, 0x58525de4ef7de20fULL,
+	0xb7be3dcab8e6cd83ULL, 0x19111dd07e64230cULL, 0x842359a03e2a367aULL,
+	0x103f89f1f3401fb6ULL, 0xdc710444d157d475ULL, 0xb835702334da5845ULL,
+	0x4320fc876511a6dcULL, 0xd026abc9d3679b8dULL, 0x17250eee885c0b2bULL,
+	0x90dab52a387ae76fULL, 0x31fed8d972c49c26ULL, 0x89cba8fa461ec463ULL,
+	0x2ff5421677bcabb7ULL, 0x396f122f85e41d7dULL, 0xa09b332430bac6a8ULL,
+	0xc888e8ced7070560ULL, 0xaeaf201ac682ee8fULL, 0x1180d7268944a257ULL,
+	0xf058a43628e7a5fcULL, 0xbd4c4b8fbbce2b07ULL, 0xa1246df34abe7b49ULL,
+	0x7d5569b79be9af3cULL, 0xa9b5a705bd9efa12ULL, 0xdb6b835baa4bc0e8ULL,
+	0x05793bac8f147342ULL, 0x21c1512881848390ULL, 0xfdb0556c50d357e5ULL,
+	0x613d4fcb6a99ff72ULL, 0x03dce2648e0cda3eULL, 0xe949b9e6568386f0ULL,
+	0xfc0f0bbb2ad7ea04ULL, 0x6a70675913b5a417ULL, 0x7f36d5046fe1c8e3ULL,
+	0x0c57af8d02304ff8ULL, 0x32223abdfcc84618ULL, 0x0891caf6f720815bULL,
+	0xa63eeaec31a26fd4ULL, 0x2507345374944d33ULL, 0x49d28ac266394058ULL,
+	0xf5219f9aa7f3d6beULL, 0x2d96fea583b4cc68ULL, 0x5a31e1571b7585d0ULL,
+	0x8ed12fe53d02d0feULL, 0xdfade6205f5b0e4bULL, 0x4cabb16ee92d331aULL,
+	0x04c6657bf510cea3ULL, 0xd73c2cd6a87b8f10ULL, 0xe1d87310a1a307abULL,
+	0x6cd5be9112ad0d6bULL, 0x97c032354366f3f2ULL, 0xd4e0ceb22677552eULL,
+	0x0000000000000000ULL, 0x29509bde76a402cbULL, 0xc27a9e8bd42fe3e4ULL,
+	0x5ef7842cee654b73ULL, 0xaf107ecdbc86536eULL, 0x3fcacbe784fcb401ULL,
+	0xd55f90655c73e8cfULL, 0xe6c2f40fdabf1336ULL, 0xe8f6e7312c873b11ULL,
+	0xeb2a0555a28be12fULL, 0xe4a148bc2eb774e9ULL, 0x9b979db84156bc0aULL,
+	0x6eb60222e6a56ab4ULL, 0x87ffbbc4b026ec44ULL, 0xc703a5275b3b90a6ULL,
+	0x47e699fc9001687fULL, 0x9c8d1aa73a4aa897ULL, 0x7cea3760e1ed12ddULL,
+	0x4ec80ddd1d2554c5ULL, 0x13e36b957d4cc588ULL, 0x5d2b66486069914dULL,
+	0x92b90999cc7280b0ULL, 0x517cc9c56259deb5ULL, 0xc937b619ad03b881ULL,
+	0xec30824ad997f5b2ULL, 0xa45d565fc5aa080bULL, 0xd6837201d27f32f1ULL,
+	0x635ef3789e9198adULL, 0x531f75769651b96aULL, 0x4f77530a6721e924ULL,
+	0x486dd4151c3dfdb9ULL, 0x5f48dafb9461f692ULL, 0x375b011173dc355aULL,
+	0x3da9775470f4d3deULL, 0x8d0dcd81b30e0ac0ULL, 0x36e45fc609d888bbULL,
+	0x55baacbe97491016ULL, 0x8cb29356c90ab721ULL, 0x76184125e2c5f459ULL,
+	0x99f4210bb55edbd5ULL, 0x6f095cf59ca1d755ULL, 0x9f51f8c3b44672a9ULL,
+	0x3538bda287d45285ULL, 0x50c39712185d6354ULL, 0xf23b1885dcefc223ULL,
+	0x79930ccc6ef9619fULL, 0xed8fdc9da3934853ULL, 0xcb540aaa590bdf5eULL,
+	0x5c94389f1a6d2cacULL, 0xe77daad8a0bbaed7ULL, 0x28efc5090ca0bf2aULL,
+	0xbf2ff73c4fc64cd8ULL, 0xb37858b14df60320ULL, 0xf8c96ec0dfc724a7ULL,
+	0x828680683f329f06ULL, 0x941cd051cd6a29ccULL, 0xc3c5c05cae2b5e05ULL,
+	0xb601631dc2e27062ULL, 0xc01922382027843bULL, 0x24b86a840e90f0d2ULL,
+	0xd245177a276ffc52ULL, 0x0f8b4de98c3c95c6ULL, 0x3e759530fef809e0ULL,
+	0x0b4d2892792c5b65ULL, 0xc4df4743d5374a98ULL, 0xa5e20888bfaeb5eaULL,
+	0xba56cc90c0d23f9aULL, 0x38d04cf8ffe0a09cULL, 0x62e1adafe495254cULL,
+	0x0263bcb3f40867dfULL, 0xcaeb547d230f62bfULL, 0x6082111c109d4293ULL,
+	0xdad4dd8cd04f7d09ULL, 0xefec602e579b2f8cULL, 0x1fb4c4187f7c8a70ULL,
+	0xffd3e9dfa4db303aULL, 0x7bf0b07f9af10640ULL, 0xf49ec14dddf76b5fULL,
+	0x8f6e713247066d1fULL, 0x339d646a86ccfbf9ULL, 0x64447467e58d8c30ULL,
+	0x2c29a072f9b07189ULL, 0xd8b7613f24471ad6ULL, 0x6627c8d41185ebefULL,
+	0xa347d140beb61c96ULL, 0xde12b8f7255fb3aaULL, 0x9d324470404e1576ULL,
+	0x9306574eb6763d51ULL, 0xa80af9d2c79a47f3ULL, 0x859c0777442e8b9bULL,
+	0x69ac853d9db97e29ULL
+	}, {
+	0xc3407dfc2de6377eULL, 0x5b9e93eea4256f77ULL, 0xadb58fdd50c845e0ULL,
+	0x5219ff11a75bed86ULL, 0x356b61cfd90b1de9ULL, 0xfb8f406e25abe037ULL,
+	0x7a5a0231c0f60796ULL, 0x9d3cd216e1f5020bULL, 0x0c6550fb6b48d8f3ULL,
+	0xf57508c427ff1c62ULL, 0x4ad35ffa71cb407dULL, 0x6290a2da1666aa6dULL,
+	0xe284ec2349355f9fULL, 0xb3c307c53d7c84ecULL, 0x05e23c0468365a02ULL,
+	0x190bac4d6c9ebfa8ULL, 0x94bbbee9e28b80faULL, 0xa34fc777529cb9b5ULL,
+	0xcc7b39f095bcd978ULL, 0x2426addb0ce532e3ULL, 0x7e79329312ce4fc7ULL,
+	0xab09a72eebec2917ULL, 0xf8d15499f6b9d6c2ULL, 0x1a55b8babf8c895dULL,
+	0xdb8add17fb769a85ULL, 0xb57f2f368658e81bULL, 0x8acd36f18f3f41f6ULL,
+	0x5ce3b7bba50f11d3ULL, 0x114dcc14d5ee2f0aULL, 0xb91a7fcded1030e8ULL,
+	0x81d5425fe55de7a1ULL, 0xb6213bc1554adeeeULL, 0x80144ef95f53f5f2ULL,
+	0x1e7688186db4c10cULL, 0x3b912965db5fe1bcULL, 0xc281715a97e8252dULL,
+	0x54a5d7e21c7f8171ULL, 0x4b12535ccbc5522eULL, 0x1d289cefbea6f7f9ULL,
+	0x6ef5f2217d2e729eULL, 0xe6a7dc819b0d17ceULL, 0x1b94b41c05829b0eULL,
+	0x33d7493c622f711eULL, 0xdcf7f942fa5ce421ULL, 0x600fba8b7f7a8ecbULL,
+	0x46b60f011a83988eULL, 0x235b898e0dcf4c47ULL, 0x957ab24f588592a9ULL,
+	0x4354330572b5c28cULL, 0xa5f3ef84e9b8d542ULL, 0x8c711e02341b2d01ULL,
+	0x0b1874ae6a62a657ULL, 0x1213d8e306fc19ffULL, 0xfe6d7c6a4d9dba35ULL,
+	0x65ed868f174cd4c9ULL, 0x88522ea0e6236550ULL, 0x899322065c2d7703ULL,
+	0xc01e690bfef4018bULL, 0x915982ed8abddaf8ULL, 0xbe675b98ec3a4e4cULL,
+	0xa996bf7f82f00db1ULL, 0xe1daf8d49a27696aULL, 0x2effd5d3dc8986e7ULL,
+	0xd153a51f2b1a2e81ULL, 0x18caa0ebd690adfbULL, 0x390e3134b243c51aULL,
+	0x2778b92cdff70416ULL, 0x029f1851691c24a6ULL, 0x5e7cafeacc133575ULL,
+	0xfa4e4cc89fa5f264ULL, 0x5a5f9f481e2b7d24ULL, 0x484c47ab18d764dbULL,
+	0x400a27f2a1a7f479ULL, 0xaeeb9b2a83da7315ULL, 0x721c626879869734ULL,
+	0x042330a2d2384851ULL, 0x85f672fd3765aff0ULL, 0xba446b3a3e02061dULL,
+	0x73dd6ecec3888567ULL, 0xffac70ccf793a866ULL, 0xdfa9edb5294ed2d4ULL,
+	0x6c6aea7014325638ULL, 0x834a5a0e8c41c307ULL, 0xcdba35562fb2cb2bULL,
+	0x0ad97808d06cb404ULL, 0x0f3b440cb85aee06ULL, 0xe5f9c876481f213bULL,
+	0x98deee1289c35809ULL, 0x59018bbfcd394bd1ULL, 0xe01bf47220297b39ULL,
+	0xde68e1139340c087ULL, 0x9fa3ca4788e926adULL, 0xbb85679c840c144eULL,
+	0x53d8f3b71d55ffd5ULL, 0x0da45c5dd146caa0ULL, 0x6f34fe87c72060cdULL,
+	0x57fbc315cf6db784ULL, 0xcee421a1fca0fddeULL, 0x3d2d0196607b8d4bULL,
+	0x642c8a29ad42c69aULL, 0x14aff010bdd87508ULL, 0xac74837beac657b3ULL,
+	0x3216459ad821634dULL, 0x3fb219c70967a9edULL, 0x06bc28f3bb246cf7ULL,
+	0xf2082c9126d562c6ULL, 0x66b39278c45ee23cULL, 0xbd394f6f3f2878b9ULL,
+	0xfd33689d9e8f8cc0ULL, 0x37f4799eb017394fULL, 0x108cc0b26fe03d59ULL,
+	0xda4bd1b1417888d6ULL, 0xb09d1332ee6eb219ULL, 0x2f3ed975668794b4ULL,
+	0x58c0871977375982ULL, 0x7561463d78ace990ULL, 0x09876cff037e82f1ULL,
+	0x7fb83e35a8c05d94ULL, 0x26b9b58a65f91645ULL, 0xef20b07e9873953fULL,
+	0x3148516d0b3355b8ULL, 0x41cb2b541ba9e62aULL, 0x790416c613e43163ULL,
+	0xa011d380818e8f40ULL, 0x3a5025c36151f3efULL, 0xd57095bdf92266d0ULL,
+	0x498d4b0da2d97688ULL, 0x8b0c3a57353153a5ULL, 0x21c491df64d368e1ULL,
+	0x8f2f0af5e7091bf4ULL, 0x2da1c1240f9bb012ULL, 0xc43d59a92ccc49daULL,
+	0xbfa6573e56345c1fULL, 0x828b56a8364fd154ULL, 0x9a41f643e0df7cafULL,
+	0xbcf843c985266aeaULL, 0x2b1de9d7b4bfdce5ULL, 0x20059d79dedd7ab2ULL,
+	0x6dabe6d6ae3c446bULL, 0x45e81bf6c991ae7bULL, 0x6351ae7cac68b83eULL,
+	0xa432e32253b6c711ULL, 0xd092a9b991143cd2ULL, 0xcac711032e98b58fULL,
+	0xd8d4c9e02864ac70ULL, 0xc5fc550f96c25b89ULL, 0xd7ef8dec903e4276ULL,
+	0x67729ede7e50f06fULL, 0xeac28c7af045cf3dULL, 0xb15c1f945460a04aULL,
+	0x9cfddeb05bfb1058ULL, 0x93c69abce3a1fe5eULL, 0xeb0380dc4a4bdd6eULL,
+	0xd20db1e8f8081874ULL, 0x229a8528b7c15e14ULL, 0x44291750739fbc28ULL,
+	0xd3ccbd4e42060a27ULL, 0xf62b1c33f4ed2a97ULL, 0x86a8660ae4779905ULL,
+	0xd62e814a2a305025ULL, 0x477703a7a08d8addULL, 0x7b9b0e977af815c5ULL,
+	0x78c51a60a9ea2330ULL, 0xa6adfb733aaae3b7ULL, 0x97e5aa1e3199b60fULL,
+	0x0000000000000000ULL, 0xf4b404629df10e31ULL, 0x5564db44a6719322ULL,
+	0x9207961a59afec0dULL, 0x9624a6b88b97a45cULL, 0x363575380a192b1cULL,
+	0x2c60cd82b595a241ULL, 0x7d272664c1dc7932ULL, 0x7142769faa94a1c1ULL,
+	0xa1d0df263b809d13ULL, 0x1630e841d4c451aeULL, 0xc1df65ad44fa13d8ULL,
+	0x13d2d445bcf20bacULL, 0xd915c546926abe23ULL, 0x38cf3d92084dd749ULL,
+	0xe766d0272103059dULL, 0xc7634d5effde7f2fULL, 0x077d2455012a7ea4ULL,
+	0xedbfa82ff16fb199ULL, 0xaf2a978c39d46146ULL, 0x42953fa3c8bbd0dfULL,
+	0xcb061da59496a7dcULL, 0x25e7a17db6eb20b0ULL, 0x34aa6d6963050fbaULL,
+	0xa76cf7d580a4f1e4ULL, 0xf7ea10954ee338c4ULL, 0xfcf2643b24819e93ULL,
+	0xcf252d0746aeef8dULL, 0x4ef06f58a3f3082cULL, 0x563acfb37563a5d7ULL,
+	0x5086e740ce47c920ULL, 0x2982f186dda3f843ULL, 0x87696aac5e798b56ULL,
+	0x5d22bb1d1f010380ULL, 0x035e14f7d31236f5ULL, 0x3cec0d30da759f18ULL,
+	0xf3c920379cdb7095ULL, 0xb8db736b571e22bbULL, 0xdd36f5e44052f672ULL,
+	0xaac8ab8851e23b44ULL, 0xa857b3d938fe1fe2ULL, 0x17f1e4e76eca43fdULL,
+	0xec7ea4894b61a3caULL, 0x9e62c6e132e734feULL, 0xd4b1991b432c7483ULL,
+	0x6ad6c283af163acfULL, 0x1ce9904904a8e5aaULL, 0x5fbda34c761d2726ULL,
+	0xf910583f4cb7c491ULL, 0xc6a241f845d06d7cULL, 0x4f3163fe19fd1a7fULL,
+	0xe99c988d2357f9c8ULL, 0x8eee06535d0709a7ULL, 0x0efa48aa0254fc55ULL,
+	0xb4be23903c56fa48ULL, 0x763f52caabbedf65ULL, 0xeee1bcd8227d876cULL,
+	0xe345e085f33b4dccULL, 0x3e731561b369bbbeULL, 0x2843fd2067adea10ULL,
+	0x2adce5710eb1ceb6ULL, 0xb7e03767ef44ccbdULL, 0x8db012a48e153f52ULL,
+	0x61ceb62dc5749c98ULL, 0xe85d942b9959eb9bULL, 0x4c6f7709caef2c8aULL,
+	0x84377e5b8d6bbda3ULL, 0x30895dcbb13d47ebULL, 0x74a04a9bc2a2fbc3ULL,
+	0x6b17ce251518289cULL, 0xe438c4d0f2113368ULL, 0x1fb784bed7bad35fULL,
+	0x9b80fae55ad16efcULL, 0x77fe5e6c11b0cd36ULL, 0xc858095247849129ULL,
+	0x08466059b97090a2ULL, 0x01c10ca6ba0e1253ULL, 0x6988d6747c040c3aULL,
+	0x6849dad2c60a1e69ULL, 0x5147ebe67449db73ULL, 0xc99905f4fd8a837aULL,
+	0x991fe2b433cd4a5aULL, 0xf09734c04fc94660ULL, 0xa28ecbd1e892abe6ULL,
+	0xf1563866f5c75433ULL, 0x4dae7baf70e13ed9ULL, 0x7ce62ac27bd26b61ULL,
+	0x70837a39109ab392ULL, 0x90988e4b30b3c8abULL, 0xb2020b63877296bfULL,
+	0x156efcb607d6675bULL
+	}, {
+	0xe63f55ce97c331d0ULL, 0x25b506b0015bba16ULL, 0xc8706e29e6ad9ba8ULL,
+	0x5b43d3775d521f6aULL, 0x0bfa3d577035106eULL, 0xab95fc172afb0e66ULL,
+	0xf64b63979e7a3276ULL, 0xf58b4562649dad4bULL, 0x48f7c3dbae0c83f1ULL,
+	0xff31916642f5c8c5ULL, 0xcbb048dc1c4a0495ULL, 0x66b8f83cdf622989ULL,
+	0x35c130e908e2b9b0ULL, 0x7c761a61f0b34fa1ULL, 0x3601161cf205268dULL,
+	0x9e54ccfe2219b7d6ULL, 0x8b7d90a538940837ULL, 0x9cd403588ea35d0bULL,
+	0xbc3c6fea9ccc5b5aULL, 0xe5ff733b6d24aeedULL, 0xceed22de0f7eb8d2ULL,
+	0xec8581cab1ab545eULL, 0xb96105e88ff8e71dULL, 0x8ca03501871a5eadULL,
+	0x76ccce65d6db2a2fULL, 0x5883f582a7b58057ULL, 0x3f7be4ed2e8adc3eULL,
+	0x0fe7be06355cd9c9ULL, 0xee054e6c1d11be83ULL, 0x1074365909b903a6ULL,
+	0x5dde9f80b4813c10ULL, 0x4a770c7d02b6692cULL, 0x5379c8d5d7809039ULL,
+	0xb4067448161ed409ULL, 0x5f5e5026183bd6cdULL, 0xe898029bf4c29df9ULL,
+	0x7fb63c940a54d09cULL, 0xc5171f897f4ba8bcULL, 0xa6f28db7b31d3d72ULL,
+	0x2e4f3be7716eaa78ULL, 0x0d6771a099e63314ULL, 0x82076254e41bf284ULL,
+	0x2f0fd2b42733df98ULL, 0x5c9e76d3e2dc49f0ULL, 0x7aeb569619606cdbULL,
+	0x83478b07b2468764ULL, 0xcfadcb8d5923cd32ULL, 0x85dac7f05b95a41eULL,
+	0xb5469d1b4043a1e9ULL, 0xb821ecbbd9a592fdULL, 0x1b8e0b0e798c13c8ULL,
+	0x62a57b6d9a0be02eULL, 0xfcf1b793b81257f8ULL, 0x9d94ea0bd8fe28ebULL,
+	0x4cea408aeb654a56ULL, 0x23284a47e888996cULL, 0x2d8f1d128b893545ULL,
+	0xf4cbac3132c0d8abULL, 0xbd7c86b9ca912ebaULL, 0x3a268eef3dbe6079ULL,
+	0xf0d62f6077a9110cULL, 0x2735c916ade150cbULL, 0x89fd5f03942ee2eaULL,
+	0x1acee25d2fd16628ULL, 0x90f39bab41181bffULL, 0x430dfe8cde39939fULL,
+	0xf70b8ac4c8274796ULL, 0x1c53aeaac6024552ULL, 0x13b410acf35e9c9bULL,
+	0xa532ab4249faa24fULL, 0x2b1251e5625a163fULL, 0xd7e3e676da4841c7ULL,
+	0xa7b264e4e5404892ULL, 0xda8497d643ae72d3ULL, 0x861ae105a1723b23ULL,
+	0x38a6414991048aa4ULL, 0x6578dec92585b6b4ULL, 0x0280cfa6acbaeaddULL,
+	0x88bdb650c273970aULL, 0x9333bd5ebbff84c2ULL, 0x4e6a8f2c47dfa08bULL,
+	0x321c954db76cef2aULL, 0x418d312a72837942ULL, 0xb29b38bfffcdf773ULL,
+	0x6c022c38f90a4c07ULL, 0x5a033a240b0f6a8aULL, 0x1f93885f3ce5da6fULL,
+	0xc38a537e96988bc6ULL, 0x39e6a81ac759ff44ULL, 0x29929e43cee0fce2ULL,
+	0x40cdd87924de0ca2ULL, 0xe9d8ebc8a29fe819ULL, 0x0c2798f3cfbb46f4ULL,
+	0x55e484223e53b343ULL, 0x4650948ecd0d2fd8ULL, 0x20e86cb2126f0651ULL,
+	0x6d42c56baf5739e7ULL, 0xa06fc1405ace1e08ULL, 0x7babbfc54f3d193bULL,
+	0x424d17df8864e67fULL, 0xd8045870ef14980eULL, 0xc6d7397c85ac3781ULL,
+	0x21a885e1443273b1ULL, 0x67f8116f893f5c69ULL, 0x24f5efe35706cff6ULL,
+	0xd56329d076f2ab1aULL, 0x5e1eb9754e66a32dULL, 0x28d2771098bd8902ULL,
+	0x8f6013f47dfdc190ULL, 0x17a993fdb637553cULL, 0xe0a219397e1012aaULL,
+	0x786b9930b5da8606ULL, 0x6e82e39e55b0a6daULL, 0x875a0856f72f4ec3ULL,
+	0x3741ff4fa458536dULL, 0xac4859b3957558fcULL, 0x7ef6d5c75c09a57cULL,
+	0xc04a758b6c7f14fbULL, 0xf9acdd91ab26ebbfULL, 0x7391a467c5ef9668ULL,
+	0x335c7c1ee1319acaULL, 0xa91533b18641e4bbULL, 0xe4bf9a683b79db0dULL,
+	0x8e20faa72ba0b470ULL, 0x51f907737b3a7ae4ULL, 0x2268a314bed5ec8cULL,
+	0xd944b123b949edeeULL, 0x31dcb3b84d8b7017ULL, 0xd3fe65279f218860ULL,
+	0x097af2f1dc8ffab3ULL, 0x9b09a6fc312d0b91ULL, 0xcc6ded78a3c4520fULL,
+	0x3481d9ba5ebfcc50ULL, 0x4f2a667f1182d56bULL, 0xdfd9fdd4509ace94ULL,
+	0x26752045fbbc252bULL, 0xbffc491f662bc467ULL, 0xdd593272fc202449ULL,
+	0x3cbbc218d46d4303ULL, 0x91b372f817456e1fULL, 0x681faf69bc6385a0ULL,
+	0xb686bbeebaa43ed4ULL, 0x1469b5084cd0ca01ULL, 0x98c98009cbca94acULL,
+	0x6438379a73d8c354ULL, 0xc2caba2dc0c5fe26ULL, 0x3e3b0dbe78d7a9deULL,
+	0x50b9ee202d670f04ULL, 0x4590b27b37eab0e5ULL, 0x6025b4cb36b10af3ULL,
+	0xfb2c1237079c0162ULL, 0xa12f28130c936be8ULL, 0x4b37e52e54eb1cccULL,
+	0x083a1ba28ad28f53ULL, 0xc10a9cd83a22611bULL, 0x9f1425ad7444c236ULL,
+	0x069d4cf7e9d3237aULL, 0xedc56899e7f621beULL, 0x778c273680865fcfULL,
+	0x309c5aeb1bd605f7ULL, 0x8de0dc52d1472b4dULL, 0xf8ec34c2fd7b9e5fULL,
+	0xea18cd3d58787724ULL, 0xaad515447ca67b86ULL, 0x9989695a9d97e14cULL,
+	0x0000000000000000ULL, 0xf196c63321f464ecULL, 0x71116bc169557cb5ULL,
+	0xaf887f466f92c7c1ULL, 0x972e3e0ffe964d65ULL, 0x190ec4a8d536f915ULL,
+	0x95aef1a9522ca7b8ULL, 0xdc19db21aa7d51a9ULL, 0x94ee18fa0471d258ULL,
+	0x8087adf248a11859ULL, 0xc457f6da2916dd5cULL, 0xfa6cfb6451c17482ULL,
+	0xf256e0c6db13fbd1ULL, 0x6a9f60cf10d96f7dULL, 0x4daaa9d9bd383fb6ULL,
+	0x03c026f5fae79f3dULL, 0xde99148706c7bb74ULL, 0x2a52b8b6340763dfULL,
+	0x6fc20acd03edd33aULL, 0xd423c08320afdefaULL, 0xbbe1ca4e23420dc0ULL,
+	0x966ed75ca8cb3885ULL, 0xeb58246e0e2502c4ULL, 0x055d6a021334bc47ULL,
+	0xa47242111fa7d7afULL, 0xe3623fcc84f78d97ULL, 0x81c744a11efc6db9ULL,
+	0xaec8961539cfb221ULL, 0xf31609958d4e8e31ULL, 0x63e5923ecc5695ceULL,
+	0x47107ddd9b505a38ULL, 0xa3afe7b5a0298135ULL, 0x792b7063e387f3e6ULL,
+	0x0140e953565d75e0ULL, 0x12f4f9ffa503e97bULL, 0x750ce8902c3cb512ULL,
+	0xdbc47e8515f30733ULL, 0x1ed3610c6ab8af8fULL, 0x5239218681dde5d9ULL,
+	0xe222d69fd2aaf877ULL, 0xfe71783514a8bd25ULL, 0xcaf0a18f4a177175ULL,
+	0x61655d9860ec7f13ULL, 0xe77fbc9dc19e4430ULL, 0x2ccff441ddd440a5ULL,
+	0x16e97aaee06a20dcULL, 0xa855dae2d01c915bULL, 0x1d1347f9905f30b2ULL,
+	0xb7c652bdecf94b34ULL, 0xd03e43d265c6175dULL, 0xfdb15ec0ee4f2218ULL,
+	0x57644b8492e9599eULL, 0x07dda5a4bf8e569aULL, 0x54a46d71680ec6a3ULL,
+	0x5624a2d7c4b42c7eULL, 0xbebca04c3076b187ULL, 0x7d36f332a6ee3a41ULL,
+	0x3b6667bc6be31599ULL, 0x695f463aea3ef040ULL, 0xad08b0e0c3282d1cULL,
+	0xb15b1e4a052a684eULL, 0x44d05b2861b7c505ULL, 0x15295c5b1a8dbfe1ULL,
+	0x744c01c37a61c0f2ULL, 0x59c31cd1f1e8f5b7ULL, 0xef45a73f4b4ccb63ULL,
+	0x6bdf899c46841a9dULL, 0x3dfb2b4b823036e3ULL, 0xa2ef0ee6f674f4d5ULL,
+	0x184e2dfb836b8cf5ULL, 0x1134df0a5fe47646ULL, 0xbaa1231d751f7820ULL,
+	0xd17eaa81339b62bdULL, 0xb01bf71953771daeULL, 0x849a2ea30dc8d1feULL,
+	0x705182923f080955ULL, 0x0ea757556301ac29ULL, 0x041d83514569c9a7ULL,
+	0x0abad4042668658eULL, 0x49b72a88f851f611ULL, 0x8a3d79f66ec97dd7ULL,
+	0xcd2d042bf59927efULL, 0xc930877ab0f0ee48ULL, 0x9273540deda2f122ULL,
+	0xc797d02fd3f14261ULL, 0xe1e2f06a284d674aULL, 0xd2be8c74c97cfd80ULL,
+	0x9a494faf67707e71ULL, 0xb3dbd1eca9908293ULL, 0x72d14d3493b2e388ULL,
+	0xd6a30f258c153427ULL
+	}
+}; /* Ax */
+
+static void streebog_xor(const struct streebog_uint512 *x,
+			 const struct streebog_uint512 *y,
+			 struct streebog_uint512 *z)
+{
+	z->qword[0] = x->qword[0] ^ y->qword[0];
+	z->qword[1] = x->qword[1] ^ y->qword[1];
+	z->qword[2] = x->qword[2] ^ y->qword[2];
+	z->qword[3] = x->qword[3] ^ y->qword[3];
+	z->qword[4] = x->qword[4] ^ y->qword[4];
+	z->qword[5] = x->qword[5] ^ y->qword[5];
+	z->qword[6] = x->qword[6] ^ y->qword[6];
+	z->qword[7] = x->qword[7] ^ y->qword[7];
+}
+
+static void streebog_xlps(const struct streebog_uint512 *x,
+			  const struct streebog_uint512 *y,
+			  struct streebog_uint512 *data)
+{
+	u64 r0, r1, r2, r3, r4, r5, r6, r7;
+	int i;
+
+	r0 = le64_to_cpu(x->qword[0] ^ y->qword[0]);
+	r1 = le64_to_cpu(x->qword[1] ^ y->qword[1]);
+	r2 = le64_to_cpu(x->qword[2] ^ y->qword[2]);
+	r3 = le64_to_cpu(x->qword[3] ^ y->qword[3]);
+	r4 = le64_to_cpu(x->qword[4] ^ y->qword[4]);
+	r5 = le64_to_cpu(x->qword[5] ^ y->qword[5]);
+	r6 = le64_to_cpu(x->qword[6] ^ y->qword[6]);
+	r7 = le64_to_cpu(x->qword[7] ^ y->qword[7]);
+
+	for (i = 0; i <= 7; i++) {
+		data->qword[i]  = cpu_to_le64(Ax[0][r0 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[1][r1 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[2][r2 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[3][r3 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[4][r4 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[5][r5 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[6][r6 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[7][r7 & 0xFF]);
+		r0 >>= 8;
+		r1 >>= 8;
+		r2 >>= 8;
+		r3 >>= 8;
+		r4 >>= 8;
+		r5 >>= 8;
+		r6 >>= 8;
+		r7 >>= 8;
+	}
+}
+
+static void streebog_round(int i, struct streebog_uint512 *Ki,
+			   struct streebog_uint512 *data)
+{
+	streebog_xlps(Ki, &C[i], Ki);
+	streebog_xlps(Ki, data, data);
+}
+
+static int streebog_init(struct shash_desc *desc)
+{
+	struct streebog_state *ctx = shash_desc_ctx(desc);
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+	unsigned int i;
+
+	memset(ctx, 0, sizeof(struct streebog_state));
+	for (i = 0; i < 8; i++) {
+		if (digest_size == STREEBOG256_DIGEST_SIZE)
+			ctx->h.qword[i] = 0x0101010101010101ULL;
+	}
+	return 0;
+}
+
+static void streebog_pad(struct streebog_state *ctx)
+{
+	if (ctx->fillsize >= STREEBOG_BLOCK_SIZE)
+		return;
+
+	memset(ctx->buffer + ctx->fillsize, 0,
+	       sizeof(ctx->buffer) - ctx->fillsize);
+
+	ctx->buffer[ctx->fillsize] = 1;
+}
+
+static void streebog_add512(const struct streebog_uint512 *x,
+			    const struct streebog_uint512 *y,
+			    struct streebog_uint512 *r)
+{
+	u64 carry = 0;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		const u64 left = le64_to_cpu(x->qword[i]);
+		u64 sum;
+
+		sum = left + le64_to_cpu(y->qword[i]) + carry;
+		if (sum != left)
+			carry = (sum < left);
+		r->qword[i] = cpu_to_le64(sum);
+	}
+}
+
+static void streebog_g(struct streebog_uint512 *h,
+		       const struct streebog_uint512 *N,
+		       const u8 *m)
+{
+	struct streebog_uint512 Ki, data;
+	unsigned int i;
+
+	streebog_xlps(h, N, &data);
+
+	/* Starting E() */
+	Ki = data;
+	streebog_xlps(&Ki, (const struct streebog_uint512 *)&m[0], &data);
+
+	for (i = 0; i < 11; i++)
+		streebog_round(i, &Ki, &data);
+
+	streebog_xlps(&Ki, &C[11], &Ki);
+	streebog_xor(&Ki, &data, &data);
+	/* E() done */
+
+	streebog_xor(&data, h, &data);
+	streebog_xor(&data, (const struct streebog_uint512 *)&m[0], h);
+}
+
+static void streebog_stage2(struct streebog_state *ctx, const u8 *data)
+{
+	streebog_g(&ctx->h, &ctx->N, data);
+
+	streebog_add512(&ctx->N, &buffer512, &ctx->N);
+	streebog_add512(&ctx->Sigma, (const struct streebog_uint512 *)data,
+			&ctx->Sigma);
+}
+
+static void streebog_stage3(struct streebog_state *ctx)
+{
+	struct streebog_uint512 buf = { { 0 } };
+
+	buf.qword[0] = cpu_to_le64(ctx->fillsize << 3);
+	streebog_pad(ctx);
+
+	streebog_g(&ctx->h, &ctx->N, (const u8 *)&ctx->buffer);
+	streebog_add512(&ctx->N, &buf, &ctx->N);
+	streebog_add512(&ctx->Sigma,
+			(const struct streebog_uint512 *)&ctx->buffer[0],
+			&ctx->Sigma);
+	streebog_g(&ctx->h, &buffer0, (const u8 *)&ctx->N);
+	streebog_g(&ctx->h, &buffer0, (const u8 *)&ctx->Sigma);
+	memcpy(&ctx->hash, &ctx->h, sizeof(struct streebog_uint512));
+}
+
+static int streebog_update(struct shash_desc *desc, const u8 *data,
+			   unsigned int len)
+{
+	struct streebog_state *ctx = shash_desc_ctx(desc);
+	size_t chunksize;
+
+	if (ctx->fillsize) {
+		chunksize = STREEBOG_BLOCK_SIZE - ctx->fillsize;
+		if (chunksize > len)
+			chunksize = len;
+		memcpy(&ctx->buffer[ctx->fillsize], data, chunksize);
+		ctx->fillsize += chunksize;
+		len  -= chunksize;
+		data += chunksize;
+
+		if (ctx->fillsize == STREEBOG_BLOCK_SIZE) {
+			streebog_stage2(ctx, ctx->buffer);
+			ctx->fillsize = 0;
+		}
+	}
+
+	while (len >= STREEBOG_BLOCK_SIZE) {
+		streebog_stage2(ctx, data);
+		data += STREEBOG_BLOCK_SIZE;
+		len  -= STREEBOG_BLOCK_SIZE;
+	}
+
+	if (len) {
+		memcpy(&ctx->buffer, data, len);
+		ctx->fillsize = len;
+	}
+	return 0;
+}
+
+static int streebog_final(struct shash_desc *desc, u8 *digest)
+{
+	struct streebog_state *ctx = shash_desc_ctx(desc);
+
+	streebog_stage3(ctx);
+	ctx->fillsize = 0;
+	if (crypto_shash_digestsize(desc->tfm) == STREEBOG256_DIGEST_SIZE)
+		memcpy(digest, &ctx->hash.qword[4], STREEBOG256_DIGEST_SIZE);
+	else
+		memcpy(digest, &ctx->hash.qword[0], STREEBOG512_DIGEST_SIZE);
+	return 0;
+}
+
+static struct shash_alg algs[2] = { {
+	.digestsize	=	STREEBOG256_DIGEST_SIZE,
+	.init		=	streebog_init,
+	.update		=	streebog_update,
+	.final		=	streebog_final,
+	.descsize	=	sizeof(struct streebog_state),
+	.base		=	{
+		.cra_name	 =	"streebog256",
+		.cra_driver_name =	"streebog256-generic",
+		.cra_blocksize	 =	STREEBOG_BLOCK_SIZE,
+		.cra_module	 =	THIS_MODULE,
+	},
+}, {
+	.digestsize	=	STREEBOG512_DIGEST_SIZE,
+	.init		=	streebog_init,
+	.update		=	streebog_update,
+	.final		=	streebog_final,
+	.descsize	=	sizeof(struct streebog_state),
+	.base		=	{
+		.cra_name	 =	"streebog512",
+		.cra_driver_name =	"streebog512-generic",
+		.cra_blocksize	 =	STREEBOG_BLOCK_SIZE,
+		.cra_module	 =	THIS_MODULE,
+	}
+} };
+
+static int __init streebog_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit streebog_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(streebog_mod_init);
+module_exit(streebog_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Chikunov <vt@altlinux.org>");
+MODULE_DESCRIPTION("Streebog Hash Function");
+
+MODULE_ALIAS_CRYPTO("streebog256");
+MODULE_ALIAS_CRYPTO("streebog256-generic");
+MODULE_ALIAS_CRYPTO("streebog512");
+MODULE_ALIAS_CRYPTO("streebog512-generic");
diff --git a/include/crypto/streebog.h b/include/crypto/streebog.h
new file mode 100644
index 000000000000..4af119f7e07b
--- /dev/null
+++ b/include/crypto/streebog.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause */
+/*
+ * Copyright (c) 2013 Alexey Degtyarev <alexey@renatasystems.org>
+ * Copyright (c) 2018 Vitaly Chikunov <vt@altlinux.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#ifndef _CRYPTO_STREEBOG_H_
+#define _CRYPTO_STREEBOG_H_
+
+#include <linux/types.h>
+
+#define STREEBOG256_DIGEST_SIZE	32
+#define STREEBOG512_DIGEST_SIZE	64
+#define STREEBOG_BLOCK_SIZE	64
+
+struct streebog_uint512 {
+	u64 qword[8];
+};
+
+struct streebog_state {
+	u8 buffer[STREEBOG_BLOCK_SIZE];
+	struct streebog_uint512 hash;
+	struct streebog_uint512 h;
+	struct streebog_uint512 N;
+	struct streebog_uint512 Sigma;
+	size_t fillsize;
+};
+
+#endif /* !_CRYPTO_STREEBOG_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From dfdda82e3b84c13601be09f8351ec4f15a4fbe03 Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Wed, 7 Nov 2018 00:00:02 +0300
Subject: crypto: streebog - register Streebog in hash info for IMA

Register Streebog hash function in Hash Info arrays to let IMA use
it for its purposes.

Cc: linux-integrity@vger.kernel.org
Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/hash_info.c             | 4 ++++
 include/crypto/hash_info.h     | 1 +
 include/uapi/linux/hash_info.h | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/crypto/hash_info.c b/crypto/hash_info.c
index 7b1e0b188ce6..1dd095e4b451 100644
--- a/crypto/hash_info.c
+++ b/crypto/hash_info.c
@@ -32,6 +32,8 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = {
 	[HASH_ALGO_TGR_160]	= "tgr160",
 	[HASH_ALGO_TGR_192]	= "tgr192",
 	[HASH_ALGO_SM3_256]	= "sm3-256",
+	[HASH_ALGO_STREEBOG_256] = "streebog256",
+	[HASH_ALGO_STREEBOG_512] = "streebog512",
 };
 EXPORT_SYMBOL_GPL(hash_algo_name);
 
@@ -54,5 +56,7 @@ const int hash_digest_size[HASH_ALGO__LAST] = {
 	[HASH_ALGO_TGR_160]	= TGR160_DIGEST_SIZE,
 	[HASH_ALGO_TGR_192]	= TGR192_DIGEST_SIZE,
 	[HASH_ALGO_SM3_256]	= SM3256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE,
 };
 EXPORT_SYMBOL_GPL(hash_digest_size);
diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h
index 56f217d41f12..91786b68dbdb 100644
--- a/include/crypto/hash_info.h
+++ b/include/crypto/hash_info.h
@@ -15,6 +15,7 @@
 
 #include <crypto/sha.h>
 #include <crypto/md5.h>
+#include <crypto/streebog.h>
 
 #include <uapi/linux/hash_info.h>
 
diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h
index eea5d02c58de..74a8609fcb4d 100644
--- a/include/uapi/linux/hash_info.h
+++ b/include/uapi/linux/hash_info.h
@@ -33,6 +33,8 @@ enum hash_algo {
 	HASH_ALGO_TGR_160,
 	HASH_ALGO_TGR_192,
 	HASH_ALGO_SM3_256,
+	HASH_ALGO_STREEBOG_256,
+	HASH_ALGO_STREEBOG_512,
 	HASH_ALGO__LAST
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 25a0b9d4e512ea04d80c84bd5e3b9e2722b92ec1 Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Wed, 7 Nov 2018 00:00:03 +0300
Subject: crypto: streebog - add Streebog test vectors

Add testmgr and tcrypt tests and vectors for Streebog hash function
from RFC 6986 and GOST R 34.11-2012, for HMAC-Streebog vectors are
from RFC 7836 and R 50.1.113-2016.

Cc: linux-integrity@vger.kernel.org
Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c  |  40 ++++++++++++++++++-
 crypto/testmgr.c |  24 ++++++++++++
 crypto/testmgr.h | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 179 insertions(+), 1 deletion(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 1026173d721a..5fb120474902 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -76,7 +76,9 @@ static char *check[] = {
 	"cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea",
 	"khazad", "wp512", "wp384", "wp256", "tnepres", "xeta",  "fcrypt",
 	"camellia", "seed", "salsa20", "rmd128", "rmd160", "rmd256", "rmd320",
-	"lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512", NULL
+	"lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512",
+	"streebog256", "streebog512",
+	NULL
 };
 
 static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 };
@@ -1914,6 +1916,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("sm3");
 		break;
 
+	case 53:
+		ret += tcrypt_test("streebog256");
+		break;
+
+	case 54:
+		ret += tcrypt_test("streebog512");
+		break;
+
 	case 100:
 		ret += tcrypt_test("hmac(md5)");
 		break;
@@ -1970,6 +1980,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("hmac(sha3-512)");
 		break;
 
+	case 115:
+		ret += tcrypt_test("hmac(streebog256)");
+		break;
+
+	case 116:
+		ret += tcrypt_test("hmac(streebog512)");
+		break;
+
 	case 150:
 		ret += tcrypt_test("ansi_cprng");
 		break;
@@ -2412,6 +2430,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		test_hash_speed("sm3", sec, generic_hash_speed_template);
 		if (mode > 300 && mode < 400) break;
 		/* fall through */
+	case 327:
+		test_hash_speed("streebog256", sec,
+				generic_hash_speed_template);
+		if (mode > 300 && mode < 400) break;
+		/* fall through */
+	case 328:
+		test_hash_speed("streebog512", sec,
+				generic_hash_speed_template);
+		if (mode > 300 && mode < 400) break;
+		/* fall through */
 	case 399:
 		break;
 
@@ -2525,6 +2553,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 				    num_mb);
 		if (mode > 400 && mode < 500) break;
 		/* fall through */
+	case 426:
+		test_mb_ahash_speed("streebog256", sec,
+				    generic_hash_speed_template, num_mb);
+		if (mode > 400 && mode < 500) break;
+		/* fall through */
+	case 427:
+		test_mb_ahash_speed("streebog512", sec,
+				    generic_hash_speed_template, num_mb);
+		if (mode > 400 && mode < 500) break;
+		/* fall through */
 	case 499:
 		break;
 
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 512ebf697f7e..379794a259a7 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3192,6 +3192,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(hmac_sha512_tv_template)
 		}
+	}, {
+		.alg = "hmac(streebog256)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(hmac_streebog256_tv_template)
+		}
+	}, {
+		.alg = "hmac(streebog512)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(hmac_streebog512_tv_template)
+		}
 	}, {
 		.alg = "jitterentropy_rng",
 		.fips_allowed = 1,
@@ -3504,6 +3516,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(sm3_tv_template)
 		}
+	}, {
+		.alg = "streebog256",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(streebog256_tv_template)
+		}
+	}, {
+		.alg = "streebog512",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(streebog512_tv_template)
+		}
 	}, {
 		.alg = "tgr128",
 		.test = alg_test_hash,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index b5b0d29761ce..fc1b6c0e9ed2 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -2307,6 +2307,122 @@ static const struct hash_testvec crct10dif_tv_template[] = {
 	}
 };
 
+/*
+ * Streebog test vectors from RFC 6986 and GOST R 34.11-2012
+ */
+static const struct hash_testvec streebog256_tv_template[] = {
+	{ /* M1 */
+		.plaintext = "012345678901234567890123456789012345678901234567890123456789012",
+		.psize = 63,
+		.digest =
+			"\x9d\x15\x1e\xef\xd8\x59\x0b\x89"
+			"\xda\xa6\xba\x6c\xb7\x4a\xf9\x27"
+			"\x5d\xd0\x51\x02\x6b\xb1\x49\xa4"
+			"\x52\xfd\x84\xe5\xe5\x7b\x55\x00",
+	},
+	{ /* M2 */
+		.plaintext =
+			"\xd1\xe5\x20\xe2\xe5\xf2\xf0\xe8"
+			"\x2c\x20\xd1\xf2\xf0\xe8\xe1\xee"
+			"\xe6\xe8\x20\xe2\xed\xf3\xf6\xe8"
+			"\x2c\x20\xe2\xe5\xfe\xf2\xfa\x20"
+			"\xf1\x20\xec\xee\xf0\xff\x20\xf1"
+			"\xf2\xf0\xe5\xeb\xe0\xec\xe8\x20"
+			"\xed\xe0\x20\xf5\xf0\xe0\xe1\xf0"
+			"\xfb\xff\x20\xef\xeb\xfa\xea\xfb"
+			"\x20\xc8\xe3\xee\xf0\xe5\xe2\xfb",
+		.psize = 72,
+		.digest =
+			"\x9d\xd2\xfe\x4e\x90\x40\x9e\x5d"
+			"\xa8\x7f\x53\x97\x6d\x74\x05\xb0"
+			"\xc0\xca\xc6\x28\xfc\x66\x9a\x74"
+			"\x1d\x50\x06\x3c\x55\x7e\x8f\x50",
+	},
+};
+
+static const struct hash_testvec streebog512_tv_template[] = {
+	{ /* M1 */
+		.plaintext = "012345678901234567890123456789012345678901234567890123456789012",
+		.psize = 63,
+		.digest =
+			"\x1b\x54\xd0\x1a\x4a\xf5\xb9\xd5"
+			"\xcc\x3d\x86\xd6\x8d\x28\x54\x62"
+			"\xb1\x9a\xbc\x24\x75\x22\x2f\x35"
+			"\xc0\x85\x12\x2b\xe4\xba\x1f\xfa"
+			"\x00\xad\x30\xf8\x76\x7b\x3a\x82"
+			"\x38\x4c\x65\x74\xf0\x24\xc3\x11"
+			"\xe2\xa4\x81\x33\x2b\x08\xef\x7f"
+			"\x41\x79\x78\x91\xc1\x64\x6f\x48",
+	},
+	{ /* M2 */
+		.plaintext =
+			"\xd1\xe5\x20\xe2\xe5\xf2\xf0\xe8"
+			"\x2c\x20\xd1\xf2\xf0\xe8\xe1\xee"
+			"\xe6\xe8\x20\xe2\xed\xf3\xf6\xe8"
+			"\x2c\x20\xe2\xe5\xfe\xf2\xfa\x20"
+			"\xf1\x20\xec\xee\xf0\xff\x20\xf1"
+			"\xf2\xf0\xe5\xeb\xe0\xec\xe8\x20"
+			"\xed\xe0\x20\xf5\xf0\xe0\xe1\xf0"
+			"\xfb\xff\x20\xef\xeb\xfa\xea\xfb"
+			"\x20\xc8\xe3\xee\xf0\xe5\xe2\xfb",
+		.psize = 72,
+		.digest =
+			"\x1e\x88\xe6\x22\x26\xbf\xca\x6f"
+			"\x99\x94\xf1\xf2\xd5\x15\x69\xe0"
+			"\xda\xf8\x47\x5a\x3b\x0f\xe6\x1a"
+			"\x53\x00\xee\xe4\x6d\x96\x13\x76"
+			"\x03\x5f\xe8\x35\x49\xad\xa2\xb8"
+			"\x62\x0f\xcd\x7c\x49\x6c\xe5\xb3"
+			"\x3f\x0c\xb9\xdd\xdc\x2b\x64\x60"
+			"\x14\x3b\x03\xda\xba\xc9\xfb\x28",
+	},
+};
+
+/*
+ * Two HMAC-Streebog test vectors from RFC 7836 and R 50.1.113-2016 A
+ */
+static const struct hash_testvec hmac_streebog256_tv_template[] = {
+	{
+		.key =  "\x00\x01\x02\x03\x04\x05\x06\x07"
+			"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+			"\x10\x11\x12\x13\x14\x15\x16\x17"
+			"\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
+		.ksize  = 32,
+		.plaintext =
+			"\x01\x26\xbd\xb8\x78\x00\xaf\x21"
+			"\x43\x41\x45\x65\x63\x78\x01\x00",
+		.psize  = 16,
+		.digest =
+			"\xa1\xaa\x5f\x7d\xe4\x02\xd7\xb3"
+			"\xd3\x23\xf2\x99\x1c\x8d\x45\x34"
+			"\x01\x31\x37\x01\x0a\x83\x75\x4f"
+			"\xd0\xaf\x6d\x7c\xd4\x92\x2e\xd9",
+	},
+};
+
+static const struct hash_testvec hmac_streebog512_tv_template[] = {
+	{
+		.key =  "\x00\x01\x02\x03\x04\x05\x06\x07"
+			"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+			"\x10\x11\x12\x13\x14\x15\x16\x17"
+			"\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
+		.ksize  = 32,
+		.plaintext =
+			"\x01\x26\xbd\xb8\x78\x00\xaf\x21"
+			"\x43\x41\x45\x65\x63\x78\x01\x00",
+		.psize  = 16,
+		.digest =
+			"\xa5\x9b\xab\x22\xec\xae\x19\xc6"
+			"\x5f\xbd\xe6\xe5\xf4\xe9\xf5\xd8"
+			"\x54\x9d\x31\xf0\x37\xf9\xdf\x9b"
+			"\x90\x55\x00\xe1\x71\x92\x3a\x77"
+			"\x3d\x5f\x15\x30\xf2\xed\x7e\x96"
+			"\x4c\xb2\xee\xdc\x29\xe9\xad\x2f"
+			"\x3a\xfe\x93\xb2\x81\x4f\x79\xf5"
+			"\x00\x0f\xfc\x03\x66\xc2\x51\xe6",
+	},
+};
+
 /* Example vectors below taken from
  * http://www.oscca.gov.cn/UpFile/20101222141857786.pdf
  *
-- 
cgit v1.2.3-59-g8ed1b


From 70db8b79e561a2965927a08ccdb06c796834a1b7 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Wed, 7 Nov 2018 15:33:31 +0000
Subject: dt-bindings: crypto: Mention clocks for mxs-dcp

Explicit clock enabling is required on 6sll and 6ull so mention that
standard clock bindings are used.

Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/devicetree/bindings/crypto/fsl-dcp.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt
index 76a0b4e80e83..4e4d387e38a5 100644
--- a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt
+++ b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt
@@ -6,6 +6,8 @@ Required properties:
 - interrupts : Should contain MXS DCP interrupt numbers, VMI IRQ and DCP IRQ
                must be supplied, optionally Secure IRQ can be present, but
 	       is currently not implemented and not used.
+- clocks : Clock reference (only required on some SOCs: 6ull and 6sll).
+- clock-names : Must be "dcp".
 
 Example:
 
-- 
cgit v1.2.3-59-g8ed1b


From 57f002891e08c589abf09409f2d606dfd1164e32 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Wed, 7 Nov 2018 15:33:32 +0000
Subject: crypto: mxs-dcp - Add support for dcp clk

On 6ull and 6sll the DCP block has a clock which needs to be explicitly
enabled.

Add minimal handling for this at probe/remove time.

Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/mxs-dcp.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c
index 4e6ff32f8a7e..a2105cf33abb 100644
--- a/drivers/crypto/mxs-dcp.c
+++ b/drivers/crypto/mxs-dcp.c
@@ -20,6 +20,7 @@
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/stmp_device.h>
+#include <linux/clk.h>
 
 #include <crypto/aes.h>
 #include <crypto/sha.h>
@@ -82,6 +83,7 @@ struct dcp {
 	spinlock_t			lock[DCP_MAX_CHANS];
 	struct task_struct		*thread[DCP_MAX_CHANS];
 	struct crypto_queue		queue[DCP_MAX_CHANS];
+	struct clk			*dcp_clk;
 };
 
 enum dcp_chan {
@@ -1053,11 +1055,24 @@ static int mxs_dcp_probe(struct platform_device *pdev)
 	/* Re-align the structure so it fits the DCP constraints. */
 	sdcp->coh = PTR_ALIGN(sdcp->coh, DCP_ALIGNMENT);
 
-	/* Restart the DCP block. */
-	ret = stmp_reset_block(sdcp->base);
+	/* DCP clock is optional, only used on some SOCs */
+	sdcp->dcp_clk = devm_clk_get(dev, "dcp");
+	if (IS_ERR(sdcp->dcp_clk)) {
+		if (sdcp->dcp_clk != ERR_PTR(-ENOENT))
+			return PTR_ERR(sdcp->dcp_clk);
+		sdcp->dcp_clk = NULL;
+	}
+	ret = clk_prepare_enable(sdcp->dcp_clk);
 	if (ret)
 		return ret;
 
+	/* Restart the DCP block. */
+	ret = stmp_reset_block(sdcp->base);
+	if (ret) {
+		dev_err(dev, "Failed reset\n");
+		goto err_disable_unprepare_clk;
+	}
+
 	/* Initialize control register. */
 	writel(MXS_DCP_CTRL_GATHER_RESIDUAL_WRITES |
 	       MXS_DCP_CTRL_ENABLE_CONTEXT_CACHING | 0xf,
@@ -1094,7 +1109,8 @@ static int mxs_dcp_probe(struct platform_device *pdev)
 						      NULL, "mxs_dcp_chan/sha");
 	if (IS_ERR(sdcp->thread[DCP_CHAN_HASH_SHA])) {
 		dev_err(dev, "Error starting SHA thread!\n");
-		return PTR_ERR(sdcp->thread[DCP_CHAN_HASH_SHA]);
+		ret = PTR_ERR(sdcp->thread[DCP_CHAN_HASH_SHA]);
+		goto err_disable_unprepare_clk;
 	}
 
 	sdcp->thread[DCP_CHAN_CRYPTO] = kthread_run(dcp_chan_thread_aes,
@@ -1151,6 +1167,10 @@ err_destroy_aes_thread:
 
 err_destroy_sha_thread:
 	kthread_stop(sdcp->thread[DCP_CHAN_HASH_SHA]);
+
+err_disable_unprepare_clk:
+	clk_disable_unprepare(sdcp->dcp_clk);
+
 	return ret;
 }
 
@@ -1170,6 +1190,8 @@ static int mxs_dcp_remove(struct platform_device *pdev)
 	kthread_stop(sdcp->thread[DCP_CHAN_HASH_SHA]);
 	kthread_stop(sdcp->thread[DCP_CHAN_CRYPTO]);
 
+	clk_disable_unprepare(sdcp->dcp_clk);
+
 	platform_set_drvdata(pdev, NULL);
 
 	global_sdcp = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From d239b10d4ceb986d998779a4ed81824368aca831 Mon Sep 17 00:00:00 2001
From: Horia Geantă <horia.geanta@nxp.com>
Date: Thu, 8 Nov 2018 15:36:27 +0200
Subject: crypto: caam - add register map changes cf. Era 10
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Era 10 changes the register map.

The updates that affect the drivers:
-new version registers are added
-DBG_DBG[deco_state] field is moved to a new register -
DBG_EXEC[19:16] @ 8_0E3Ch.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/caam/caamalg.c    | 47 +++++++++++++++++--------
 drivers/crypto/caam/caamalg_qi.c | 37 +++++++++++++++-----
 drivers/crypto/caam/caamhash.c   | 20 ++++++++---
 drivers/crypto/caam/caampkc.c    | 10 ++++--
 drivers/crypto/caam/caamrng.c    | 10 +++++-
 drivers/crypto/caam/ctrl.c       | 28 +++++++++++----
 drivers/crypto/caam/desc.h       |  7 ++++
 drivers/crypto/caam/regs.h       | 74 ++++++++++++++++++++++++++++++++++------
 8 files changed, 184 insertions(+), 49 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 869f092432de..9f1414030bc2 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -3135,7 +3135,7 @@ static int __init caam_algapi_init(void)
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
 	int i = 0, err = 0;
-	u32 cha_vid, cha_inst, des_inst, aes_inst, md_inst;
+	u32 aes_vid, aes_inst, des_inst, md_vid, md_inst;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
 	bool registered = false;
 
@@ -3168,14 +3168,34 @@ static int __init caam_algapi_init(void)
 	 * Register crypto algorithms the device supports.
 	 * First, detect presence and attributes of DES, AES, and MD blocks.
 	 */
-	cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
-	des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> CHA_ID_LS_DES_SHIFT;
-	aes_inst = (cha_inst & CHA_ID_LS_AES_MASK) >> CHA_ID_LS_AES_SHIFT;
-	md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	if (priv->era < 10) {
+		u32 cha_vid, cha_inst;
+
+		cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
+		aes_vid = cha_vid & CHA_ID_LS_AES_MASK;
+		md_vid = (cha_vid & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+
+		cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
+		des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >>
+			   CHA_ID_LS_DES_SHIFT;
+		aes_inst = cha_inst & CHA_ID_LS_AES_MASK;
+		md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	} else {
+		u32 aesa, mdha;
+
+		aesa = rd_reg32(&priv->ctrl->vreg.aesa);
+		mdha = rd_reg32(&priv->ctrl->vreg.mdha);
+
+		aes_vid = (aesa & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+		md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+
+		des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK;
+		aes_inst = aesa & CHA_VER_NUM_MASK;
+		md_inst = mdha & CHA_VER_NUM_MASK;
+	}
 
 	/* If MD is present, limit digest size based on LP256 */
-	if (md_inst && ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256))
+	if (md_inst && md_vid  == CHA_VER_VID_MD_LP256)
 		md_limit = SHA256_DIGEST_SIZE;
 
 	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
@@ -3196,10 +3216,10 @@ static int __init caam_algapi_init(void)
 		 * Check support for AES modes not available
 		 * on LP devices.
 		 */
-		if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP)
-			if ((t_alg->caam.class1_alg_type & OP_ALG_AAI_MASK) ==
-			     OP_ALG_AAI_XTS)
-				continue;
+		if (aes_vid == CHA_VER_VID_AES_LP &&
+		    (t_alg->caam.class1_alg_type & OP_ALG_AAI_MASK) ==
+		    OP_ALG_AAI_XTS)
+			continue;
 
 		caam_skcipher_alg_init(t_alg);
 
@@ -3236,9 +3256,8 @@ static int __init caam_algapi_init(void)
 		 * Check support for AES algorithms not available
 		 * on LP devices.
 		 */
-		if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP)
-			if (alg_aai == OP_ALG_AAI_GCM)
-				continue;
+		if (aes_vid  == CHA_VER_VID_AES_LP && alg_aai == OP_ALG_AAI_GCM)
+			continue;
 
 		/*
 		 * Skip algorithms requiring message digests
diff --git a/drivers/crypto/caam/caamalg_qi.c b/drivers/crypto/caam/caamalg_qi.c
index 23c9fc4975f8..c0d55310aade 100644
--- a/drivers/crypto/caam/caamalg_qi.c
+++ b/drivers/crypto/caam/caamalg_qi.c
@@ -2462,7 +2462,7 @@ static int __init caam_qi_algapi_init(void)
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
 	int i = 0, err = 0;
-	u32 cha_vid, cha_inst, des_inst, aes_inst, md_inst;
+	u32 aes_vid, aes_inst, des_inst, md_vid, md_inst;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
 	bool registered = false;
 
@@ -2497,14 +2497,34 @@ static int __init caam_qi_algapi_init(void)
 	 * Register crypto algorithms the device supports.
 	 * First, detect presence and attributes of DES, AES, and MD blocks.
 	 */
-	cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
-	des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> CHA_ID_LS_DES_SHIFT;
-	aes_inst = (cha_inst & CHA_ID_LS_AES_MASK) >> CHA_ID_LS_AES_SHIFT;
-	md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	if (priv->era < 10) {
+		u32 cha_vid, cha_inst;
+
+		cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
+		aes_vid = cha_vid & CHA_ID_LS_AES_MASK;
+		md_vid = (cha_vid & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+
+		cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
+		des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >>
+			   CHA_ID_LS_DES_SHIFT;
+		aes_inst = cha_inst & CHA_ID_LS_AES_MASK;
+		md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	} else {
+		u32 aesa, mdha;
+
+		aesa = rd_reg32(&priv->ctrl->vreg.aesa);
+		mdha = rd_reg32(&priv->ctrl->vreg.mdha);
+
+		aes_vid = (aesa & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+		md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+
+		des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK;
+		aes_inst = aesa & CHA_VER_NUM_MASK;
+		md_inst = mdha & CHA_VER_NUM_MASK;
+	}
 
 	/* If MD is present, limit digest size based on LP256 */
-	if (md_inst && ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256))
+	if (md_inst && md_vid  == CHA_VER_VID_MD_LP256)
 		md_limit = SHA256_DIGEST_SIZE;
 
 	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
@@ -2556,8 +2576,7 @@ static int __init caam_qi_algapi_init(void)
 		 * Check support for AES algorithms not available
 		 * on LP devices.
 		 */
-		if (((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP) &&
-		    (alg_aai == OP_ALG_AAI_GCM))
+		if (aes_vid  == CHA_VER_VID_AES_LP && alg_aai == OP_ALG_AAI_GCM)
 			continue;
 
 		/*
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 46924affa0bd..81712aa5d0f2 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -3,6 +3,7 @@
  * caam - Freescale FSL CAAM support for ahash functions of crypto API
  *
  * Copyright 2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  *
  * Based on caamalg.c crypto API driver.
  *
@@ -1801,7 +1802,7 @@ static int __init caam_algapi_hash_init(void)
 	int i = 0, err = 0;
 	struct caam_drv_private *priv;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
-	u32 cha_inst, cha_vid;
+	u32 md_inst, md_vid;
 
 	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
 	if (!dev_node) {
@@ -1831,18 +1832,27 @@ static int __init caam_algapi_hash_init(void)
 	 * Register crypto algorithms the device supports.  First, identify
 	 * presence and attributes of MD block.
 	 */
-	cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
+	if (priv->era < 10) {
+		md_vid = (rd_reg32(&priv->ctrl->perfmon.cha_id_ls) &
+			  CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+		md_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
+			   CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	} else {
+		u32 mdha = rd_reg32(&priv->ctrl->vreg.mdha);
+
+		md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+		md_inst = mdha & CHA_VER_NUM_MASK;
+	}
 
 	/*
 	 * Skip registration of any hashing algorithms if MD block
 	 * is not present.
 	 */
-	if (!((cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT))
+	if (!md_inst)
 		return -ENODEV;
 
 	/* Limit digest size based on LP256 */
-	if ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256)
+	if (md_vid == CHA_VER_VID_MD_LP256)
 		md_limit = SHA256_DIGEST_SIZE;
 
 	INIT_LIST_HEAD(&hash_list);
diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c
index 4fc209cbbeab..77ab28a2811a 100644
--- a/drivers/crypto/caam/caampkc.c
+++ b/drivers/crypto/caam/caampkc.c
@@ -3,6 +3,7 @@
  * caam - Freescale FSL CAAM support for Public Key Cryptography
  *
  * Copyright 2016 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  *
  * There is no Shared Descriptor for PKC so that the Job Descriptor must carry
  * all the desired key parameters, input and output pointers.
@@ -1017,7 +1018,7 @@ static int __init caam_pkc_init(void)
 	struct platform_device *pdev;
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
-	u32 cha_inst, pk_inst;
+	u32 pk_inst;
 	int err;
 
 	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
@@ -1045,8 +1046,11 @@ static int __init caam_pkc_init(void)
 		return -ENODEV;
 
 	/* Determine public key hardware accelerator presence. */
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
-	pk_inst = (cha_inst & CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT;
+	if (priv->era < 10)
+		pk_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
+			   CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT;
+	else
+		pk_inst = rd_reg32(&priv->ctrl->vreg.pkha) & CHA_VER_NUM_MASK;
 
 	/* Do not register algorithms if PKHA is not present. */
 	if (!pk_inst)
diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
index 4318b0aa6fb9..a387c8d49a62 100644
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -3,6 +3,7 @@
  * caam - Freescale FSL CAAM support for hw_random
  *
  * Copyright 2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  *
  * Based on caamalg.c crypto API driver.
  *
@@ -309,6 +310,7 @@ static int __init caam_rng_init(void)
 	struct platform_device *pdev;
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
+	u32 rng_inst;
 	int err;
 
 	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
@@ -336,7 +338,13 @@ static int __init caam_rng_init(void)
 		return -ENODEV;
 
 	/* Check for an instantiated RNG before registration */
-	if (!(rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & CHA_ID_LS_RNG_MASK))
+	if (priv->era < 10)
+		rng_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
+			    CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT;
+	else
+		rng_inst = rd_reg32(&priv->ctrl->vreg.rng) & CHA_VER_NUM_MASK;
+
+	if (!rng_inst)
 		return -ENODEV;
 
 	dev = caam_jr_alloc();
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 3fc793193821..16bbc72f041a 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -3,6 +3,7 @@
  * Controller-level driver, kernel property detection, initialization
  *
  * Copyright 2008-2012 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  */
 
 #include <linux/device.h>
@@ -106,7 +107,7 @@ static inline int run_descriptor_deco0(struct device *ctrldev, u32 *desc,
 	struct caam_ctrl __iomem *ctrl = ctrlpriv->ctrl;
 	struct caam_deco __iomem *deco = ctrlpriv->deco;
 	unsigned int timeout = 100000;
-	u32 deco_dbg_reg, flags;
+	u32 deco_dbg_reg, deco_state, flags;
 	int i;
 
 
@@ -149,13 +150,22 @@ static inline int run_descriptor_deco0(struct device *ctrldev, u32 *desc,
 	timeout = 10000000;
 	do {
 		deco_dbg_reg = rd_reg32(&deco->desc_dbg);
+
+		if (ctrlpriv->era < 10)
+			deco_state = (deco_dbg_reg & DESC_DBG_DECO_STAT_MASK) >>
+				     DESC_DBG_DECO_STAT_SHIFT;
+		else
+			deco_state = (rd_reg32(&deco->dbg_exec) &
+				      DESC_DER_DECO_STAT_MASK) >>
+				     DESC_DER_DECO_STAT_SHIFT;
+
 		/*
 		 * If an error occured in the descriptor, then
 		 * the DECO status field will be set to 0x0D
 		 */
-		if ((deco_dbg_reg & DESC_DBG_DECO_STAT_MASK) ==
-		    DESC_DBG_DECO_STAT_HOST_ERR)
+		if (deco_state == DECO_STAT_HOST_ERR)
 			break;
+
 		cpu_relax();
 	} while ((deco_dbg_reg & DESC_DBG_DECO_STAT_VALID) && --timeout);
 
@@ -491,7 +501,7 @@ static int caam_probe(struct platform_device *pdev)
 	struct caam_perfmon *perfmon;
 #endif
 	u32 scfgr, comp_params;
-	u32 cha_vid_ls;
+	u8 rng_vid;
 	int pg_size;
 	int BLOCK_OFFSET = 0;
 
@@ -733,15 +743,19 @@ static int caam_probe(struct platform_device *pdev)
 		goto caam_remove;
 	}
 
-	cha_vid_ls = rd_reg32(&ctrl->perfmon.cha_id_ls);
+	if (ctrlpriv->era < 10)
+		rng_vid = (rd_reg32(&ctrl->perfmon.cha_id_ls) &
+			   CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT;
+	else
+		rng_vid = (rd_reg32(&ctrl->vreg.rng) & CHA_VER_VID_MASK) >>
+			   CHA_VER_VID_SHIFT;
 
 	/*
 	 * If SEC has RNG version >= 4 and RNG state handle has not been
 	 * already instantiated, do RNG instantiation
 	 * In case of SoCs with Management Complex, RNG is managed by MC f/w.
 	 */
-	if (!ctrlpriv->mc_en &&
-	    (cha_vid_ls & CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT >= 4) {
+	if (!ctrlpriv->mc_en && rng_vid >= 4) {
 		ctrlpriv->rng4_sh_init =
 			rd_reg32(&ctrl->r4tst[0].rdsta);
 		/*
diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h
index f76ff160a02c..ec1ef06049b4 100644
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@@ -4,6 +4,7 @@
  * Definitions to support CAAM descriptor instruction generation
  *
  * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  */
 
 #ifndef DESC_H
@@ -1133,6 +1134,12 @@
 #define OP_ALG_TYPE_CLASS1	(2 << OP_ALG_TYPE_SHIFT)
 #define OP_ALG_TYPE_CLASS2	(4 << OP_ALG_TYPE_SHIFT)
 
+/* version register fields */
+#define OP_VER_CCHA_NUM  0x000000ff /* Number CCHAs instantiated */
+#define OP_VER_CCHA_MISC 0x0000ff00 /* CCHA Miscellaneous Information */
+#define OP_VER_CCHA_REV  0x00ff0000 /* CCHA Revision Number */
+#define OP_VER_CCHA_VID  0xff000000 /* CCHA Version ID */
+
 #define OP_ALG_ALGSEL_SHIFT	16
 #define OP_ALG_ALGSEL_MASK	(0xff << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_SUBMASK	(0x0f << OP_ALG_ALGSEL_SHIFT)
diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index 457815f965c0..3cd0822ea819 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -3,6 +3,7 @@
  * CAAM hardware register-level view
  *
  * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  */
 
 #ifndef REGS_H
@@ -211,6 +212,47 @@ struct jr_outentry {
 	u32 jrstatus;	/* Status for completed descriptor */
 } __packed;
 
+/* Version registers (Era 10+)	e80-eff */
+struct version_regs {
+	u32 crca;	/* CRCA_VERSION */
+	u32 afha;	/* AFHA_VERSION */
+	u32 kfha;	/* KFHA_VERSION */
+	u32 pkha;	/* PKHA_VERSION */
+	u32 aesa;	/* AESA_VERSION */
+	u32 mdha;	/* MDHA_VERSION */
+	u32 desa;	/* DESA_VERSION */
+	u32 snw8a;	/* SNW8A_VERSION */
+	u32 snw9a;	/* SNW9A_VERSION */
+	u32 zuce;	/* ZUCE_VERSION */
+	u32 zuca;	/* ZUCA_VERSION */
+	u32 ccha;	/* CCHA_VERSION */
+	u32 ptha;	/* PTHA_VERSION */
+	u32 rng;	/* RNG_VERSION */
+	u32 trng;	/* TRNG_VERSION */
+	u32 aaha;	/* AAHA_VERSION */
+	u32 rsvd[10];
+	u32 sr;		/* SR_VERSION */
+	u32 dma;	/* DMA_VERSION */
+	u32 ai;		/* AI_VERSION */
+	u32 qi;		/* QI_VERSION */
+	u32 jr;		/* JR_VERSION */
+	u32 deco;	/* DECO_VERSION */
+};
+
+/* Version registers bitfields */
+
+/* Number of CHAs instantiated */
+#define CHA_VER_NUM_MASK	0xffull
+/* CHA Miscellaneous Information */
+#define CHA_VER_MISC_SHIFT	8
+#define CHA_VER_MISC_MASK	(0xffull << CHA_VER_MISC_SHIFT)
+/* CHA Revision Number */
+#define CHA_VER_REV_SHIFT	16
+#define CHA_VER_REV_MASK	(0xffull << CHA_VER_REV_SHIFT)
+/* CHA Version ID */
+#define CHA_VER_VID_SHIFT	24
+#define CHA_VER_VID_MASK	(0xffull << CHA_VER_VID_SHIFT)
+
 /*
  * caam_perfmon - Performance Monitor/Secure Memory Status/
  *                CAAM Global Status/Component Version IDs
@@ -223,15 +265,13 @@ struct jr_outentry {
 #define CHA_NUM_MS_DECONUM_MASK	(0xfull << CHA_NUM_MS_DECONUM_SHIFT)
 
 /*
- * CHA version IDs / instantiation bitfields
+ * CHA version IDs / instantiation bitfields (< Era 10)
  * Defined for use with the cha_id fields in perfmon, but the same shift/mask
  * selectors can be used to pull out the number of instantiated blocks within
  * cha_num fields in perfmon because the locations are the same.
  */
 #define CHA_ID_LS_AES_SHIFT	0
 #define CHA_ID_LS_AES_MASK	(0xfull << CHA_ID_LS_AES_SHIFT)
-#define CHA_ID_LS_AES_LP	(0x3ull << CHA_ID_LS_AES_SHIFT)
-#define CHA_ID_LS_AES_HP	(0x4ull << CHA_ID_LS_AES_SHIFT)
 
 #define CHA_ID_LS_DES_SHIFT	4
 #define CHA_ID_LS_DES_MASK	(0xfull << CHA_ID_LS_DES_SHIFT)
@@ -241,9 +281,6 @@ struct jr_outentry {
 
 #define CHA_ID_LS_MD_SHIFT	12
 #define CHA_ID_LS_MD_MASK	(0xfull << CHA_ID_LS_MD_SHIFT)
-#define CHA_ID_LS_MD_LP256	(0x0ull << CHA_ID_LS_MD_SHIFT)
-#define CHA_ID_LS_MD_LP512	(0x1ull << CHA_ID_LS_MD_SHIFT)
-#define CHA_ID_LS_MD_HP		(0x2ull << CHA_ID_LS_MD_SHIFT)
 
 #define CHA_ID_LS_RNG_SHIFT	16
 #define CHA_ID_LS_RNG_MASK	(0xfull << CHA_ID_LS_RNG_SHIFT)
@@ -269,6 +306,13 @@ struct jr_outentry {
 #define CHA_ID_MS_JR_SHIFT	28
 #define CHA_ID_MS_JR_MASK	(0xfull << CHA_ID_MS_JR_SHIFT)
 
+/* Specific CHA version IDs */
+#define CHA_VER_VID_AES_LP	0x3ull
+#define CHA_VER_VID_AES_HP	0x4ull
+#define CHA_VER_VID_MD_LP256	0x0ull
+#define CHA_VER_VID_MD_LP512	0x1ull
+#define CHA_VER_VID_MD_HP	0x2ull
+
 struct sec_vid {
 	u16 ip_id;
 	u8 maj_rev;
@@ -479,8 +523,10 @@ struct caam_ctrl {
 		struct rng4tst r4tst[2];
 	};
 
-	u32 rsvd9[448];
+	u32 rsvd9[416];
 
+	/* Version registers - introduced with era 10		e80-eff */
+	struct version_regs vreg;
 	/* Performance Monitor                                  f00-fff */
 	struct caam_perfmon perfmon;
 };
@@ -570,8 +616,10 @@ struct caam_job_ring {
 	u32 rsvd11;
 	u32 jrcommand;	/* JRCRx - JobR command */
 
-	u32 rsvd12[932];
+	u32 rsvd12[900];
 
+	/* Version registers - introduced with era 10           e80-eff */
+	struct version_regs vreg;
 	/* Performance Monitor                                  f00-fff */
 	struct caam_perfmon perfmon;
 };
@@ -878,13 +926,19 @@ struct caam_deco {
 	u32 rsvd29[48];
 	u32 descbuf[64];	/* DxDESB - Descriptor buffer */
 	u32 rscvd30[193];
-#define DESC_DBG_DECO_STAT_HOST_ERR	0x00D00000
 #define DESC_DBG_DECO_STAT_VALID	0x80000000
 #define DESC_DBG_DECO_STAT_MASK		0x00F00000
+#define DESC_DBG_DECO_STAT_SHIFT	20
 	u32 desc_dbg;		/* DxDDR - DECO Debug Register */
-	u32 rsvd31[126];
+	u32 rsvd31[13];
+#define DESC_DER_DECO_STAT_MASK		0x000F0000
+#define DESC_DER_DECO_STAT_SHIFT	16
+	u32 dbg_exec;		/* DxDER - DECO Debug Exec Register */
+	u32 rsvd32[112];
 };
 
+#define DECO_STAT_HOST_ERR	0xD
+
 #define DECO_JQCR_WHL		0x20000000
 #define DECO_JQCR_FOUR		0x10000000
 
-- 
cgit v1.2.3-59-g8ed1b


From c99d4a2454009d6fb51e03248fac7629c4d6a5ca Mon Sep 17 00:00:00 2001
From: Horia Geantă <horia.geanta@nxp.com>
Date: Thu, 8 Nov 2018 15:36:28 +0200
Subject: crypto: caam/qi2 - add support for ChaCha20
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for ChaCha20 skcipher algorithm.

Signed-off-by: Carmen Iorga <carmen.iorga@nxp.com>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/caam/caamalg_desc.c |  6 ++++--
 drivers/crypto/caam/caamalg_qi2.c  | 27 +++++++++++++++++++++++++--
 drivers/crypto/caam/compat.h       |  1 +
 drivers/crypto/caam/desc.h         |  6 ++++++
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c
index 1a6f0da14106..d850590079a2 100644
--- a/drivers/crypto/caam/caamalg_desc.c
+++ b/drivers/crypto/caam/caamalg_desc.c
@@ -1228,7 +1228,8 @@ static inline void skcipher_append_src_dst(u32 *desc)
  * @desc: pointer to buffer used for descriptor construction
  * @cdata: pointer to block cipher transform definitions
  *         Valid algorithm values - one of OP_ALG_ALGSEL_{AES, DES, 3DES} ANDed
- *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128.
+ *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128
+ *                                - OP_ALG_ALGSEL_CHACHA20
  * @ivsize: initialization vector size
  * @is_rfc3686: true when ctr(aes) is wrapped by rfc3686 template
  * @ctx1_iv_off: IV offset in CONTEXT1 register
@@ -1293,7 +1294,8 @@ EXPORT_SYMBOL(cnstr_shdsc_skcipher_encap);
  * @desc: pointer to buffer used for descriptor construction
  * @cdata: pointer to block cipher transform definitions
  *         Valid algorithm values - one of OP_ALG_ALGSEL_{AES, DES, 3DES} ANDed
- *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128.
+ *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128
+ *                                - OP_ALG_ALGSEL_CHACHA20
  * @ivsize: initialization vector size
  * @is_rfc3686: true when ctr(aes) is wrapped by rfc3686 template
  * @ctx1_iv_off: IV offset in CONTEXT1 register
diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c
index 7d8ac0222fa3..a9e264bb9629 100644
--- a/drivers/crypto/caam/caamalg_qi2.c
+++ b/drivers/crypto/caam/caamalg_qi2.c
@@ -816,7 +816,9 @@ static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key,
 	u32 *desc;
 	u32 ctx1_iv_off = 0;
 	const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) ==
-			       OP_ALG_AAI_CTR_MOD128);
+			       OP_ALG_AAI_CTR_MOD128) &&
+			       ((ctx->cdata.algtype & OP_ALG_ALGSEL_MASK) !=
+			       OP_ALG_ALGSEL_CHACHA20);
 	const bool is_rfc3686 = alg->caam.rfc3686;
 
 	print_hex_dump_debug("key in @" __stringify(__LINE__)": ",
@@ -1494,7 +1496,23 @@ static struct caam_skcipher_alg driver_algs[] = {
 			.ivsize = AES_BLOCK_SIZE,
 		},
 		.caam.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_XTS,
-	}
+	},
+	{
+		.skcipher = {
+			.base = {
+				.cra_name = "chacha20",
+				.cra_driver_name = "chacha20-caam-qi2",
+				.cra_blocksize = 1,
+			},
+			.setkey = skcipher_setkey,
+			.encrypt = skcipher_encrypt,
+			.decrypt = skcipher_decrypt,
+			.min_keysize = CHACHA20_KEY_SIZE,
+			.max_keysize = CHACHA20_KEY_SIZE,
+			.ivsize = CHACHA20_IV_SIZE,
+		},
+		.caam.class1_alg_type = OP_ALG_ALGSEL_CHACHA20,
+	},
 };
 
 static struct caam_aead_alg driver_aeads[] = {
@@ -4908,6 +4926,11 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev)
 		    alg_sel == OP_ALG_ALGSEL_AES)
 			continue;
 
+		/* Skip CHACHA20 algorithms if not supported by device */
+		if (alg_sel == OP_ALG_ALGSEL_CHACHA20 &&
+		    !priv->sec_attr.ccha_acc_num)
+			continue;
+
 		t_alg->caam.dev = dev;
 		caam_skcipher_alg_init(t_alg);
 
diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h
index 9604ff7a335e..a5081b4050b6 100644
--- a/drivers/crypto/caam/compat.h
+++ b/drivers/crypto/caam/compat.h
@@ -36,6 +36,7 @@
 #include <crypto/gcm.h>
 #include <crypto/sha.h>
 #include <crypto/md5.h>
+#include <crypto/chacha20.h>
 #include <crypto/internal/aead.h>
 #include <crypto/authenc.h>
 #include <crypto/akcipher.h>
diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h
index ec1ef06049b4..9d117e51629f 100644
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@@ -1159,6 +1159,7 @@
 #define OP_ALG_ALGSEL_KASUMI	(0x70 << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_CRC	(0x90 << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_SNOW_F9	(0xA0 << OP_ALG_ALGSEL_SHIFT)
+#define OP_ALG_ALGSEL_CHACHA20	(0xD0 << OP_ALG_ALGSEL_SHIFT)
 
 #define OP_ALG_AAI_SHIFT	4
 #define OP_ALG_AAI_MASK		(0x1ff << OP_ALG_AAI_SHIFT)
@@ -1206,6 +1207,11 @@
 #define OP_ALG_AAI_RNG4_AI	(0x80 << OP_ALG_AAI_SHIFT)
 #define OP_ALG_AAI_RNG4_SK	(0x100 << OP_ALG_AAI_SHIFT)
 
+/* Chacha20 AAI set */
+#define OP_ALG_AAI_AEAD	(0x002 << OP_ALG_AAI_SHIFT)
+#define OP_ALG_AAI_KEYSTREAM	(0x001 << OP_ALG_AAI_SHIFT)
+#define OP_ALG_AAI_BC8		(0x008 << OP_ALG_AAI_SHIFT)
+
 /* hmac/smac AAI set */
 #define OP_ALG_AAI_HASH		(0x00 << OP_ALG_AAI_SHIFT)
 #define OP_ALG_AAI_HMAC		(0x01 << OP_ALG_AAI_SHIFT)
-- 
cgit v1.2.3-59-g8ed1b


From 193188e5512db5e84d2d9a7a6a157de651e78f3a Mon Sep 17 00:00:00 2001
From: Cristian Stoica <cristian.stoica@nxp.com>
Date: Thu, 8 Nov 2018 15:36:29 +0200
Subject: crypto: chacha20poly1305 - export CHACHAPOLY_IV_SIZE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move CHACHAPOLY_IV_SIZE to header file, so it can be reused.

Signed-off-by: Cristian Stoica <cristian.stoica@nxp.com>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/chacha20poly1305.c | 2 --
 include/crypto/chacha20.h | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index 600afa99941f..f9dd5453046a 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -22,8 +22,6 @@
 
 #include "internal.h"
 
-#define CHACHAPOLY_IV_SIZE	12
-
 struct chachapoly_instance_ctx {
 	struct crypto_skcipher_spawn chacha;
 	struct crypto_ahash_spawn poly;
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index f76302d99e2b..2d3129442a52 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha20.h
@@ -13,6 +13,7 @@
 #define CHACHA20_IV_SIZE	16
 #define CHACHA20_KEY_SIZE	32
 #define CHACHA20_BLOCK_SIZE	64
+#define CHACHAPOLY_IV_SIZE	12
 
 struct chacha20_ctx {
 	u32 key[8];
-- 
cgit v1.2.3-59-g8ed1b


From d6bbd4eea243951d2543a0f427c9a6bf2835b6f5 Mon Sep 17 00:00:00 2001
From: Horia Geantă <horia.geanta@nxp.com>
Date: Thu, 8 Nov 2018 15:36:30 +0200
Subject: crypto: caam/jr - add support for Chacha20 + Poly1305
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for Chacha20 + Poly1305 combined AEAD:
-generic (rfc7539)
-IPsec (rfc7634 - known as rfc7539esp in the kernel)

Signed-off-by: Cristian Stoica <cristian.stoica@nxp.com>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/caam/caamalg.c      | 221 ++++++++++++++++++++++++++++++++++++-
 drivers/crypto/caam/caamalg_desc.c | 111 +++++++++++++++++++
 drivers/crypto/caam/caamalg_desc.h |   4 +
 drivers/crypto/caam/compat.h       |   1 +
 drivers/crypto/caam/desc.h         |  15 +++
 drivers/crypto/caam/desc_constr.h  |   7 +-
 6 files changed, 354 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 9f1414030bc2..cbaeb264a261 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -72,6 +72,8 @@
 #define AUTHENC_DESC_JOB_IO_LEN		(AEAD_DESC_JOB_IO_LEN + \
 					 CAAM_CMD_SZ * 5)
 
+#define CHACHAPOLY_DESC_JOB_IO_LEN	(AEAD_DESC_JOB_IO_LEN + CAAM_CMD_SZ * 6)
+
 #define DESC_MAX_USED_BYTES		(CAAM_DESC_BYTES_MAX - DESC_JOB_IO_LEN)
 #define DESC_MAX_USED_LEN		(DESC_MAX_USED_BYTES / CAAM_CMD_SZ)
 
@@ -513,6 +515,61 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc,
 	return 0;
 }
 
+static int chachapoly_set_sh_desc(struct crypto_aead *aead)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	struct device *jrdev = ctx->jrdev;
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	u32 *desc;
+
+	if (!ctx->cdata.keylen || !ctx->authsize)
+		return 0;
+
+	desc = ctx->sh_desc_enc;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, true);
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), ctx->dir);
+
+	desc = ctx->sh_desc_dec;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, false);
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), ctx->dir);
+
+	return 0;
+}
+
+static int chachapoly_setauthsize(struct crypto_aead *aead,
+				  unsigned int authsize)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (authsize != POLY1305_DIGEST_SIZE)
+		return -EINVAL;
+
+	ctx->authsize = authsize;
+	return chachapoly_set_sh_desc(aead);
+}
+
+static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
+			     unsigned int keylen)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize;
+
+	if (keylen != CHACHA20_KEY_SIZE + saltlen) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	ctx->cdata.key_virt = key;
+	ctx->cdata.keylen = keylen - saltlen;
+
+	return chachapoly_set_sh_desc(aead);
+}
+
 static int aead_setkey(struct crypto_aead *aead,
 			       const u8 *key, unsigned int keylen)
 {
@@ -1031,6 +1088,40 @@ static void init_gcm_job(struct aead_request *req,
 	/* End of blank commands */
 }
 
+static void init_chachapoly_job(struct aead_request *req,
+				struct aead_edesc *edesc, bool all_contig,
+				bool encrypt)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	unsigned int assoclen = req->assoclen;
+	u32 *desc = edesc->hw_desc;
+	u32 ctx_iv_off = 4;
+
+	init_aead_job(req, edesc, all_contig, encrypt);
+
+	if (ivsize != CHACHAPOLY_IV_SIZE) {
+		/* IPsec specific: CONTEXT1[223:128] = {NONCE, IV} */
+		ctx_iv_off += 4;
+
+		/*
+		 * The associated data comes already with the IV but we need
+		 * to skip it when we authenticate or encrypt...
+		 */
+		assoclen -= ivsize;
+	}
+
+	append_math_add_imm_u32(desc, REG3, ZERO, IMM, assoclen);
+
+	/*
+	 * For IPsec load the IV further in the same register.
+	 * For RFC7539 simply load the 12 bytes nonce in a single operation
+	 */
+	append_load_as_imm(desc, req->iv, ivsize, LDST_CLASS_1_CCB |
+			   LDST_SRCDST_BYTE_CONTEXT |
+			   ctx_iv_off << LDST_OFFSET_SHIFT);
+}
+
 static void init_authenc_job(struct aead_request *req,
 			     struct aead_edesc *edesc,
 			     bool all_contig, bool encrypt)
@@ -1289,6 +1380,72 @@ static int gcm_encrypt(struct aead_request *req)
 	return ret;
 }
 
+static int chachapoly_encrypt(struct aead_request *req)
+{
+	struct aead_edesc *edesc;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	struct device *jrdev = ctx->jrdev;
+	bool all_contig;
+	u32 *desc;
+	int ret;
+
+	edesc = aead_edesc_alloc(req, CHACHAPOLY_DESC_JOB_IO_LEN, &all_contig,
+				 true);
+	if (IS_ERR(edesc))
+		return PTR_ERR(edesc);
+
+	desc = edesc->hw_desc;
+
+	init_chachapoly_job(req, edesc, all_contig, true);
+	print_hex_dump_debug("chachapoly jobdesc@" __stringify(__LINE__)": ",
+			     DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc),
+			     1);
+
+	ret = caam_jr_enqueue(jrdev, desc, aead_encrypt_done, req);
+	if (!ret) {
+		ret = -EINPROGRESS;
+	} else {
+		aead_unmap(jrdev, edesc, req);
+		kfree(edesc);
+	}
+
+	return ret;
+}
+
+static int chachapoly_decrypt(struct aead_request *req)
+{
+	struct aead_edesc *edesc;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	struct device *jrdev = ctx->jrdev;
+	bool all_contig;
+	u32 *desc;
+	int ret;
+
+	edesc = aead_edesc_alloc(req, CHACHAPOLY_DESC_JOB_IO_LEN, &all_contig,
+				 false);
+	if (IS_ERR(edesc))
+		return PTR_ERR(edesc);
+
+	desc = edesc->hw_desc;
+
+	init_chachapoly_job(req, edesc, all_contig, false);
+	print_hex_dump_debug("chachapoly jobdesc@" __stringify(__LINE__)": ",
+			     DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc),
+			     1);
+
+	ret = caam_jr_enqueue(jrdev, desc, aead_decrypt_done, req);
+	if (!ret) {
+		ret = -EINPROGRESS;
+	} else {
+		aead_unmap(jrdev, edesc, req);
+		kfree(edesc);
+	}
+
+	return ret;
+}
+
 static int ipsec_gcm_encrypt(struct aead_request *req)
 {
 	if (req->assoclen < 8)
@@ -3002,6 +3159,50 @@ static struct caam_aead_alg driver_aeads[] = {
 			.geniv = true,
 		},
 	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539-chacha20-poly1305-"
+						   "caam",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = chachapoly_encrypt,
+			.decrypt = chachapoly_decrypt,
+			.ivsize = CHACHAPOLY_IV_SIZE,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539esp(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539esp-chacha20-"
+						   "poly1305-caam",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = chachapoly_encrypt,
+			.decrypt = chachapoly_decrypt,
+			.ivsize = 8,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
 };
 
 static int caam_init_common(struct caam_ctx *ctx, struct caam_alg_entry *caam,
@@ -3135,7 +3336,7 @@ static int __init caam_algapi_init(void)
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
 	int i = 0, err = 0;
-	u32 aes_vid, aes_inst, des_inst, md_vid, md_inst;
+	u32 aes_vid, aes_inst, des_inst, md_vid, md_inst, ccha_inst, ptha_inst;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
 	bool registered = false;
 
@@ -3180,6 +3381,8 @@ static int __init caam_algapi_init(void)
 			   CHA_ID_LS_DES_SHIFT;
 		aes_inst = cha_inst & CHA_ID_LS_AES_MASK;
 		md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+		ccha_inst = 0;
+		ptha_inst = 0;
 	} else {
 		u32 aesa, mdha;
 
@@ -3192,6 +3395,8 @@ static int __init caam_algapi_init(void)
 		des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK;
 		aes_inst = aesa & CHA_VER_NUM_MASK;
 		md_inst = mdha & CHA_VER_NUM_MASK;
+		ccha_inst = rd_reg32(&priv->ctrl->vreg.ccha) & CHA_VER_NUM_MASK;
+		ptha_inst = rd_reg32(&priv->ctrl->vreg.ptha) & CHA_VER_NUM_MASK;
 	}
 
 	/* If MD is present, limit digest size based on LP256 */
@@ -3252,6 +3457,14 @@ static int __init caam_algapi_init(void)
 		if (!aes_inst && (c1_alg_sel == OP_ALG_ALGSEL_AES))
 				continue;
 
+		/* Skip CHACHA20 algorithms if not supported by device */
+		if (c1_alg_sel == OP_ALG_ALGSEL_CHACHA20 && !ccha_inst)
+			continue;
+
+		/* Skip POLY1305 algorithms if not supported by device */
+		if (c2_alg_sel == OP_ALG_ALGSEL_POLY1305 && !ptha_inst)
+			continue;
+
 		/*
 		 * Check support for AES algorithms not available
 		 * on LP devices.
@@ -3263,9 +3476,9 @@ static int __init caam_algapi_init(void)
 		 * Skip algorithms requiring message digests
 		 * if MD or MD size is not supported by device.
 		 */
-		if (c2_alg_sel &&
-		    (!md_inst || (t_alg->aead.maxauthsize > md_limit)))
-				continue;
+		if ((c2_alg_sel & ~OP_ALG_ALGSEL_SUBMASK) == 0x40 &&
+		    (!md_inst || t_alg->aead.maxauthsize > md_limit))
+			continue;
 
 		caam_aead_alg_init(t_alg);
 
diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c
index d850590079a2..0eb2add7e4e2 100644
--- a/drivers/crypto/caam/caamalg_desc.c
+++ b/drivers/crypto/caam/caamalg_desc.c
@@ -1213,6 +1213,117 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata,
 }
 EXPORT_SYMBOL(cnstr_shdsc_rfc4543_decap);
 
+/**
+ * cnstr_shdsc_chachapoly - Chacha20 + Poly1305 generic AEAD (rfc7539) and
+ *                          IPsec ESP (rfc7634, a.k.a. rfc7539esp) shared
+ *                          descriptor (non-protocol).
+ * @desc: pointer to buffer used for descriptor construction
+ * @cdata: pointer to block cipher transform definitions
+ *         Valid algorithm values - OP_ALG_ALGSEL_CHACHA20 ANDed with
+ *         OP_ALG_AAI_AEAD.
+ * @adata: pointer to authentication transform definitions
+ *         Valid algorithm values - OP_ALG_ALGSEL_POLY1305 ANDed with
+ *         OP_ALG_AAI_AEAD.
+ * @ivsize: initialization vector size
+ * @icvsize: integrity check value (ICV) size (truncated or full)
+ * @encap: true if encapsulation, false if decapsulation
+ */
+void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
+			    struct alginfo *adata, unsigned int ivsize,
+			    unsigned int icvsize, const bool encap)
+{
+	u32 *key_jump_cmd, *wait_cmd;
+	u32 nfifo;
+	const bool is_ipsec = (ivsize != CHACHAPOLY_IV_SIZE);
+
+	/* Note: Context registers are saved. */
+	init_sh_desc(desc, HDR_SHARE_SERIAL | HDR_SAVECTX);
+
+	/* skip key loading if they are loaded due to sharing */
+	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+				   JUMP_COND_SHRD);
+
+	append_key_as_imm(desc, cdata->key_virt, cdata->keylen, cdata->keylen,
+			  CLASS_1 | KEY_DEST_CLASS_REG);
+
+	/* For IPsec load the salt from keymat in the context register */
+	if (is_ipsec)
+		append_load_as_imm(desc, cdata->key_virt + cdata->keylen, 4,
+				   LDST_CLASS_1_CCB | LDST_SRCDST_BYTE_CONTEXT |
+				   4 << LDST_OFFSET_SHIFT);
+
+	set_jump_tgt_here(desc, key_jump_cmd);
+
+	/* Class 2 and 1 operations: Poly & ChaCha */
+	if (encap) {
+		append_operation(desc, adata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_ENCRYPT);
+		append_operation(desc, cdata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_ENCRYPT);
+	} else {
+		append_operation(desc, adata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+		append_operation(desc, cdata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_DECRYPT);
+	}
+
+	/*
+	 * MAGIC with NFIFO
+	 * Read associated data from the input and send them to class1 and
+	 * class2 alignment blocks. From class1 send data to output fifo and
+	 * then write it to memory since we don't need to encrypt AD.
+	 */
+	nfifo = NFIFOENTRY_DEST_BOTH | NFIFOENTRY_FC1 | NFIFOENTRY_FC2 |
+		NFIFOENTRY_DTYPE_POLY | NFIFOENTRY_BND;
+	append_load_imm_u32(desc, nfifo, LDST_CLASS_IND_CCB |
+			    LDST_SRCDST_WORD_INFO_FIFO_SM | LDLEN_MATH3);
+
+	append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
+	append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ);
+	append_seq_fifo_load(desc, 0, FIFOLD_TYPE_NOINFOFIFO |
+			     FIFOLD_CLASS_CLASS1 | LDST_VLF);
+	append_move_len(desc, MOVE_AUX_LS | MOVE_SRC_AUX_ABLK |
+			MOVE_DEST_OUTFIFO | MOVELEN_MRSEL_MATH3);
+	append_seq_fifo_store(desc, 0, FIFOST_TYPE_MESSAGE_DATA | LDST_VLF);
+
+	/* IPsec - copy IV at the output */
+	if (is_ipsec)
+		append_seq_fifo_store(desc, ivsize, FIFOST_TYPE_METADATA |
+				      0x2 << 25);
+
+	wait_cmd = append_jump(desc, JUMP_JSL | JUMP_TYPE_LOCAL |
+			       JUMP_COND_NOP | JUMP_TEST_ALL);
+	set_jump_tgt_here(desc, wait_cmd);
+
+	if (encap) {
+		/* Read and write cryptlen bytes */
+		append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+		append_math_add(desc, VARSEQOUTLEN, SEQINLEN, REG0,
+				CAAM_CMD_SZ);
+		aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2);
+
+		/* Write ICV */
+		append_seq_store(desc, icvsize, LDST_CLASS_2_CCB |
+				 LDST_SRCDST_BYTE_CONTEXT);
+	} else {
+		/* Read and write cryptlen bytes */
+		append_math_add(desc, VARSEQINLEN, SEQOUTLEN, REG0,
+				CAAM_CMD_SZ);
+		append_math_add(desc, VARSEQOUTLEN, SEQOUTLEN, REG0,
+				CAAM_CMD_SZ);
+		aead_append_src_dst(desc, FIFOLD_TYPE_MSG);
+
+		/* Load ICV for verification */
+		append_seq_fifo_load(desc, icvsize, FIFOLD_CLASS_CLASS2 |
+				     FIFOLD_TYPE_LAST2 | FIFOLD_TYPE_ICV);
+	}
+
+	print_hex_dump_debug("chachapoly shdesc@" __stringify(__LINE__)": ",
+			     DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc),
+			     1);
+}
+EXPORT_SYMBOL(cnstr_shdsc_chachapoly);
+
 /* For skcipher encrypt and decrypt, read from req->src and write to req->dst */
 static inline void skcipher_append_src_dst(u32 *desc)
 {
diff --git a/drivers/crypto/caam/caamalg_desc.h b/drivers/crypto/caam/caamalg_desc.h
index 1315c8f6f951..a1a7b0e6889d 100644
--- a/drivers/crypto/caam/caamalg_desc.h
+++ b/drivers/crypto/caam/caamalg_desc.h
@@ -96,6 +96,10 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata,
 			       unsigned int ivsize, unsigned int icvsize,
 			       const bool is_qi);
 
+void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
+			    struct alginfo *adata, unsigned int ivsize,
+			    unsigned int icvsize, const bool encap);
+
 void cnstr_shdsc_skcipher_encap(u32 * const desc, struct alginfo *cdata,
 				unsigned int ivsize, const bool is_rfc3686,
 				const u32 ctx1_iv_off);
diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h
index a5081b4050b6..8bde903f9f4a 100644
--- a/drivers/crypto/caam/compat.h
+++ b/drivers/crypto/caam/compat.h
@@ -37,6 +37,7 @@
 #include <crypto/sha.h>
 #include <crypto/md5.h>
 #include <crypto/chacha20.h>
+#include <crypto/poly1305.h>
 #include <crypto/internal/aead.h>
 #include <crypto/authenc.h>
 #include <crypto/akcipher.h>
diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h
index 9d117e51629f..ec10230178c5 100644
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@@ -243,6 +243,7 @@
 #define LDST_SRCDST_WORD_DESCBUF_SHARED	(0x42 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_DESCBUF_JOB_WE	(0x45 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_DESCBUF_SHARED_WE (0x46 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_INFO_FIFO_SM	(0x71 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_INFO_FIFO	(0x7a << LDST_SRCDST_SHIFT)
 
 /* Offset in source/destination */
@@ -285,6 +286,12 @@
 #define LDLEN_SET_OFIFO_OFFSET_SHIFT	0
 #define LDLEN_SET_OFIFO_OFFSET_MASK	(3 << LDLEN_SET_OFIFO_OFFSET_SHIFT)
 
+/* Special Length definitions when dst=sm, nfifo-{sm,m} */
+#define LDLEN_MATH0			0
+#define LDLEN_MATH1			1
+#define LDLEN_MATH2			2
+#define LDLEN_MATH3			3
+
 /*
  * FIFO_LOAD/FIFO_STORE/SEQ_FIFO_LOAD/SEQ_FIFO_STORE
  * Command Constructs
@@ -409,6 +416,7 @@
 #define FIFOST_TYPE_MESSAGE_DATA (0x30 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_RNGSTORE	 (0x34 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_RNGFIFO	 (0x35 << FIFOST_TYPE_SHIFT)
+#define FIFOST_TYPE_METADATA	 (0x3e << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_SKIP	 (0x3f << FIFOST_TYPE_SHIFT)
 
 /*
@@ -1160,6 +1168,7 @@
 #define OP_ALG_ALGSEL_CRC	(0x90 << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_SNOW_F9	(0xA0 << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_CHACHA20	(0xD0 << OP_ALG_ALGSEL_SHIFT)
+#define OP_ALG_ALGSEL_POLY1305	(0xE0 << OP_ALG_ALGSEL_SHIFT)
 
 #define OP_ALG_AAI_SHIFT	4
 #define OP_ALG_AAI_MASK		(0x1ff << OP_ALG_AAI_SHIFT)
@@ -1400,6 +1409,7 @@
 #define MOVE_SRC_MATH3		(0x07 << MOVE_SRC_SHIFT)
 #define MOVE_SRC_INFIFO		(0x08 << MOVE_SRC_SHIFT)
 #define MOVE_SRC_INFIFO_CL	(0x09 << MOVE_SRC_SHIFT)
+#define MOVE_SRC_AUX_ABLK	(0x0a << MOVE_SRC_SHIFT)
 
 #define MOVE_DEST_SHIFT		16
 #define MOVE_DEST_MASK		(0x0f << MOVE_DEST_SHIFT)
@@ -1426,6 +1436,10 @@
 
 #define MOVELEN_MRSEL_SHIFT	0
 #define MOVELEN_MRSEL_MASK	(0x3 << MOVE_LEN_SHIFT)
+#define MOVELEN_MRSEL_MATH0	(0 << MOVELEN_MRSEL_SHIFT)
+#define MOVELEN_MRSEL_MATH1	(1 << MOVELEN_MRSEL_SHIFT)
+#define MOVELEN_MRSEL_MATH2	(2 << MOVELEN_MRSEL_SHIFT)
+#define MOVELEN_MRSEL_MATH3	(3 << MOVELEN_MRSEL_SHIFT)
 
 /*
  * MATH Command Constructs
@@ -1602,6 +1616,7 @@
 #define NFIFOENTRY_DTYPE_IV	(0x2 << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_SAD	(0x3 << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_ICV	(0xA << NFIFOENTRY_DTYPE_SHIFT)
+#define NFIFOENTRY_DTYPE_POLY	(0xB << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_SKIP	(0xE << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_MSG	(0xF << NFIFOENTRY_DTYPE_SHIFT)
 
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index d4256fa4a1d6..2980b8ef1fb1 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -189,6 +189,7 @@ static inline u32 *append_##cmd(u32 * const desc, u32 options) \
 }
 APPEND_CMD_RET(jump, JUMP)
 APPEND_CMD_RET(move, MOVE)
+APPEND_CMD_RET(move_len, MOVE_LEN)
 
 static inline void set_jump_tgt_here(u32 * const desc, u32 *jump_cmd)
 {
@@ -327,7 +328,11 @@ static inline void append_##cmd##_imm_##type(u32 * const desc, type immediate, \
 					     u32 options) \
 { \
 	PRINT_POS; \
-	append_cmd(desc, CMD_##op | IMMEDIATE | options | sizeof(type)); \
+	if (options & LDST_LEN_MASK) \
+		append_cmd(desc, CMD_##op | IMMEDIATE | options); \
+	else \
+		append_cmd(desc, CMD_##op | IMMEDIATE | options | \
+			   sizeof(type)); \
 	append_cmd(desc, immediate); \
 }
 APPEND_CMD_RAW_IMM(load, LOAD, u32);
-- 
cgit v1.2.3-59-g8ed1b


From c10a53367901b36eec6208f1dbaf53da9fd358bb Mon Sep 17 00:00:00 2001
From: Horia Geantă <horia.geanta@nxp.com>
Date: Thu, 8 Nov 2018 15:36:31 +0200
Subject: crypto: caam/qi2 - add support for Chacha20 + Poly1305
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for Chacha20 + Poly1305 combined AEAD:
-generic (rfc7539)
-IPsec (rfc7634 - known as rfc7539esp in the kernel)

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/caam/caamalg.c      |   4 +-
 drivers/crypto/caam/caamalg_desc.c |  24 ++++++-
 drivers/crypto/caam/caamalg_desc.h |   3 +-
 drivers/crypto/caam/caamalg_qi2.c  | 129 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index cbaeb264a261..523565ce0060 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -527,13 +527,13 @@ static int chachapoly_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
-			       ctx->authsize, true);
+			       ctx->authsize, true, false);
 	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
 				   desc_bytes(desc), ctx->dir);
 
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
-			       ctx->authsize, false);
+			       ctx->authsize, false, false);
 	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
 				   desc_bytes(desc), ctx->dir);
 
diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c
index 0eb2add7e4e2..7db1640d3577 100644
--- a/drivers/crypto/caam/caamalg_desc.c
+++ b/drivers/crypto/caam/caamalg_desc.c
@@ -1227,10 +1227,12 @@ EXPORT_SYMBOL(cnstr_shdsc_rfc4543_decap);
  * @ivsize: initialization vector size
  * @icvsize: integrity check value (ICV) size (truncated or full)
  * @encap: true if encapsulation, false if decapsulation
+ * @is_qi: true when called from caam/qi
  */
 void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
 			    struct alginfo *adata, unsigned int ivsize,
-			    unsigned int icvsize, const bool encap)
+			    unsigned int icvsize, const bool encap,
+			    const bool is_qi)
 {
 	u32 *key_jump_cmd, *wait_cmd;
 	u32 nfifo;
@@ -1267,6 +1269,26 @@ void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
 				 OP_ALG_DECRYPT);
 	}
 
+	if (is_qi) {
+		u32 *wait_load_cmd;
+		u32 ctx1_iv_off = is_ipsec ? 8 : 4;
+
+		/* REG3 = assoclen */
+		append_seq_load(desc, 4, LDST_CLASS_DECO |
+				LDST_SRCDST_WORD_DECO_MATH3 |
+				4 << LDST_OFFSET_SHIFT);
+
+		wait_load_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+					    JUMP_COND_CALM | JUMP_COND_NCP |
+					    JUMP_COND_NOP | JUMP_COND_NIP |
+					    JUMP_COND_NIFP);
+		set_jump_tgt_here(desc, wait_load_cmd);
+
+		append_seq_load(desc, ivsize, LDST_CLASS_1_CCB |
+				LDST_SRCDST_BYTE_CONTEXT |
+				ctx1_iv_off << LDST_OFFSET_SHIFT);
+	}
+
 	/*
 	 * MAGIC with NFIFO
 	 * Read associated data from the input and send them to class1 and
diff --git a/drivers/crypto/caam/caamalg_desc.h b/drivers/crypto/caam/caamalg_desc.h
index a1a7b0e6889d..d5ca42ff961a 100644
--- a/drivers/crypto/caam/caamalg_desc.h
+++ b/drivers/crypto/caam/caamalg_desc.h
@@ -98,7 +98,8 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata,
 
 void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
 			    struct alginfo *adata, unsigned int ivsize,
-			    unsigned int icvsize, const bool encap);
+			    unsigned int icvsize, const bool encap,
+			    const bool is_qi);
 
 void cnstr_shdsc_skcipher_encap(u32 * const desc, struct alginfo *cdata,
 				unsigned int ivsize, const bool is_rfc3686,
diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c
index a9e264bb9629..2598640aa98b 100644
--- a/drivers/crypto/caam/caamalg_qi2.c
+++ b/drivers/crypto/caam/caamalg_qi2.c
@@ -462,7 +462,15 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req,
 	edesc->dst_nents = dst_nents;
 	edesc->iv_dma = iv_dma;
 
-	edesc->assoclen = cpu_to_caam32(req->assoclen);
+	if ((alg->caam.class1_alg_type & OP_ALG_ALGSEL_MASK) ==
+	    OP_ALG_ALGSEL_CHACHA20 && ivsize != CHACHAPOLY_IV_SIZE)
+		/*
+		 * The associated data comes already with the IV but we need
+		 * to skip it when we authenticate or encrypt...
+		 */
+		edesc->assoclen = cpu_to_caam32(req->assoclen - ivsize);
+	else
+		edesc->assoclen = cpu_to_caam32(req->assoclen);
 	edesc->assoclen_dma = dma_map_single(dev, &edesc->assoclen, 4,
 					     DMA_TO_DEVICE);
 	if (dma_mapping_error(dev, edesc->assoclen_dma)) {
@@ -532,6 +540,68 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req,
 	return edesc;
 }
 
+static int chachapoly_set_sh_desc(struct crypto_aead *aead)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	struct device *dev = ctx->dev;
+	struct caam_flc *flc;
+	u32 *desc;
+
+	if (!ctx->cdata.keylen || !ctx->authsize)
+		return 0;
+
+	flc = &ctx->flc[ENCRYPT];
+	desc = flc->sh_desc;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, true, true);
+	flc->flc[1] = cpu_to_caam32(desc_len(desc)); /* SDL */
+	dma_sync_single_for_device(dev, ctx->flc_dma[ENCRYPT],
+				   sizeof(flc->flc) + desc_bytes(desc),
+				   ctx->dir);
+
+	flc = &ctx->flc[DECRYPT];
+	desc = flc->sh_desc;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, false, true);
+	flc->flc[1] = cpu_to_caam32(desc_len(desc)); /* SDL */
+	dma_sync_single_for_device(dev, ctx->flc_dma[DECRYPT],
+				   sizeof(flc->flc) + desc_bytes(desc),
+				   ctx->dir);
+
+	return 0;
+}
+
+static int chachapoly_setauthsize(struct crypto_aead *aead,
+				  unsigned int authsize)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (authsize != POLY1305_DIGEST_SIZE)
+		return -EINVAL;
+
+	ctx->authsize = authsize;
+	return chachapoly_set_sh_desc(aead);
+}
+
+static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
+			     unsigned int keylen)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize;
+
+	if (keylen != CHACHA20_KEY_SIZE + saltlen) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	ctx->cdata.key_virt = key;
+	ctx->cdata.keylen = keylen - saltlen;
+
+	return chachapoly_set_sh_desc(aead);
+}
+
 static int gcm_set_sh_desc(struct crypto_aead *aead)
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
@@ -2626,6 +2696,50 @@ static struct caam_aead_alg driver_aeads[] = {
 			.geniv = true,
 		},
 	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539-chacha20-poly1305-"
+						   "caam-qi2",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = aead_encrypt,
+			.decrypt = aead_decrypt,
+			.ivsize = CHACHAPOLY_IV_SIZE,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539esp(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539esp-chacha20-"
+						   "poly1305-caam-qi2",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = aead_encrypt,
+			.decrypt = aead_decrypt,
+			.ivsize = 8,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
 	{
 		.aead = {
 			.base = {
@@ -4963,11 +5077,22 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev)
 		    c1_alg_sel == OP_ALG_ALGSEL_AES)
 			continue;
 
+		/* Skip CHACHA20 algorithms if not supported by device */
+		if (c1_alg_sel == OP_ALG_ALGSEL_CHACHA20 &&
+		    !priv->sec_attr.ccha_acc_num)
+			continue;
+
+		/* Skip POLY1305 algorithms if not supported by device */
+		if (c2_alg_sel == OP_ALG_ALGSEL_POLY1305 &&
+		    !priv->sec_attr.ptha_acc_num)
+			continue;
+
 		/*
 		 * Skip algorithms requiring message digests
 		 * if MD not supported by device.
 		 */
-		if (!priv->sec_attr.md_acc_num && c2_alg_sel)
+		if ((c2_alg_sel & ~OP_ALG_ALGSEL_SUBMASK) == 0x40 &&
+		    !priv->sec_attr.md_acc_num)
 			continue;
 
 		t_alg->caam.dev = dev;
-- 
cgit v1.2.3-59-g8ed1b


From 8ddab428730d66e86ebe90aef15d5f7c5c45fe0d Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 9 Nov 2018 13:16:39 +0000
Subject: padata: clean an indentation issue, remove extraneous space

Trivial fix to clean up an indentation issue

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index d568cc56405f..3e2633ae3bca 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst)
 	if (pinst->flags & PADATA_INVALID)
 		err = -EINVAL;
 
-	 __padata_start(pinst);
+	__padata_start(pinst);
 
 	mutex_unlock(&pinst->lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 05ba88468b7d85c50d0945446248e0fcda7536bb Mon Sep 17 00:00:00 2001
From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Sat, 10 Nov 2018 15:51:16 +0100
Subject: hwrng: bcm2835 - Switch to SPDX identifier

Adopt the SPDX license identifier headers to ease license compliance
management. While we are at this fix the comment style, too.

Cc: Lubomir Rintel <lkundrak@v3.sk>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
Acked-by: Lubomir Rintel <lkundrak@v3.sk>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/bcm2835-rng.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c
index 6767d965c36c..256b0b1d0f26 100644
--- a/drivers/char/hw_random/bcm2835-rng.c
+++ b/drivers/char/hw_random/bcm2835-rng.c
@@ -1,10 +1,7 @@
-/**
+// SPDX-License-Identifier: GPL-2.0
+/*
  * Copyright (c) 2010-2012 Broadcom. All rights reserved.
  * Copyright (c) 2013 Lubomir Rintel
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License ("GPL")
- * version 2, as published by the Free Software Foundation.
  */
 
 #include <linux/hw_random.h>
-- 
cgit v1.2.3-59-g8ed1b


From e4e72063d3c0ee9ba10faeb5645dcdaae2d733e9 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Sun, 11 Nov 2018 10:36:25 +0100
Subject: crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3
 variant

Add a length argument to the single block function for SSSE3, so the
block function may XOR only a partial length of the full block. Given
that the setup code is rather cheap, the function does not process more
than one block; this allows us to keep the block function selection in
the C glue code.

The required branching does not negatively affect performance for full
block sizes. The partial XORing uses simple "rep movsb" to copy the
data before and after doing XOR in SSE. This is rather efficient on
modern processors; movsw can be slightly faster, but the additional
complexity is probably not worth it.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 74 ++++++++++++++++++++++++++-------
 arch/x86/crypto/chacha20_glue.c         | 11 ++---
 2 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 512a2b500fd1..98d130b5e4ab 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -25,12 +25,13 @@ CTRINC:	.octa 0x00000003000000020000000100000000
 
 ENTRY(chacha20_block_xor_ssse3)
 	# %rdi: Input state matrix, s
-	# %rsi: 1 data block output, o
-	# %rdx: 1 data block input, i
+	# %rsi: up to 1 data block output, o
+	# %rdx: up to 1 data block input, i
+	# %rcx: input/output length in bytes
 
 	# This function encrypts one ChaCha20 block by loading the state matrix
 	# in four SSE registers. It performs matrix operation on four words in
-	# parallel, but requireds shuffling to rearrange the words after each
+	# parallel, but requires shuffling to rearrange the words after each
 	# round. 8/16-bit word rotation is done with the slightly better
 	# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
 	# traditional shift+OR.
@@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3)
 	movdqa		ROT8(%rip),%xmm4
 	movdqa		ROT16(%rip),%xmm5
 
-	mov	$10,%ecx
+	mov		%rcx,%rax
+	mov		$10,%ecx
 
 .Ldoubleround:
 
@@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3)
 	jnz		.Ldoubleround
 
 	# o0 = i0 ^ (x0 + s0)
-	movdqu		0x00(%rdx),%xmm4
 	paddd		%xmm8,%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart
+	movdqu		0x00(%rdx),%xmm4
 	pxor		%xmm4,%xmm0
 	movdqu		%xmm0,0x00(%rsi)
 	# o1 = i1 ^ (x1 + s1)
-	movdqu		0x10(%rdx),%xmm5
 	paddd		%xmm9,%xmm1
-	pxor		%xmm5,%xmm1
-	movdqu		%xmm1,0x10(%rsi)
+	movdqa		%xmm1,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart
+	movdqu		0x10(%rdx),%xmm0
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x10(%rsi)
 	# o2 = i2 ^ (x2 + s2)
-	movdqu		0x20(%rdx),%xmm6
 	paddd		%xmm10,%xmm2
-	pxor		%xmm6,%xmm2
-	movdqu		%xmm2,0x20(%rsi)
+	movdqa		%xmm2,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart
+	movdqu		0x20(%rdx),%xmm0
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
 	# o3 = i3 ^ (x3 + s3)
-	movdqu		0x30(%rdx),%xmm7
 	paddd		%xmm11,%xmm3
-	pxor		%xmm7,%xmm3
-	movdqu		%xmm3,0x30(%rsi)
-
+	movdqa		%xmm3,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart
+	movdqu		0x30(%rdx),%xmm0
+	pxor		%xmm3,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
+.Ldone:
 	ret
+
+.Lxorpart:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone
+
 ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index dce7c5d39c2f..cc4571736ce8 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -19,7 +19,8 @@
 
 #define CHACHA20_STATE_ALIGN 16
 
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					 unsigned int len);
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
@@ -29,8 +30,6 @@ static bool chacha20_use_avx2;
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
-	u8 buf[CHACHA20_BLOCK_SIZE];
-
 #ifdef CONFIG_AS_AVX2
 	if (chacha20_use_avx2) {
 		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
@@ -50,16 +49,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 		state[12] += 4;
 	}
 	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_xor_ssse3(state, dst, src);
+		chacha20_block_xor_ssse3(state, dst, src, bytes);
 		bytes -= CHACHA20_BLOCK_SIZE;
 		src += CHACHA20_BLOCK_SIZE;
 		dst += CHACHA20_BLOCK_SIZE;
 		state[12]++;
 	}
 	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_ssse3(state, buf, buf);
-		memcpy(dst, buf, bytes);
+		chacha20_block_xor_ssse3(state, dst, src, bytes);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From db8e15a24957904d10f784a9adc4ea4824ee996c Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Sun, 11 Nov 2018 10:36:26 +0100
Subject: crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3
 variant

Add a length argument to the quad block function for SSSE3, so the
block function may XOR only a partial length of four blocks.

As we already have the stack set up, the partial XORing does not need
to. This gives a slightly different function trailer, so we keep that
separate from the 1-block function.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++++++++++++++++++++++++--------
 arch/x86/crypto/chacha20_glue.c         |   5 +-
 2 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 98d130b5e4ab..d8ac75bb448f 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
 	# %rdi: Input state matrix, s
-	# %rsi: 4 data blocks output, o
-	# %rdx: 4 data blocks input, i
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
 
 	# This function encrypts four consecutive ChaCha20 blocks by loading the
 	# the state matrix in SSE registers four times. As we need some scratch
@@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3)
 	lea		8(%rsp),%r10
 	sub		$0x80,%rsp
 	and		$~63,%rsp
+	mov		%rcx,%rax
 
 	# x0..15[0-3] = s0..3[0..3]
 	movq		0x00(%rdi),%xmm1
@@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3)
 
 	# xor with corresponding input, write to output
 	movdqa		0x00(%rsp),%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
 	movdqu		0x00(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
 	movdqu		%xmm0,0x00(%rsi)
-	movdqa		0x10(%rsp),%xmm0
-	movdqu		0x80(%rdx),%xmm1
+
+	movdqu		%xmm4,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	movdqu		0x10(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x80(%rsi)
+	movdqu		%xmm0,0x10(%rsi)
+
+	movdqu		%xmm8,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	movdqu		0x20(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
+
+	movdqu		%xmm12,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	movdqu		0x30(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
 	movdqa		0x20(%rsp),%xmm0
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
 	movdqu		0x40(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
 	movdqu		%xmm0,0x40(%rsi)
+
+	movdqu		%xmm6,%xmm0
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	movdqu		0x50(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x50(%rsi)
+
+	movdqu		%xmm10,%xmm0
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	movdqu		0x60(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x60(%rsi)
+
+	movdqu		%xmm14,%xmm0
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	movdqu		0x70(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x70(%rsi)
+
+	movdqa		0x10(%rsp),%xmm0
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	movdqu		0x80(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x80(%rsi)
+
+	movdqu		%xmm5,%xmm0
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	movdqu		0x90(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x90(%rsi)
+
+	movdqu		%xmm9,%xmm0
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	movdqu		0xa0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xa0(%rsi)
+
+	movdqu		%xmm13,%xmm0
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	movdqu		0xb0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xb0(%rsi)
+
 	movdqa		0x30(%rsp),%xmm0
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
 	movdqu		0xc0(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
 	movdqu		%xmm0,0xc0(%rsi)
-	movdqu		0x10(%rdx),%xmm1
-	pxor		%xmm1,%xmm4
-	movdqu		%xmm4,0x10(%rsi)
-	movdqu		0x90(%rdx),%xmm1
-	pxor		%xmm1,%xmm5
-	movdqu		%xmm5,0x90(%rsi)
-	movdqu		0x50(%rdx),%xmm1
-	pxor		%xmm1,%xmm6
-	movdqu		%xmm6,0x50(%rsi)
+
+	movdqu		%xmm7,%xmm0
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
 	movdqu		0xd0(%rdx),%xmm1
-	pxor		%xmm1,%xmm7
-	movdqu		%xmm7,0xd0(%rsi)
-	movdqu		0x20(%rdx),%xmm1
-	pxor		%xmm1,%xmm8
-	movdqu		%xmm8,0x20(%rsi)
-	movdqu		0xa0(%rdx),%xmm1
-	pxor		%xmm1,%xmm9
-	movdqu		%xmm9,0xa0(%rsi)
-	movdqu		0x60(%rdx),%xmm1
-	pxor		%xmm1,%xmm10
-	movdqu		%xmm10,0x60(%rsi)
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xd0(%rsi)
+
+	movdqu		%xmm11,%xmm0
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
 	movdqu		0xe0(%rdx),%xmm1
-	pxor		%xmm1,%xmm11
-	movdqu		%xmm11,0xe0(%rsi)
-	movdqu		0x30(%rdx),%xmm1
-	pxor		%xmm1,%xmm12
-	movdqu		%xmm12,0x30(%rsi)
-	movdqu		0xb0(%rdx),%xmm1
-	pxor		%xmm1,%xmm13
-	movdqu		%xmm13,0xb0(%rsi)
-	movdqu		0x70(%rdx),%xmm1
-	pxor		%xmm1,%xmm14
-	movdqu		%xmm14,0x70(%rsi)
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xe0(%rsi)
+
+	movdqu		%xmm15,%xmm0
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
 	movdqu		0xf0(%rdx),%xmm1
-	pxor		%xmm1,%xmm15
-	movdqu		%xmm15,0xf0(%rsi)
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xf0(%rsi)
 
+.Ldone4:
 	lea		-8(%r10),%rsp
 	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone4
+
 ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index cc4571736ce8..8f1ef1a9ce5c 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -21,7 +21,8 @@
 
 asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					  unsigned int len);
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
 static bool chacha20_use_avx2;
@@ -42,7 +43,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 #endif
 	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_ssse3(state, dst, src);
+		chacha20_4block_xor_ssse3(state, dst, src, bytes);
 		bytes -= CHACHA20_BLOCK_SIZE * 4;
 		src += CHACHA20_BLOCK_SIZE * 4;
 		dst += CHACHA20_BLOCK_SIZE * 4;
-- 
cgit v1.2.3-59-g8ed1b


From c3b734dd325dadc73c2f5b4d187208730bf21df5 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Sun, 11 Nov 2018 10:36:27 +0100
Subject: crypto: x86/chacha20 - Support partial lengths in 8-block AVX2
 variant

Add a length argument to the eight block function for AVX2, so the
block function may XOR only a partial length of eight blocks.

To avoid unnecessary operations, we integrate XORing of the first four
blocks in the final lane interleaving; this also avoids some work in
the partial lengths path.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 189 +++++++++++++++++++++++----------
 arch/x86/crypto/chacha20_glue.c        |   5 +-
 2 files changed, 133 insertions(+), 61 deletions(-)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index f3cd26f48332..7b62d55bee3d 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -30,8 +30,9 @@ CTRINC:	.octa 0x00000003000000020000000100000000
 
 ENTRY(chacha20_8block_xor_avx2)
 	# %rdi: Input state matrix, s
-	# %rsi: 8 data blocks output, o
-	# %rdx: 8 data blocks input, i
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
 
 	# This function encrypts eight consecutive ChaCha20 blocks by loading
 	# the state matrix in AVX registers eight times. As we need some
@@ -48,6 +49,7 @@ ENTRY(chacha20_8block_xor_avx2)
 	lea		8(%rsp),%r10
 	and		$~31, %rsp
 	sub		$0x80, %rsp
+	mov		%rcx,%rax
 
 	# x0..15[0-7] = s[0..15]
 	vpbroadcastd	0x00(%rdi),%ymm0
@@ -375,74 +377,143 @@ ENTRY(chacha20_8block_xor_avx2)
 	vpunpckhqdq	%ymm15,%ymm0,%ymm15
 
 	# interleave 128-bit words in state n, n+4
-	vmovdqa		0x00(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm4,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
-	vmovdqa		%ymm1,0x00(%rsp)
-	vmovdqa		0x20(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm5,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm5,%ymm0,%ymm5
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		0x40(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm6,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm6,%ymm0,%ymm6
-	vmovdqa		%ymm1,0x40(%rsp)
-	vmovdqa		0x60(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm7,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm7,%ymm0,%ymm7
-	vmovdqa		%ymm1,0x60(%rsp)
+	# xor/write first four blocks
+	vmovdqa		0x00(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
+	cmp		$0x0020,%rax
+	jl		.Lxorpart8
+	vpxor		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0000(%rsi)
+	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
+
 	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rax
+	jl		.Lxorpart8
+	vpxor		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0020(%rsi)
 	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
-	vmovdqa		%ymm0,%ymm8
-	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
-	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
-	vmovdqa		%ymm0,%ymm9
+
+	vmovdqa		0x40(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
+	cmp		$0x0060,%rax
+	jl		.Lxorpart8
+	vpxor		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
+
 	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rax
+	jl		.Lxorpart8
+	vpxor		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0060(%rsi)
 	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
-	vmovdqa		%ymm0,%ymm10
-	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
-	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
-	vmovdqa		%ymm0,%ymm11
 
-	# xor with corresponding input, write to output
-	vmovdqa		0x00(%rsp),%ymm0
-	vpxor		0x0000(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0000(%rsi)
-	vmovdqa		0x20(%rsp),%ymm0
+	vmovdqa		0x20(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rax
+	jl		.Lxorpart8
 	vpxor		0x0080(%rdx),%ymm0,%ymm0
 	vmovdqu		%ymm0,0x0080(%rsi)
-	vmovdqa		0x40(%rsp),%ymm0
-	vpxor		0x0040(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0040(%rsi)
-	vmovdqa		0x60(%rsp),%ymm0
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vmovdqa		0x60(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
+	cmp		$0x00e0,%rax
+	jl		.Lxorpart8
 	vpxor		0x00c0(%rdx),%ymm0,%ymm0
 	vmovdqu		%ymm0,0x00c0(%rsi)
-	vpxor		0x0100(%rdx),%ymm4,%ymm4
-	vmovdqu		%ymm4,0x0100(%rsi)
-	vpxor		0x0180(%rdx),%ymm5,%ymm5
-	vmovdqu		%ymm5,0x00180(%rsi)
-	vpxor		0x0140(%rdx),%ymm6,%ymm6
-	vmovdqu		%ymm6,0x0140(%rsi)
-	vpxor		0x01c0(%rdx),%ymm7,%ymm7
-	vmovdqu		%ymm7,0x01c0(%rsi)
-	vpxor		0x0020(%rdx),%ymm8,%ymm8
-	vmovdqu		%ymm8,0x0020(%rsi)
-	vpxor		0x00a0(%rdx),%ymm9,%ymm9
-	vmovdqu		%ymm9,0x00a0(%rsi)
-	vpxor		0x0060(%rdx),%ymm10,%ymm10
-	vmovdqu		%ymm10,0x0060(%rsi)
-	vpxor		0x00e0(%rdx),%ymm11,%ymm11
-	vmovdqu		%ymm11,0x00e0(%rsi)
-	vpxor		0x0120(%rdx),%ymm12,%ymm12
-	vmovdqu		%ymm12,0x0120(%rsi)
-	vpxor		0x01a0(%rdx),%ymm13,%ymm13
-	vmovdqu		%ymm13,0x01a0(%rsi)
-	vpxor		0x0160(%rdx),%ymm14,%ymm14
-	vmovdqu		%ymm14,0x0160(%rsi)
-	vpxor		0x01e0(%rdx),%ymm15,%ymm15
-	vmovdqu		%ymm15,0x01e0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rax
+	jl		.Lxorpart8
+	vpxor		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa		%ymm4,%ymm0
+	cmp		$0x0120,%rax
+	jl		.Lxorpart8
+	vpxor		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0100(%rsi)
 
+	vmovdqa		%ymm12,%ymm0
+	cmp		$0x0140,%rax
+	jl		.Lxorpart8
+	vpxor		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0120(%rsi)
+
+	vmovdqa		%ymm6,%ymm0
+	cmp		$0x0160,%rax
+	jl		.Lxorpart8
+	vpxor		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0140(%rsi)
+
+	vmovdqa		%ymm14,%ymm0
+	cmp		$0x0180,%rax
+	jl		.Lxorpart8
+	vpxor		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0160(%rsi)
+
+	vmovdqa		%ymm5,%ymm0
+	cmp		$0x01a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0180(%rsi)
+
+	vmovdqa		%ymm13,%ymm0
+	cmp		$0x01c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01a0(%rsi)
+
+	vmovdqa		%ymm7,%ymm0
+	cmp		$0x01e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01c0(%rsi)
+
+	vmovdqa		%ymm15,%ymm0
+	cmp		$0x0200,%rax
+	jl		.Lxorpart8
+	vpxor		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01e0(%rsi)
+
+.Ldone8:
 	vzeroupper
 	lea		-8(%r10),%rsp
 	ret
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x1f,%r9
+	jz		.Ldone8
+	and		$~0x1f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone8
+
 ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 8f1ef1a9ce5c..882e8bf5965a 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -24,7 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					  unsigned int len);
 #ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+					 unsigned int len);
 static bool chacha20_use_avx2;
 #endif
 
@@ -34,7 +35,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 #ifdef CONFIG_AS_AVX2
 	if (chacha20_use_avx2) {
 		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
-			chacha20_8block_xor_avx2(state, dst, src);
+			chacha20_8block_xor_avx2(state, dst, src, bytes);
 			bytes -= CHACHA20_BLOCK_SIZE * 8;
 			src += CHACHA20_BLOCK_SIZE * 8;
 			dst += CHACHA20_BLOCK_SIZE * 8;
-- 
cgit v1.2.3-59-g8ed1b


From 9b17608f15b940babe2e32522ea29787abd10af2 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Sun, 11 Nov 2018 10:36:28 +0100
Subject: crypto: x86/chacha20 - Use larger block functions more aggressively

Now that all block functions support partial lengths, engage the wider
block sizes more aggressively. This prevents using smaller block
functions multiple times, where the next larger block function would
have been faster.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20_glue.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 882e8bf5965a..b541da71f11e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -29,6 +29,12 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 static bool chacha20_use_avx2;
 #endif
 
+static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
+{
+	len = min(len, maxblocks * CHACHA20_BLOCK_SIZE);
+	return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
@@ -41,6 +47,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			dst += CHACHA20_BLOCK_SIZE * 8;
 			state[12] += 8;
 		}
+		if (bytes > CHACHA20_BLOCK_SIZE * 4) {
+			chacha20_8block_xor_avx2(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 8);
+			return;
+		}
 	}
 #endif
 	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
@@ -50,15 +61,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 		dst += CHACHA20_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_xor_ssse3(state, dst, src, bytes);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-		state[12]++;
+	if (bytes > CHACHA20_BLOCK_SIZE) {
+		chacha20_4block_xor_ssse3(state, dst, src, bytes);
+		state[12] += chacha20_advance(bytes, 4);
+		return;
 	}
 	if (bytes) {
 		chacha20_block_xor_ssse3(state, dst, src, bytes);
+		state[12]++;
 	}
 }
 
@@ -82,17 +92,16 @@ static int chacha20_simd(struct skcipher_request *req)
 
 	kernel_fpu_begin();
 
-	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = skcipher_walk_done(&walk,
-					 walk.nbytes % CHACHA20_BLOCK_SIZE);
-	}
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
 
-	if (walk.nbytes) {
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				walk.nbytes);
-		err = skcipher_walk_done(&walk, 0);
+				nbytes);
+
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
 	kernel_fpu_end();
-- 
cgit v1.2.3-59-g8ed1b


From a5dd97f86211e91219807db607d740f9896b8e0b Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Sun, 11 Nov 2018 10:36:29 +0100
Subject: crypto: x86/chacha20 - Add a 2-block AVX2 variant

This variant uses the same principle as the single block SSSE3 variant
by shuffling the state matrix after each round. With the wider AVX
registers, we can do two blocks in parallel, though.

This function can increase performance and efficiency significantly for
lengths that would otherwise require a 4-block function.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 197 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/chacha20_glue.c        |   7 ++
 2 files changed, 204 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 7b62d55bee3d..8247076b0ba7 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -26,8 +26,205 @@ ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
 CTRINC:	.octa 0x00000003000000020000000100000000
 	.octa 0x00000007000000060000000500000004
 
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
 .text
 
+ENTRY(chacha20_2block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+
+	# This function encrypts two ChaCha20 blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-2] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+	vmovdqa		ROT8(%rip),%ymm4
+	vmovdqa		ROT16(%rip),%ymm5
+
+	mov		%rcx,%rax
+	mov		$10,%ecx
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	dec		%ecx
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rax
+	jl		.Lxorpart2
+	vpxor		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rax
+	jl		.Lxorpart2
+	vpxor		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rax
+	jl		.Lxorpart2
+	vpxor		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rax
+	jl		.Lxorpart2
+	vpxor		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rax
+	jl		.Lxorpart2
+	vpxor		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rax
+	jl		.Lxorpart2
+	vpxor		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rax
+	jl		.Lxorpart2
+	vpxor		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rax
+	jl		.Lxorpart2
+	vpxor		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	ret
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone2
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm7,%xmm7
+	vmovdqa		%xmm7,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone2
+
+ENDPROC(chacha20_2block_xor_avx2)
+
 ENTRY(chacha20_8block_xor_avx2)
 	# %rdi: Input state matrix, s
 	# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index b541da71f11e..82e46589a189 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -24,6 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					  unsigned int len);
 #ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+					 unsigned int len);
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 static bool chacha20_use_avx2;
@@ -52,6 +54,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			state[12] += chacha20_advance(bytes, 8);
 			return;
 		}
+		if (bytes > CHACHA20_BLOCK_SIZE) {
+			chacha20_2block_xor_avx2(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 2);
+			return;
+		}
 	}
 #endif
 	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-- 
cgit v1.2.3-59-g8ed1b


From 8a5a79d5556b822143b4403fc46068d4eef2e4e2 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Sun, 11 Nov 2018 10:36:30 +0100
Subject: crypto: x86/chacha20 - Add a 4-block AVX2 variant

This variant builds upon the idea of the 2-block AVX2 variant that
shuffles words after each round. The shuffling has a rather high latency,
so the arithmetic units are not optimally used.

Given that we have plenty of registers in AVX, this version parallelizes
the 2-block variant to do four blocks. While the first two blocks are
shuffling, the CPU can do the XORing on the second two blocks and
vice-versa, which makes this version much faster than the SSSE3 variant
for four blocks. The latter is now mostly for systems that do not have
AVX2, but there it is the work-horse, so we keep it in place.

The partial XORing function trailer is very similar to the AVX2 2-block
variant. While it could be shared, that code segment is rather short;
profiling is also easier with the trailer integrated, so we keep it per
function.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-avx2-x86_64.S | 310 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/chacha20_glue.c        |   7 +
 2 files changed, 317 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 8247076b0ba7..b6ab082be657 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -31,6 +31,11 @@ CTRINC:	.octa 0x00000003000000020000000100000000
 CTR2BL:	.octa 0x00000000000000000000000000000000
 	.octa 0x00000000000000000000000000000001
 
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
 .text
 
 ENTRY(chacha20_2block_xor_avx2)
@@ -225,6 +230,311 @@ ENTRY(chacha20_2block_xor_avx2)
 
 ENDPROC(chacha20_2block_xor_avx2)
 
+ENTRY(chacha20_4block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+
+	# This function encrypts four ChaCha20 block by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, sequentially
+	# to the operations on the four words of the other two matrices. The
+	# required word shuffling has a rather high latency, we can do the
+	# arithmetic on two matrix-pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-4] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+	vmovdqa		ROT8(%rip),%ymm8
+	vmovdqa		ROT16(%rip),%ymm9
+
+	mov		%rcx,%rax
+	mov		$10,%ecx
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	dec		%ecx
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
+	vpxor		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	vpxor		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	vpxor		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	vpxor		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
+	vpxor		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	vpxor		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	vpxor		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	vpxor		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	vpxor		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	vpxor		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	vpxor		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	vpxor		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
+	vpxor		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
+	vpxor		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
+	vpxor		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
+	vpxor		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm10,%xmm10
+	vmovdqa		%xmm10,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone4
+
+ENDPROC(chacha20_4block_xor_avx2)
+
 ENTRY(chacha20_8block_xor_avx2)
 	# %rdi: Input state matrix, s
 	# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 82e46589a189..9fd84fe6ec09 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -26,6 +26,8 @@ asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
+asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+					 unsigned int len);
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 static bool chacha20_use_avx2;
@@ -54,6 +56,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			state[12] += chacha20_advance(bytes, 8);
 			return;
 		}
+		if (bytes > CHACHA20_BLOCK_SIZE * 2) {
+			chacha20_4block_xor_avx2(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 4);
+			return;
+		}
 		if (bytes > CHACHA20_BLOCK_SIZE) {
 			chacha20_2block_xor_avx2(state, dst, src, bytes);
 			state[12] += chacha20_advance(bytes, 2);
-- 
cgit v1.2.3-59-g8ed1b


From 3da2c1dfdb802b184eea0653d1e589515b52d74b Mon Sep 17 00:00:00 2001
From: Vitaly Chikunov <vt@altlinux.org>
Date: Sun, 11 Nov 2018 20:40:02 +0300
Subject: crypto: ecc - regularize scalar for scalar multiplication

ecc_point_mult is supposed to be used with a regularized scalar,
otherwise, it's possible to deduce the position of the top bit of the
scalar with timing attack. This is important when the scalar is a
private key.

ecc_point_mult is already using a regular algorithm (i.e. having an
operation flow independent of the input scalar) but regularization step
is not implemented.

Arrange scalar to always have fixed top bit by adding a multiple of the
curve order (n).

References:
The constant time regularization step is based on micro-ecc by Kenneth
MacKay and also referenced in the literature (Bernstein, D. J., & Lange,
T. (2017). Montgomery curves and the Montgomery ladder. (Cryptology
ePrint Archive; Vol. 2017/293). s.l.: IACR. Chapter 4.6.2.)

Signed-off-by: Vitaly Chikunov <vt@altlinux.org>
Cc: kernel-hardening@lists.openwall.com
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/crypto/ecc.c b/crypto/ecc.c
index 9d24e6522730..ed1237115066 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -842,15 +842,23 @@ static void xycz_add_c(u64 *x1, u64 *y1, u64 *x2, u64 *y2, u64 *curve_prime,
 
 static void ecc_point_mult(struct ecc_point *result,
 			   const struct ecc_point *point, const u64 *scalar,
-			   u64 *initial_z, u64 *curve_prime,
+			   u64 *initial_z, const struct ecc_curve *curve,
 			   unsigned int ndigits)
 {
 	/* R0 and R1 */
 	u64 rx[2][ECC_MAX_DIGITS];
 	u64 ry[2][ECC_MAX_DIGITS];
 	u64 z[ECC_MAX_DIGITS];
+	u64 sk[2][ECC_MAX_DIGITS];
+	u64 *curve_prime = curve->p;
 	int i, nb;
-	int num_bits = vli_num_bits(scalar, ndigits);
+	int num_bits;
+	int carry;
+
+	carry = vli_add(sk[0], scalar, curve->n, ndigits);
+	vli_add(sk[1], sk[0], curve->n, ndigits);
+	scalar = sk[!carry];
+	num_bits = sizeof(u64) * ndigits * 8 + 1;
 
 	vli_set(rx[1], point->x, ndigits);
 	vli_set(ry[1], point->y, ndigits);
@@ -1014,7 +1022,7 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits,
 		goto out;
 	}
 
-	ecc_point_mult(pk, &curve->g, priv, NULL, curve->p, ndigits);
+	ecc_point_mult(pk, &curve->g, priv, NULL, curve, ndigits);
 	if (ecc_point_is_zero(pk)) {
 		ret = -EAGAIN;
 		goto err_free_point;
@@ -1100,7 +1108,7 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits,
 		goto err_alloc_product;
 	}
 
-	ecc_point_mult(product, pk, priv, rand_z, curve->p, ndigits);
+	ecc_point_mult(product, pk, priv, rand_z, curve, ndigits);
 
 	ecc_swap_digits(product->x, secret, ndigits);
 
-- 
cgit v1.2.3-59-g8ed1b


From 2b78aeb366365bd9cbf56c710e8b2ac494620306 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 14 Nov 2018 11:10:53 -0800
Subject: crypto: inside-secure - remove useless setting of type flags

Remove the unnecessary setting of CRYPTO_ALG_TYPE_SKCIPHER.
Commit 2c95e6d97892 ("crypto: skcipher - remove useless setting of type
flags") took care of this everywhere else, but a few more instances made
it into the tree at about the same time.  Squash them before they get
copy+pasted around again.

This patch shouldn't change any actual behavior.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/inside-secure/safexcel_cipher.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c
index 3aef1d43e435..d531c14020dc 100644
--- a/drivers/crypto/inside-secure/safexcel_cipher.c
+++ b/drivers/crypto/inside-secure/safexcel_cipher.c
@@ -970,7 +970,7 @@ struct safexcel_alg_template safexcel_alg_cbc_des = {
 			.cra_name = "cbc(des)",
 			.cra_driver_name = "safexcel-cbc-des",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
@@ -1010,7 +1010,7 @@ struct safexcel_alg_template safexcel_alg_ecb_des = {
 			.cra_name = "ecb(des)",
 			.cra_driver_name = "safexcel-ecb-des",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
@@ -1074,7 +1074,7 @@ struct safexcel_alg_template safexcel_alg_cbc_des3_ede = {
 			.cra_name = "cbc(des3_ede)",
 			.cra_driver_name = "safexcel-cbc-des3_ede",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
@@ -1114,7 +1114,7 @@ struct safexcel_alg_template safexcel_alg_ecb_des3_ede = {
 			.cra_name = "ecb(des3_ede)",
 			.cra_driver_name = "safexcel-ecb-des3_ede",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
-- 
cgit v1.2.3-59-g8ed1b


From d41655909e3236bfb00aa69f435a9634cd74b60b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 14 Nov 2018 11:35:48 -0800
Subject: crypto: remove useless initializations of cra_list

Some algorithms initialize their .cra_list prior to registration.
But this is unnecessary since crypto_register_alg() will overwrite
.cra_list when adding the algorithm to the 'crypto_alg_list'.
Apparently the useless assignment has just been copy+pasted around.

So, remove the useless assignments.

Exception: paes_s390.c uses cra_list to check whether the algorithm is
registered or not, so I left that as-is for now.

This patch shouldn't change any actual behavior.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/sparc/crypto/aes_glue.c      | 5 -----
 arch/sparc/crypto/camellia_glue.c | 5 -----
 arch/sparc/crypto/des_glue.c      | 5 -----
 crypto/lz4.c                      | 1 -
 crypto/lz4hc.c                    | 1 -
 drivers/crypto/bcm/cipher.c       | 2 --
 drivers/crypto/omap-aes.c         | 2 --
 drivers/crypto/omap-des.c         | 1 -
 drivers/crypto/qce/ablkcipher.c   | 1 -
 drivers/crypto/qce/sha.c          | 1 -
 drivers/crypto/sahara.c           | 1 -
 11 files changed, 25 deletions(-)

diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c
index 3cd4f6b198b6..a9b8b0b94a8d 100644
--- a/arch/sparc/crypto/aes_glue.c
+++ b/arch/sparc/crypto/aes_glue.c
@@ -476,11 +476,6 @@ static bool __init sparc64_has_aes_opcode(void)
 
 static int __init aes_sparc64_mod_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(algs); i++)
-		INIT_LIST_HEAD(&algs[i].cra_list);
-
 	if (sparc64_has_aes_opcode()) {
 		pr_info("Using sparc64 aes opcodes optimized AES implementation\n");
 		return crypto_register_algs(algs, ARRAY_SIZE(algs));
diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c
index 561a84d93cf6..900d5c617e83 100644
--- a/arch/sparc/crypto/camellia_glue.c
+++ b/arch/sparc/crypto/camellia_glue.c
@@ -299,11 +299,6 @@ static bool __init sparc64_has_camellia_opcode(void)
 
 static int __init camellia_sparc64_mod_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(algs); i++)
-		INIT_LIST_HEAD(&algs[i].cra_list);
-
 	if (sparc64_has_camellia_opcode()) {
 		pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n");
 		return crypto_register_algs(algs, ARRAY_SIZE(algs));
diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c
index 61af794aa2d3..56499ea39fd3 100644
--- a/arch/sparc/crypto/des_glue.c
+++ b/arch/sparc/crypto/des_glue.c
@@ -510,11 +510,6 @@ static bool __init sparc64_has_des_opcode(void)
 
 static int __init des_sparc64_mod_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(algs); i++)
-		INIT_LIST_HEAD(&algs[i].cra_list);
-
 	if (sparc64_has_des_opcode()) {
 		pr_info("Using sparc64 des opcodes optimized DES implementation\n");
 		return crypto_register_algs(algs, ARRAY_SIZE(algs));
diff --git a/crypto/lz4.c b/crypto/lz4.c
index 2ce2660d3519..c160dfdbf2e0 100644
--- a/crypto/lz4.c
+++ b/crypto/lz4.c
@@ -122,7 +122,6 @@ static struct crypto_alg alg_lz4 = {
 	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
 	.cra_ctxsize		= sizeof(struct lz4_ctx),
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(alg_lz4.cra_list),
 	.cra_init		= lz4_init,
 	.cra_exit		= lz4_exit,
 	.cra_u			= { .compress = {
diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c
index 2be14f054daf..583b5e013d7a 100644
--- a/crypto/lz4hc.c
+++ b/crypto/lz4hc.c
@@ -123,7 +123,6 @@ static struct crypto_alg alg_lz4hc = {
 	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
 	.cra_ctxsize		= sizeof(struct lz4hc_ctx),
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(alg_lz4hc.cra_list),
 	.cra_init		= lz4hc_init,
 	.cra_exit		= lz4hc_exit,
 	.cra_u			= { .compress = {
diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c
index 4d67e22a4664..2ce3a16d3d10 100644
--- a/drivers/crypto/bcm/cipher.c
+++ b/drivers/crypto/bcm/cipher.c
@@ -4605,7 +4605,6 @@ static int spu_register_ablkcipher(struct iproc_alg_s *driver_alg)
 	crypto->cra_priority = cipher_pri;
 	crypto->cra_alignmask = 0;
 	crypto->cra_ctxsize = sizeof(struct iproc_ctx_s);
-	INIT_LIST_HEAD(&crypto->cra_list);
 
 	crypto->cra_init = ablkcipher_cra_init;
 	crypto->cra_exit = generic_cra_exit;
@@ -4691,7 +4690,6 @@ static int spu_register_aead(struct iproc_alg_s *driver_alg)
 	aead->base.cra_priority = aead_pri;
 	aead->base.cra_alignmask = 0;
 	aead->base.cra_ctxsize = sizeof(struct iproc_ctx_s);
-	INIT_LIST_HEAD(&aead->base.cra_list);
 
 	aead->base.cra_flags |= CRYPTO_ALG_ASYNC;
 	/* setkey set in alg initialization */
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
index a553ffddb11b..4c0ea8142923 100644
--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@@ -1222,7 +1222,6 @@ static int omap_aes_probe(struct platform_device *pdev)
 				algp = &dd->pdata->algs_info[i].algs_list[j];
 
 				pr_debug("reg alg: %s\n", algp->cra_name);
-				INIT_LIST_HEAD(&algp->cra_list);
 
 				err = crypto_register_alg(algp);
 				if (err)
@@ -1240,7 +1239,6 @@ static int omap_aes_probe(struct platform_device *pdev)
 			algp = &aalg->base;
 
 			pr_debug("reg alg: %s\n", algp->cra_name);
-			INIT_LIST_HEAD(&algp->cra_list);
 
 			err = crypto_register_aead(aalg);
 			if (err)
diff --git a/drivers/crypto/omap-des.c b/drivers/crypto/omap-des.c
index eb95b0d7f184..6369019219d4 100644
--- a/drivers/crypto/omap-des.c
+++ b/drivers/crypto/omap-des.c
@@ -1069,7 +1069,6 @@ static int omap_des_probe(struct platform_device *pdev)
 			algp = &dd->pdata->algs_info[i].algs_list[j];
 
 			pr_debug("reg alg: %s\n", algp->cra_name);
-			INIT_LIST_HEAD(&algp->cra_list);
 
 			err = crypto_register_alg(algp);
 			if (err)
diff --git a/drivers/crypto/qce/ablkcipher.c b/drivers/crypto/qce/ablkcipher.c
index 585e1cab9ae3..25c13e26d012 100644
--- a/drivers/crypto/qce/ablkcipher.c
+++ b/drivers/crypto/qce/ablkcipher.c
@@ -376,7 +376,6 @@ static int qce_ablkcipher_register_one(const struct qce_ablkcipher_def *def,
 	alg->cra_module = THIS_MODULE;
 	alg->cra_init = qce_ablkcipher_init;
 	alg->cra_exit = qce_ablkcipher_exit;
-	INIT_LIST_HEAD(&alg->cra_list);
 
 	INIT_LIST_HEAD(&tmpl->entry);
 	tmpl->crypto_alg_type = CRYPTO_ALG_TYPE_ABLKCIPHER;
diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c
index d8a5db11b7ea..fc45f5ea6fdd 100644
--- a/drivers/crypto/qce/sha.c
+++ b/drivers/crypto/qce/sha.c
@@ -508,7 +508,6 @@ static int qce_ahash_register_one(const struct qce_ahash_def *def,
 	base->cra_alignmask = 0;
 	base->cra_module = THIS_MODULE;
 	base->cra_init = qce_ahash_cra_init;
-	INIT_LIST_HEAD(&base->cra_list);
 
 	snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, "%s", def->name);
 	snprintf(base->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c
index bbf166a97ad3..8c32a3059b4a 100644
--- a/drivers/crypto/sahara.c
+++ b/drivers/crypto/sahara.c
@@ -1321,7 +1321,6 @@ static int sahara_register_algs(struct sahara_dev *dev)
 	unsigned int i, j, k, l;
 
 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
-		INIT_LIST_HEAD(&aes_algs[i].cra_list);
 		err = crypto_register_alg(&aes_algs[i]);
 		if (err)
 			goto err_aes_algs;
-- 
cgit v1.2.3-59-g8ed1b


From 1ad0f1603a6b2afb62a1c065409aaa4e43ca7627 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 14 Nov 2018 12:19:39 -0800
Subject: crypto: drop mask=CRYPTO_ALG_ASYNC from 'cipher' tfm allocations

'cipher' algorithms (single block ciphers) are always synchronous, so
passing CRYPTO_ALG_ASYNC in the mask to crypto_alloc_cipher() has no
effect.  Many users therefore already don't pass it, but some still do.
This inconsistency can cause confusion, especially since the way the
'mask' argument works is somewhat counterintuitive.

Thus, just remove the unneeded CRYPTO_ALG_ASYNC flags.

This patch shouldn't change any actual behavior.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/s390/crypto/aes_s390.c                               | 2 +-
 drivers/crypto/amcc/crypto4xx_alg.c                       | 3 +--
 drivers/crypto/ccp/ccp-crypto-aes-cmac.c                  | 4 +---
 drivers/crypto/geode-aes.c                                | 2 +-
 drivers/md/dm-crypt.c                                     | 2 +-
 drivers/net/wireless/cisco/airo.c                         | 2 +-
 drivers/staging/rtl8192e/rtllib_crypt_ccmp.c              | 2 +-
 drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c | 2 +-
 drivers/usb/wusbcore/crypto.c                             | 2 +-
 net/bluetooth/smp.c                                       | 6 +++---
 net/mac80211/wep.c                                        | 4 ++--
 net/wireless/lib80211_crypt_ccmp.c                        | 2 +-
 net/wireless/lib80211_crypt_tkip.c                        | 4 ++--
 net/wireless/lib80211_crypt_wep.c                         | 4 ++--
 14 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 812d9498d97b..dd456725189f 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -137,7 +137,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm)
 	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	sctx->fallback.cip = crypto_alloc_cipher(name, 0,
-			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+						 CRYPTO_ALG_NEED_FALLBACK);
 
 	if (IS_ERR(sctx->fallback.cip)) {
 		pr_err("Allocating AES fallback algorithm %s failed\n",
diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c
index f5c07498ea4f..4092c2aad8e2 100644
--- a/drivers/crypto/amcc/crypto4xx_alg.c
+++ b/drivers/crypto/amcc/crypto4xx_alg.c
@@ -520,8 +520,7 @@ static int crypto4xx_compute_gcm_hash_key_sw(__le32 *hash_start, const u8 *key,
 	uint8_t src[16] = { 0 };
 	int rc = 0;
 
-	aes_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC |
-				      CRYPTO_ALG_NEED_FALLBACK);
+	aes_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_NEED_FALLBACK);
 	if (IS_ERR(aes_tfm)) {
 		rc = PTR_ERR(aes_tfm);
 		pr_warn("could not load aes cipher driver: %d\n", rc);
diff --git a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c
index 3c6fe57f91f8..9108015e56cc 100644
--- a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c
+++ b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c
@@ -346,9 +346,7 @@ static int ccp_aes_cmac_cra_init(struct crypto_tfm *tfm)
 
 	crypto_ahash_set_reqsize(ahash, sizeof(struct ccp_aes_cmac_req_ctx));
 
-	cipher_tfm = crypto_alloc_cipher("aes", 0,
-					 CRYPTO_ALG_ASYNC |
-					 CRYPTO_ALG_NEED_FALLBACK);
+	cipher_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_NEED_FALLBACK);
 	if (IS_ERR(cipher_tfm)) {
 		pr_warn("could not load aes cipher driver\n");
 		return PTR_ERR(cipher_tfm);
diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c
index eb2a0a73cbed..b4c24a35b3d0 100644
--- a/drivers/crypto/geode-aes.c
+++ b/drivers/crypto/geode-aes.c
@@ -261,7 +261,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm)
 	struct geode_aes_op *op = crypto_tfm_ctx(tfm);
 
 	op->fallback.cip = crypto_alloc_cipher(name, 0,
-				CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+					       CRYPTO_ALG_NEED_FALLBACK);
 
 	if (IS_ERR(op->fallback.cip)) {
 		printk(KERN_ERR "Error allocating fallback algo %s\n", name);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b8eec515a003..a7195eb5b8d8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -377,7 +377,7 @@ static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
 	int err;
 
 	/* Setup the essiv_tfm with the given salt */
-	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
 	if (IS_ERR(essiv_tfm)) {
 		ti->error = "Error allocating crypto tfm for ESSIV";
 		return essiv_tfm;
diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c
index 04dd7a936593..6fab69fe6c92 100644
--- a/drivers/net/wireless/cisco/airo.c
+++ b/drivers/net/wireless/cisco/airo.c
@@ -1359,7 +1359,7 @@ static int micsetup(struct airo_info *ai) {
 	int i;
 
 	if (ai->tfm == NULL)
-	        ai->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+		ai->tfm = crypto_alloc_cipher("aes", 0, 0);
 
         if (IS_ERR(ai->tfm)) {
                 airo_print_err(ai->dev->name, "failed to load transform for AES");
diff --git a/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c b/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c
index bc45cf098b04..91871503364d 100644
--- a/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c
+++ b/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c
@@ -67,7 +67,7 @@ static void *rtllib_ccmp_init(int key_idx)
 		goto fail;
 	priv->key_idx = key_idx;
 
-	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(priv->tfm)) {
 		pr_debug("Could not allocate crypto API aes\n");
 		priv->tfm = NULL;
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
index 041f1b123888..3534ddb900d1 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
@@ -71,7 +71,7 @@ static void *ieee80211_ccmp_init(int key_idx)
 		goto fail;
 	priv->key_idx = key_idx;
 
-	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(priv->tfm)) {
 		pr_debug("ieee80211_crypt_ccmp: could not allocate crypto API aes\n");
 		priv->tfm = NULL;
diff --git a/drivers/usb/wusbcore/crypto.c b/drivers/usb/wusbcore/crypto.c
index 68ddee86a886..edb7263bff40 100644
--- a/drivers/usb/wusbcore/crypto.c
+++ b/drivers/usb/wusbcore/crypto.c
@@ -316,7 +316,7 @@ ssize_t wusb_prf(void *out, size_t out_size,
 		goto error_setkey_cbc;
 	}
 
-	tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(tfm_aes)) {
 		result = PTR_ERR(tfm_aes);
 		printk(KERN_ERR "E: can't load AES: %d\n", (int)result);
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index c822e626761b..1f94a25beef6 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -1390,7 +1390,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
 	if (!smp)
 		return NULL;
 
-	smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(smp->tfm_aes)) {
 		BT_ERR("Unable to create AES crypto context");
 		goto zfree_smp;
@@ -3233,7 +3233,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
 	if (!smp)
 		return ERR_PTR(-ENOMEM);
 
-	tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(tfm_aes)) {
 		BT_ERR("Unable to create AES crypto context");
 		kzfree(smp);
@@ -3906,7 +3906,7 @@ int __init bt_selftest_smp(void)
 	struct crypto_kpp *tfm_ecdh;
 	int err;
 
-	tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(tfm_aes)) {
 		BT_ERR("Unable to create AES crypto context");
 		return PTR_ERR(tfm_aes);
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 73e8f347802e..bfe9ed9f4c48 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -30,13 +30,13 @@ int ieee80211_wep_init(struct ieee80211_local *local)
 	/* start WEP IV from a random value */
 	get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN);
 
-	local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(local->wep_tx_tfm)) {
 		local->wep_rx_tfm = ERR_PTR(-EINVAL);
 		return PTR_ERR(local->wep_tx_tfm);
 	}
 
-	local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(local->wep_rx_tfm)) {
 		crypto_free_cipher(local->wep_tx_tfm);
 		local->wep_tx_tfm = ERR_PTR(-EINVAL);
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index 6beab0cfcb99..55214fe925b2 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -75,7 +75,7 @@ static void *lib80211_ccmp_init(int key_idx)
 		goto fail;
 	priv->key_idx = key_idx;
 
-	priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	priv->tfm = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(priv->tfm)) {
 		priv->tfm = NULL;
 		goto fail;
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index b5e235573c8a..35f06563207d 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -99,7 +99,7 @@ static void *lib80211_tkip_init(int key_idx)
 
 	priv->key_idx = key_idx;
 
-	priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->tx_tfm_arc4)) {
 		priv->tx_tfm_arc4 = NULL;
 		goto fail;
@@ -111,7 +111,7 @@ static void *lib80211_tkip_init(int key_idx)
 		goto fail;
 	}
 
-	priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->rx_tfm_arc4)) {
 		priv->rx_tfm_arc4 = NULL;
 		goto fail;
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 6015f6b542a6..20c1ad63ad44 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
 		goto fail;
 	priv->key_idx = keyidx;
 
-	priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->tx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->tx_tfm)) {
 		priv->tx_tfm = NULL;
 		goto fail;
 	}
 
-	priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->rx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->rx_tfm)) {
 		priv->rx_tfm = NULL;
 		goto fail;
-- 
cgit v1.2.3-59-g8ed1b


From 3d234b3313cd12157946522fe35f5a4574f31169 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 14 Nov 2018 12:21:11 -0800
Subject: crypto: drop mask=CRYPTO_ALG_ASYNC from 'shash' tfm allocations

'shash' algorithms are always synchronous, so passing CRYPTO_ALG_ASYNC
in the mask to crypto_alloc_shash() has no effect.  Many users therefore
already don't pass it, but some still do.  This inconsistency can cause
confusion, especially since the way the 'mask' argument works is
somewhat counterintuitive.

Thus, just remove the unneeded CRYPTO_ALG_ASYNC flags.

This patch shouldn't change any actual behavior.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/block/drbd/drbd_receiver.c          | 2 +-
 drivers/md/dm-integrity.c                   | 2 +-
 drivers/net/wireless/intersil/orinoco/mic.c | 6 ++----
 fs/ubifs/auth.c                             | 5 ++---
 net/bluetooth/smp.c                         | 2 +-
 security/apparmor/crypto.c                  | 2 +-
 security/integrity/evm/evm_crypto.c         | 3 +--
 security/keys/encrypted-keys/encrypted.c    | 4 ++--
 security/keys/trusted.c                     | 4 ++--
 9 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 61c392752fe4..ccfcf00f2798 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3623,7 +3623,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 		 * change.
 		 */
 
-		peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
 		if (IS_ERR(peer_integrity_tfm)) {
 			peer_integrity_tfm = NULL;
 			drbd_err(connection, "peer data-integrity-alg %s not supported\n",
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index bb3096bf2cc6..d4ad0bfee251 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2804,7 +2804,7 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
 	int r;
 
 	if (a->alg_string) {
-		*hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
+		*hash = crypto_alloc_shash(a->alg_string, 0, 0);
 		if (IS_ERR(*hash)) {
 			*error = error_alg;
 			r = PTR_ERR(*hash);
diff --git a/drivers/net/wireless/intersil/orinoco/mic.c b/drivers/net/wireless/intersil/orinoco/mic.c
index 08bc7822f820..709d9ab3e7bc 100644
--- a/drivers/net/wireless/intersil/orinoco/mic.c
+++ b/drivers/net/wireless/intersil/orinoco/mic.c
@@ -16,8 +16,7 @@
 /********************************************************************/
 int orinoco_mic_init(struct orinoco_private *priv)
 {
-	priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0,
-					      CRYPTO_ALG_ASYNC);
+	priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0, 0);
 	if (IS_ERR(priv->tx_tfm_mic)) {
 		printk(KERN_DEBUG "orinoco_mic_init: could not allocate "
 		       "crypto API michael_mic\n");
@@ -25,8 +24,7 @@ int orinoco_mic_init(struct orinoco_private *priv)
 		return -ENOMEM;
 	}
 
-	priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0,
-					      CRYPTO_ALG_ASYNC);
+	priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0, 0);
 	if (IS_ERR(priv->rx_tfm_mic)) {
 		printk(KERN_DEBUG "orinoco_mic_init: could not allocate "
 		       "crypto API michael_mic\n");
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 124e965a28b3..5bf5fd08879e 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -269,8 +269,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
 		goto out;
 	}
 
-	c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0,
-					 CRYPTO_ALG_ASYNC);
+	c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0);
 	if (IS_ERR(c->hash_tfm)) {
 		err = PTR_ERR(c->hash_tfm);
 		ubifs_err(c, "Can not allocate %s: %d",
@@ -286,7 +285,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
 		goto out_free_hash;
 	}
 
-	c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+	c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
 	if (IS_ERR(c->hmac_tfm)) {
 		err = PTR_ERR(c->hmac_tfm);
 		ubifs_err(c, "Can not allocate %s: %d", hmac_name, err);
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 1f94a25beef6..621146d04c03 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3912,7 +3912,7 @@ int __init bt_selftest_smp(void)
 		return PTR_ERR(tfm_aes);
 	}
 
-	tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+	tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm_cmac)) {
 		BT_ERR("Unable to create CMAC crypto context");
 		crypto_free_cipher(tfm_aes);
diff --git a/security/apparmor/crypto.c b/security/apparmor/crypto.c
index 136f2a047836..af03d98c7552 100644
--- a/security/apparmor/crypto.c
+++ b/security/apparmor/crypto.c
@@ -112,7 +112,7 @@ static int __init init_profile_hash(void)
 	if (!apparmor_initialized)
 		return 0;
 
-	tfm = crypto_alloc_shash("sha1", 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash("sha1", 0, 0);
 	if (IS_ERR(tfm)) {
 		int error = PTR_ERR(tfm);
 		AA_ERROR("failed to setup profile sha1 hashing: %d\n", error);
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 8c25f949ebdb..1820099dc74b 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -97,8 +97,7 @@ static struct shash_desc *init_desc(char type, uint8_t hash_algo)
 		mutex_lock(&mutex);
 		if (*tfm)
 			goto out;
-		*tfm = crypto_alloc_shash(algo, 0,
-					  CRYPTO_ALG_ASYNC | CRYPTO_NOLOAD);
+		*tfm = crypto_alloc_shash(algo, 0, CRYPTO_NOLOAD);
 		if (IS_ERR(*tfm)) {
 			rc = PTR_ERR(*tfm);
 			pr_err("Can not allocate %s (reason: %ld)\n", algo, rc);
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index d92cbf9687c3..a3891ae9fa0f 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -342,7 +342,7 @@ static int calc_hmac(u8 *digest, const u8 *key, unsigned int keylen,
 	struct crypto_shash *tfm;
 	int err;
 
-	tfm = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash(hmac_alg, 0, 0);
 	if (IS_ERR(tfm)) {
 		pr_err("encrypted_key: can't alloc %s transform: %ld\n",
 		       hmac_alg, PTR_ERR(tfm));
@@ -984,7 +984,7 @@ static int __init init_encrypted(void)
 {
 	int ret;
 
-	hash_tfm = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC);
+	hash_tfm = crypto_alloc_shash(hash_alg, 0, 0);
 	if (IS_ERR(hash_tfm)) {
 		pr_err("encrypted_key: can't allocate %s transform: %ld\n",
 		       hash_alg, PTR_ERR(hash_tfm));
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index ff6789365a12..2d4f16b2a7c5 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1199,14 +1199,14 @@ static int __init trusted_shash_alloc(void)
 {
 	int ret;
 
-	hmacalg = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC);
+	hmacalg = crypto_alloc_shash(hmac_alg, 0, 0);
 	if (IS_ERR(hmacalg)) {
 		pr_info("trusted_key: could not allocate crypto %s\n",
 			hmac_alg);
 		return PTR_ERR(hmacalg);
 	}
 
-	hashalg = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC);
+	hashalg = crypto_alloc_shash(hash_alg, 0, 0);
 	if (IS_ERR(hashalg)) {
 		pr_info("trusted_key: could not allocate crypto %s\n",
 			hash_alg);
-- 
cgit v1.2.3-59-g8ed1b


From dd333449d0fb667c5250c42488a7e90470e16c77 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:18 -0800
Subject: crypto: chacha20-generic - add HChaCha20 library function

Refactor the unkeyed permutation part of chacha20_block() into its own
function, then add hchacha20_block() which is the ChaCha equivalent of
HSalsa20 and is an intermediate step towards XChaCha20 (see
https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha20 skips the
final addition of the initial state, and outputs only certain words of
the state.  It should not be used for streaming directly.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/chacha20.h |  2 ++
 lib/chacha20.c            | 50 +++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index 2d3129442a52..56073814eef0 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha20.h
@@ -20,6 +20,8 @@ struct chacha20_ctx {
 };
 
 void chacha20_block(u32 *state, u8 *stream);
+void hchacha20_block(const u32 *in, u32 *out);
+
 void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
 int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 			   unsigned int keysize);
diff --git a/lib/chacha20.c b/lib/chacha20.c
index d907fec6a9ed..6a484e16171d 100644
--- a/lib/chacha20.c
+++ b/lib/chacha20.c
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539
+ * The "hash function" used as the core of the ChaCha20 stream cipher (RFC7539)
  *
  * Copyright (C) 2015 Martin Willi
  *
@@ -16,14 +16,10 @@
 #include <asm/unaligned.h>
 #include <crypto/chacha20.h>
 
-void chacha20_block(u32 *state, u8 *stream)
+static void chacha20_permute(u32 *x)
 {
-	u32 x[16];
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(x); i++)
-		x[i] = state[i];
-
 	for (i = 0; i < 20; i += 2) {
 		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
 		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
@@ -65,6 +61,25 @@ void chacha20_block(u32 *state, u8 *stream)
 		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
 		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
 	}
+}
+
+/**
+ * chacha20_block - generate one keystream block and increment block counter
+ * @state: input state matrix (16 32-bit words)
+ * @stream: output keystream block (64 bytes)
+ *
+ * This is the ChaCha20 core, a function from 64-byte strings to 64-byte
+ * strings.  The caller has already converted the endianness of the input.  This
+ * function also handles incrementing the block counter in the input matrix.
+ */
+void chacha20_block(u32 *state, u8 *stream)
+{
+	u32 x[16];
+	int i;
+
+	memcpy(x, state, 64);
+
+	chacha20_permute(x);
 
 	for (i = 0; i < ARRAY_SIZE(x); i++)
 		put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
@@ -72,3 +87,26 @@ void chacha20_block(u32 *state, u8 *stream)
 	state[12]++;
 }
 EXPORT_SYMBOL(chacha20_block);
+
+/**
+ * hchacha20_block - abbreviated ChaCha20 core, for XChaCha20
+ * @in: input state matrix (16 32-bit words)
+ * @out: output (8 32-bit words)
+ *
+ * HChaCha20 is the ChaCha equivalent of HSalsa20 and is an intermediate step
+ * towards XChaCha20 (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).
+ * HChaCha20 skips the final addition of the initial state, and outputs only
+ * certain words of the state.  It should not be used for streaming directly.
+ */
+void hchacha20_block(const u32 *in, u32 *out)
+{
+	u32 x[16];
+
+	memcpy(x, in, 64);
+
+	chacha20_permute(x);
+
+	memcpy(&out[0], &x[0], 16);
+	memcpy(&out[4], &x[12], 16);
+}
+EXPORT_SYMBOL(hchacha20_block);
-- 
cgit v1.2.3-59-g8ed1b


From 5e04542a0e0763294e9fced73a149c38c4e0cee5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:19 -0800
Subject: crypto: chacha20-generic - don't unnecessarily use atomic walk

chacha20-generic doesn't use SIMD instructions or otherwise disable
preemption, so passing atomic=true to skcipher_walk_virt() is
unnecessary.

Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/chacha20_generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
index 3ae96587caf9..3529521d72a4 100644
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
@@ -81,7 +81,7 @@ int crypto_chacha20_crypt(struct skcipher_request *req)
 	u32 state[16];
 	int err;
 
-	err = skcipher_walk_virt(&walk, req, true);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	crypto_chacha20_init(state, ctx, walk.iv);
 
-- 
cgit v1.2.3-59-g8ed1b


From de61d7ae5d3789dcba3749a418f76613fbee8414 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:20 -0800
Subject: crypto: chacha20-generic - add XChaCha20 support

Add support for the XChaCha20 stream cipher.  XChaCha20 is the
application of the XSalsa20 construction
(https://cr.yp.to/snuffle/xsalsa-20081128.pdf) to ChaCha20 rather than
to Salsa20.  XChaCha20 extends ChaCha20's nonce length from 64 bits (or
96 bits, depending on convention) to 192 bits, while provably retaining
ChaCha20's security.  XChaCha20 uses the ChaCha20 permutation to map the
key and first 128 nonce bits to a 256-bit subkey.  Then, it does the
ChaCha20 stream cipher with the subkey and remaining 64 bits of nonce.

We need XChaCha support in order to add support for the Adiantum
encryption mode.  Note that to meet our performance requirements, we
actually plan to primarily use the variant XChaCha12.  But we believe
it's wise to first add XChaCha20 as a baseline with a higher security
margin, in case there are any situations where it can be used.
Supporting both variants is straightforward.

Since XChaCha20's subkey differs for each request, XChaCha20 can't be a
template that wraps ChaCha20; that would require re-keying the
underlying ChaCha20 for every request, which wouldn't be thread-safe.
Instead, we make XChaCha20 its own top-level algorithm which calls the
ChaCha20 streaming implementation internally.

Similar to the existing ChaCha20 implementation, we define the IV to be
the nonce and stream position concatenated together.  This allows users
to seek to any position in the stream.

I considered splitting the code into separate chacha20-common, chacha20,
and xchacha20 modules, so that chacha20 and xchacha20 could be
enabled/disabled independently.  However, since nearly all the code is
shared anyway, I ultimately decided there would have been little benefit
to the added complexity of separate modules.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig            |  14 +-
 crypto/chacha20_generic.c | 120 +++++++---
 crypto/testmgr.c          |   6 +
 crypto/testmgr.h          | 577 ++++++++++++++++++++++++++++++++++++++++++++++
 include/crypto/chacha20.h |  14 +-
 5 files changed, 689 insertions(+), 42 deletions(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 62dbd1a99fa3..75ebd1a2746c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1403,18 +1403,22 @@ config CRYPTO_SALSA20
 	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
 
 config CRYPTO_CHACHA20
-	tristate "ChaCha20 cipher algorithm"
+	tristate "ChaCha20 stream cipher algorithms"
 	select CRYPTO_BLKCIPHER
 	help
-	  ChaCha20 cipher algorithm, RFC7539.
+	  The ChaCha20 and XChaCha20 stream cipher algorithms.
 
 	  ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
 	  Bernstein and further specified in RFC7539 for use in IETF protocols.
-	  This is the portable C implementation of ChaCha20.
-
-	  See also:
+	  This is the portable C implementation of ChaCha20.  See also:
 	  <http://cr.yp.to/chacha/chacha-20080128.pdf>
 
+	  XChaCha20 is the application of the XSalsa20 construction to ChaCha20
+	  rather than to Salsa20.  XChaCha20 extends ChaCha20's nonce length
+	  from 64 bits (or 96 bits using the RFC7539 convention) to 192 bits,
+	  while provably retaining ChaCha20's security.  See also:
+	  <https://cr.yp.to/snuffle/xsalsa-20081128.pdf>
+
 config CRYPTO_CHACHA20_X86_64
 	tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
 	depends on X86 && 64BIT
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
index 3529521d72a4..4305b1b62b16 100644
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
@@ -1,7 +1,8 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539
+ * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms
  *
  * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2018 Google LLC
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,6 +37,31 @@ static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
+static int chacha20_stream_xor(struct skcipher_request *req,
+			       struct chacha20_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha20_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+				 nbytes);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
 void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
 {
 	state[0]  = 0x61707865; /* "expa" */
@@ -77,54 +103,74 @@ int crypto_chacha20_crypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
 
-	crypto_chacha20_init(state, ctx, walk.iv);
+	return chacha20_stream_xor(req, ctx, req->iv);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
 
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
+int crypto_xchacha20_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha20_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
 
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
+	/* Compute the subkey given the original key and first 128 nonce bits */
+	crypto_chacha20_init(state, ctx, req->iv);
+	hchacha20_block(state, subctx.key);
 
-		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
-				 nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
+	/* Build the real IV */
+	memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */
+	memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */
 
-	return err;
+	/* Generate the stream and XOR it with the data */
+	return chacha20_stream_xor(req, &subctx, real_iv);
 }
-EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-generic",
-	.base.cra_priority	= 100,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= crypto_chacha20_crypt,
-	.decrypt		= crypto_chacha20_crypt,
+EXPORT_SYMBOL_GPL(crypto_xchacha20_crypt);
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA20_KEY_SIZE,
+		.max_keysize		= CHACHA20_KEY_SIZE,
+		.ivsize			= CHACHA20_IV_SIZE,
+		.chunksize		= CHACHA20_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= crypto_chacha20_crypt,
+		.decrypt		= crypto_chacha20_crypt,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA20_KEY_SIZE,
+		.max_keysize		= CHACHA20_KEY_SIZE,
+		.ivsize			= XCHACHA20_IV_SIZE,
+		.chunksize		= CHACHA20_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= crypto_xchacha20_crypt,
+		.decrypt		= crypto_xchacha20_crypt,
+	}
 };
 
 static int __init chacha20_generic_mod_init(void)
 {
-	return crypto_register_skcipher(&alg);
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 static void __exit chacha20_generic_mod_fini(void)
 {
-	crypto_unregister_skcipher(&alg);
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 module_init(chacha20_generic_mod_init);
@@ -132,6 +178,8 @@ module_exit(chacha20_generic_mod_fini);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm");
+MODULE_DESCRIPTION("ChaCha20 and XChaCha20 stream ciphers (generic)");
 MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-generic");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 379794a259a7..11f5c8b0f4dc 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3576,6 +3576,12 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(aes_xcbc128_tv_template)
 		}
+	}, {
+		.alg = "xchacha20",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(xchacha20_tv_template)
+		},
 	}, {
 		.alg = "xts(aes)",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index fc1b6c0e9ed2..df0dc44a9b7b 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -30994,6 +30994,583 @@ static const struct cipher_testvec chacha20_tv_template[] = {
 	},
 };
 
+static const struct cipher_testvec xchacha20_tv_template[] = {
+	{ /* from libsodium test/default/xchacha20.c */
+		.key	= "\x79\xc9\x97\x98\xac\x67\x30\x0b"
+			  "\xbb\x27\x04\xc9\x5c\x34\x1e\x32"
+			  "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e"
+			  "\x52\xff\x45\xb2\x4f\x30\x4f\xc4",
+		.klen	= 32,
+		.iv	= "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf"
+			  "\xbc\x9a\xee\x49\x41\x76\x88\xa0"
+			  "\xa2\x55\x4f\x8d\x95\x38\x94\x19"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00",
+		.ctext	= "\xc6\xe9\x75\x81\x60\x08\x3a\xc6"
+			  "\x04\xef\x90\xe7\x12\xce\x6e\x75"
+			  "\xd7\x79\x75\x90\x74\x4e\x0c\xf0"
+			  "\x60\xf0\x13\x73\x9c",
+		.len	= 29,
+	}, { /* from libsodium test/default/xchacha20.c */
+		.key	= "\x9d\x23\xbd\x41\x49\xcb\x97\x9c"
+			  "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98"
+			  "\x08\xcb\x0e\x50\xcd\x0f\x67\x81"
+			  "\x22\x35\xea\xaf\x60\x1d\x62\x32",
+		.klen	= 32,
+		.iv	= "\xc0\x47\x54\x82\x66\xb7\xc3\x70"
+			  "\xd3\x35\x66\xa2\x42\x5c\xbf\x30"
+			  "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00",
+		.ctext	= "\xa2\x12\x09\x09\x65\x94\xde\x8c"
+			  "\x56\x67\xb1\xd1\x3a\xd9\x3f\x74"
+			  "\x41\x06\xd0\x54\xdf\x21\x0e\x47"
+			  "\x82\xcd\x39\x6f\xec\x69\x2d\x35"
+			  "\x15\xa2\x0b\xf3\x51\xee\xc0\x11"
+			  "\xa9\x2c\x36\x78\x88\xbc\x46\x4c"
+			  "\x32\xf0\x80\x7a\xcd\x6c\x20\x3a"
+			  "\x24\x7e\x0d\xb8\x54\x14\x84\x68"
+			  "\xe9\xf9\x6b\xee\x4c\xf7\x18\xd6"
+			  "\x8d\x5f\x63\x7c\xbd\x5a\x37\x64"
+			  "\x57\x78\x8e\x6f\xae\x90\xfc\x31"
+			  "\x09\x7c\xfc",
+		.len	= 91,
+	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+		to nonce, and recomputed the ciphertext with libsodium */
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x67\xc6\x69\x73"
+			  "\x51\xff\x4a\xec\x29\xcd\xba\xab"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ctext	= "\x9c\x49\x2a\xe7\x8a\x2f\x93\xc7"
+			  "\xb3\x33\x6f\x82\x17\xd8\xc4\x1e"
+			  "\xad\x80\x11\x11\x1d\x4c\x16\x18"
+			  "\x07\x73\x9b\x4f\xdb\x7c\xcb\x47"
+			  "\xfd\xef\x59\x74\xfa\x3f\xe5\x4c"
+			  "\x9b\xd0\xea\xbc\xba\x56\xad\x32"
+			  "\x03\xdc\xf8\x2b\xc1\xe1\x75\x67"
+			  "\x23\x7b\xe6\xfc\xd4\x03\x86\x54",
+		.len	= 64,
+	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+		to nonce, and recomputed the ciphertext with libsodium */
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x01",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\xf2\xfb\xe3\x46"
+			  "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d"
+			  "\x01\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x41\x6e\x79\x20\x73\x75\x62\x6d"
+			  "\x69\x73\x73\x69\x6f\x6e\x20\x74"
+			  "\x6f\x20\x74\x68\x65\x20\x49\x45"
+			  "\x54\x46\x20\x69\x6e\x74\x65\x6e"
+			  "\x64\x65\x64\x20\x62\x79\x20\x74"
+			  "\x68\x65\x20\x43\x6f\x6e\x74\x72"
+			  "\x69\x62\x75\x74\x6f\x72\x20\x66"
+			  "\x6f\x72\x20\x70\x75\x62\x6c\x69"
+			  "\x63\x61\x74\x69\x6f\x6e\x20\x61"
+			  "\x73\x20\x61\x6c\x6c\x20\x6f\x72"
+			  "\x20\x70\x61\x72\x74\x20\x6f\x66"
+			  "\x20\x61\x6e\x20\x49\x45\x54\x46"
+			  "\x20\x49\x6e\x74\x65\x72\x6e\x65"
+			  "\x74\x2d\x44\x72\x61\x66\x74\x20"
+			  "\x6f\x72\x20\x52\x46\x43\x20\x61"
+			  "\x6e\x64\x20\x61\x6e\x79\x20\x73"
+			  "\x74\x61\x74\x65\x6d\x65\x6e\x74"
+			  "\x20\x6d\x61\x64\x65\x20\x77\x69"
+			  "\x74\x68\x69\x6e\x20\x74\x68\x65"
+			  "\x20\x63\x6f\x6e\x74\x65\x78\x74"
+			  "\x20\x6f\x66\x20\x61\x6e\x20\x49"
+			  "\x45\x54\x46\x20\x61\x63\x74\x69"
+			  "\x76\x69\x74\x79\x20\x69\x73\x20"
+			  "\x63\x6f\x6e\x73\x69\x64\x65\x72"
+			  "\x65\x64\x20\x61\x6e\x20\x22\x49"
+			  "\x45\x54\x46\x20\x43\x6f\x6e\x74"
+			  "\x72\x69\x62\x75\x74\x69\x6f\x6e"
+			  "\x22\x2e\x20\x53\x75\x63\x68\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x63\x6c\x75"
+			  "\x64\x65\x20\x6f\x72\x61\x6c\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x20\x49\x45"
+			  "\x54\x46\x20\x73\x65\x73\x73\x69"
+			  "\x6f\x6e\x73\x2c\x20\x61\x73\x20"
+			  "\x77\x65\x6c\x6c\x20\x61\x73\x20"
+			  "\x77\x72\x69\x74\x74\x65\x6e\x20"
+			  "\x61\x6e\x64\x20\x65\x6c\x65\x63"
+			  "\x74\x72\x6f\x6e\x69\x63\x20\x63"
+			  "\x6f\x6d\x6d\x75\x6e\x69\x63\x61"
+			  "\x74\x69\x6f\x6e\x73\x20\x6d\x61"
+			  "\x64\x65\x20\x61\x74\x20\x61\x6e"
+			  "\x79\x20\x74\x69\x6d\x65\x20\x6f"
+			  "\x72\x20\x70\x6c\x61\x63\x65\x2c"
+			  "\x20\x77\x68\x69\x63\x68\x20\x61"
+			  "\x72\x65\x20\x61\x64\x64\x72\x65"
+			  "\x73\x73\x65\x64\x20\x74\x6f",
+		.ctext	= "\xf9\xab\x7a\x4a\x60\xb8\x5f\xa0"
+			  "\x50\xbb\x57\xce\xef\x8c\xc1\xd9"
+			  "\x24\x15\xb3\x67\x5e\x7f\x01\xf6"
+			  "\x1c\x22\xf6\xe5\x71\xb1\x43\x64"
+			  "\x63\x05\xd5\xfc\x5c\x3d\xc0\x0e"
+			  "\x23\xef\xd3\x3b\xd9\xdc\x7f\xa8"
+			  "\x58\x26\xb3\xd0\xc2\xd5\x04\x3f"
+			  "\x0a\x0e\x8f\x17\xe4\xcd\xf7\x2a"
+			  "\xb4\x2c\x09\xe4\x47\xec\x8b\xfb"
+			  "\x59\x37\x7a\xa1\xd0\x04\x7e\xaa"
+			  "\xf1\x98\x5f\x24\x3d\x72\x9a\x43"
+			  "\xa4\x36\x51\x92\x22\x87\xff\x26"
+			  "\xce\x9d\xeb\x59\x78\x84\x5e\x74"
+			  "\x97\x2e\x63\xc0\xef\x29\xf7\x8a"
+			  "\xb9\xee\x35\x08\x77\x6a\x35\x9a"
+			  "\x3e\xe6\x4f\x06\x03\x74\x1b\xc1"
+			  "\x5b\xb3\x0b\x89\x11\x07\xd3\xb7"
+			  "\x53\xd6\x25\x04\xd9\x35\xb4\x5d"
+			  "\x4c\x33\x5a\xc2\x42\x4c\xe6\xa4"
+			  "\x97\x6e\x0e\xd2\xb2\x8b\x2f\x7f"
+			  "\x28\xe5\x9f\xac\x4b\x2e\x02\xab"
+			  "\x85\xfa\xa9\x0d\x7c\x2d\x10\xe6"
+			  "\x91\xab\x55\x63\xf0\xde\x3a\x94"
+			  "\x25\x08\x10\x03\xc2\x68\xd1\xf4"
+			  "\xaf\x7d\x9c\x99\xf7\x86\x96\x30"
+			  "\x60\xfc\x0b\xe6\xa8\x80\x15\xb0"
+			  "\x81\xb1\x0c\xbe\xb9\x12\x18\x25"
+			  "\xe9\x0e\xb1\xe7\x23\xb2\xef\x4a"
+			  "\x22\x8f\xc5\x61\x89\xd4\xe7\x0c"
+			  "\x64\x36\x35\x61\xb6\x34\x60\xf7"
+			  "\x7b\x61\x37\x37\x12\x10\xa2\xf6"
+			  "\x7e\xdb\x7f\x39\x3f\xb6\x8e\x89"
+			  "\x9e\xf3\xfe\x13\x98\xbb\x66\x5a"
+			  "\xec\xea\xab\x3f\x9c\x87\xc4\x8c"
+			  "\x8a\x04\x18\x49\xfc\x77\x11\x50"
+			  "\x16\xe6\x71\x2b\xee\xc0\x9c\xb6"
+			  "\x87\xfd\x80\xff\x0b\x1d\x73\x38"
+			  "\xa4\x1d\x6f\xae\xe4\x12\xd7\x93"
+			  "\x9d\xcd\x38\x26\x09\x40\x52\xcd"
+			  "\x67\x01\x67\x26\xe0\x3e\x98\xa8"
+			  "\xe8\x1a\x13\x41\xbb\x90\x4d\x87"
+			  "\xbb\x42\x82\x39\xce\x3a\xd0\x18"
+			  "\x6d\x7b\x71\x8f\xbb\x2c\x6a\xd1"
+			  "\xbd\xf5\xc7\x8a\x7e\xe1\x1e\x0f"
+			  "\x0d\x0d\x13\x7c\xd9\xd8\x3c\x91"
+			  "\xab\xff\x1f\x12\xc3\xee\xe5\x65"
+			  "\x12\x8d\x7b\x61\xe5\x1f\x98",
+		.len	= 375,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 375 - 20, 4, 16 },
+
+	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+		to nonce, and recomputed the ciphertext with libsodium */
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\x76\x5a\x2e\x63"
+			  "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7"
+			  "\x2a\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x27\x54\x77\x61\x73\x20\x62\x72"
+			  "\x69\x6c\x6c\x69\x67\x2c\x20\x61"
+			  "\x6e\x64\x20\x74\x68\x65\x20\x73"
+			  "\x6c\x69\x74\x68\x79\x20\x74\x6f"
+			  "\x76\x65\x73\x0a\x44\x69\x64\x20"
+			  "\x67\x79\x72\x65\x20\x61\x6e\x64"
+			  "\x20\x67\x69\x6d\x62\x6c\x65\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x77"
+			  "\x61\x62\x65\x3a\x0a\x41\x6c\x6c"
+			  "\x20\x6d\x69\x6d\x73\x79\x20\x77"
+			  "\x65\x72\x65\x20\x74\x68\x65\x20"
+			  "\x62\x6f\x72\x6f\x67\x6f\x76\x65"
+			  "\x73\x2c\x0a\x41\x6e\x64\x20\x74"
+			  "\x68\x65\x20\x6d\x6f\x6d\x65\x20"
+			  "\x72\x61\x74\x68\x73\x20\x6f\x75"
+			  "\x74\x67\x72\x61\x62\x65\x2e",
+		.ctext	= "\x95\xb9\x51\xe7\x8f\xb4\xa4\x03"
+			  "\xca\x37\xcc\xde\x60\x1d\x8c\xe2"
+			  "\xf1\xbb\x8a\x13\x7f\x61\x85\xcc"
+			  "\xad\xf4\xf0\xdc\x86\xa6\x1e\x10"
+			  "\xbc\x8e\xcb\x38\x2b\xa5\xc8\x8f"
+			  "\xaa\x03\x3d\x53\x4a\x42\xb1\x33"
+			  "\xfc\xd3\xef\xf0\x8e\x7e\x10\x9c"
+			  "\x6f\x12\x5e\xd4\x96\xfe\x5b\x08"
+			  "\xb6\x48\xf0\x14\x74\x51\x18\x7c"
+			  "\x07\x92\xfc\xac\x9d\xf1\x94\xc0"
+			  "\xc1\x9d\xc5\x19\x43\x1f\x1d\xbb"
+			  "\x07\xf0\x1b\x14\x25\x45\xbb\xcb"
+			  "\x5c\xe2\x8b\x28\xf3\xcf\x47\x29"
+			  "\x27\x79\x67\x24\xa6\x87\xc2\x11"
+			  "\x65\x03\xfa\x45\xf7\x9e\x53\x7a"
+			  "\x99\xf1\x82\x25\x4f\x8d\x07",
+		.len	= 127,
+	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
+		to nonce, and recomputed the ciphertext with libsodium */
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x01\x31\x58\xa3\x5a"
+			  "\x25\x5d\x05\x17\x58\xe9\x5e\xd4"
+			  "\x1c\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x49\xee\xe0\xdc\x24\x90\x40\xcd"
+			  "\xc5\x40\x8f\x47\x05\xbc\xdd\x81"
+			  "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb"
+			  "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8"
+			  "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4"
+			  "\x19\x4b\x01\x0f\x4e\xa4\x43\xce"
+			  "\x01\xc6\x67\xda\x03\x91\x18\x90"
+			  "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac"
+			  "\x74\x92\xd3\x53\x47\xc8\xdd\x25"
+			  "\x53\x6c\x02\x03\x87\x0d\x11\x0c"
+			  "\x58\xe3\x12\x18\xfd\x2a\x5b\x40"
+			  "\x0c\x30\xf0\xb8\x3f\x43\xce\xae"
+			  "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc"
+			  "\x33\x97\xc3\x77\xba\xc5\x70\xde"
+			  "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f"
+			  "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c"
+			  "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c"
+			  "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe"
+			  "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6"
+			  "\x79\x49\x41\xf4\x58\x18\xcb\x86"
+			  "\x7f\x30\x0e\xf8\x7d\x44\x36\xea"
+			  "\x75\xeb\x88\x84\x40\x3c\xad\x4f"
+			  "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5"
+			  "\x21\x66\xe9\xa7\xe3\xb2\x15\x88"
+			  "\x78\xf6\x79\xa1\x59\x47\x12\x4e"
+			  "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08"
+			  "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b"
+			  "\xdd\x60\x71\xf7\x47\x8c\x61\xc3"
+			  "\xda\x8a\x78\x1e\x16\xfa\x1e\x86"
+			  "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7"
+			  "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4"
+			  "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6"
+			  "\xf4\xe6\x33\x43\x84\x93\xa5\x67"
+			  "\x9b\x16\x58\x58\x80\x0f\x2b\x5c"
+			  "\x24\x74\x75\x7f\x95\x81\xb7\x30"
+			  "\x7a\x33\xa7\xf7\x94\x87\x32\x27"
+			  "\x10\x5d\x14\x4c\x43\x29\xdd\x26"
+			  "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10"
+			  "\xea\x6b\x64\xfd\x73\xc6\xed\xec"
+			  "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07"
+			  "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5"
+			  "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1"
+			  "\xec\xca\x60\x09\x4c\x6a\xd5\x09"
+			  "\x49\x46\x00\x88\x22\x8d\xce\xea"
+			  "\xb1\x17\x11\xde\x42\xd2\x23\xc1"
+			  "\x72\x11\xf5\x50\x73\x04\x40\x47"
+			  "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0"
+			  "\x3f\x58\xc1\x52\xab\x12\x67\x9d"
+			  "\x3f\x43\x4b\x68\xd4\x9c\x68\x38"
+			  "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b"
+			  "\xf9\xe5\x31\x69\x22\xf9\xa6\x69"
+			  "\xc6\x9c\x96\x9a\x12\x35\x95\x1d"
+			  "\x95\xd5\xdd\xbe\xbf\x93\x53\x24"
+			  "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00"
+			  "\x6f\x88\xc4\x37\x18\x69\x7c\xd7"
+			  "\x41\x92\x55\x4c\x03\xa1\x9a\x4b"
+			  "\x15\xe5\xdf\x7f\x37\x33\x72\xc1"
+			  "\x8b\x10\x67\xa3\x01\x57\x94\x25"
+			  "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73"
+			  "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda"
+			  "\x58\xb1\x47\x90\xfe\x42\x21\x72"
+			  "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd"
+			  "\xc6\x84\x6e\xca\xae\xe3\x68\xb4"
+			  "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b"
+			  "\x03\xa1\x31\xd9\xde\x8d\xf5\x22"
+			  "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76"
+			  "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe"
+			  "\x54\xf7\x27\x1b\xf4\xde\x02\xf5"
+			  "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e"
+			  "\x4b\x6e\xed\x46\x23\xdc\x65\xb2"
+			  "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7"
+			  "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8"
+			  "\x65\x69\x8a\x45\x29\xef\x74\x85"
+			  "\xde\x79\xc7\x08\xae\x30\xb0\xf4"
+			  "\xa3\x1d\x51\x41\xab\xce\xcb\xf6"
+			  "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3"
+			  "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7"
+			  "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9"
+			  "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a"
+			  "\x9f\xd7\xb9\x6c\x65\x14\x22\x45"
+			  "\x6e\x45\x32\x3e\x7e\x60\x1a\x12"
+			  "\x97\x82\x14\xfb\xaa\x04\x22\xfa"
+			  "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d"
+			  "\x78\x33\x5a\x7c\xad\xdb\x29\xce"
+			  "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac"
+			  "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21"
+			  "\x83\x35\x7e\xad\x73\xc2\xb5\x6c"
+			  "\x10\x26\x38\x07\xe5\xc7\x36\x80"
+			  "\xe2\x23\x12\x61\xf5\x48\x4b\x2b"
+			  "\xc5\xdf\x15\xd9\x87\x01\xaa\xac"
+			  "\x1e\x7c\xad\x73\x78\x18\x63\xe0"
+			  "\x8b\x9f\x81\xd8\x12\x6a\x28\x10"
+			  "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c"
+			  "\x83\x66\x80\x47\x80\xe8\xfd\x35"
+			  "\x1c\x97\x6f\xae\x49\x10\x66\xcc"
+			  "\xc6\xd8\xcc\x3a\x84\x91\x20\x77"
+			  "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9"
+			  "\x25\x94\x10\x5f\x40\x00\x64\x99"
+			  "\xdc\xae\xd7\x21\x09\x78\x50\x15"
+			  "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39"
+			  "\x87\x6e\x6d\xab\xde\x08\x51\x16"
+			  "\xc7\x13\xe9\xea\xed\x06\x8e\x2c"
+			  "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43"
+			  "\xb6\x98\x37\xb2\x43\xed\xde\xdf"
+			  "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b"
+			  "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9"
+			  "\x80\x55\xc9\x34\x91\xd1\x59\xe8"
+			  "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06"
+			  "\x20\xa8\x5d\xfa\xd1\xde\x70\x56"
+			  "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8"
+			  "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5"
+			  "\x44\x4b\x9f\xc2\x93\x03\xea\x2b"
+			  "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23"
+			  "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee"
+			  "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab"
+			  "\x82\x6b\x37\x04\xeb\x74\xbe\x79"
+			  "\xb9\x83\x90\xef\x20\x59\x46\xff"
+			  "\xe9\x97\x3e\x2f\xee\xb6\x64\x18"
+			  "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a"
+			  "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4"
+			  "\xce\xd3\x91\x49\x88\xc7\xb8\x4d"
+			  "\xb1\xb9\x07\x6d\x16\x72\xae\x46"
+			  "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8"
+			  "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62"
+			  "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9"
+			  "\x94\x97\xea\xdd\x58\x9e\xae\x76"
+			  "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde"
+			  "\xf7\x32\x87\xcd\x93\xbf\x11\x56"
+			  "\x11\xbe\x08\x74\xe1\x69\xad\xe2"
+			  "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe"
+			  "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76"
+			  "\x35\xea\x5d\x85\x81\xaf\x85\xeb"
+			  "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc"
+			  "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a"
+			  "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9"
+			  "\x37\x1c\xeb\x46\x54\x3f\xa5\x91"
+			  "\xc2\xb5\x8c\xfe\x53\x08\x97\x32"
+			  "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc"
+			  "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1"
+			  "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c"
+			  "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd"
+			  "\x20\x4e\x7c\x51\xb0\x60\x73\xb8"
+			  "\x9c\xac\x91\x90\x7e\x01\xb0\xe1"
+			  "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a"
+			  "\x06\x52\x95\x52\xb2\xe9\x25\x2e"
+			  "\x4c\xe2\x5a\x00\xb2\x13\x81\x03"
+			  "\x77\x66\x0d\xa5\x99\xda\x4e\x8c"
+			  "\xac\xf3\x13\x53\x27\x45\xaf\x64"
+			  "\x46\xdc\xea\x23\xda\x97\xd1\xab"
+			  "\x7d\x6c\x30\x96\x1f\xbc\x06\x34"
+			  "\x18\x0b\x5e\x21\x35\x11\x8d\x4c"
+			  "\xe0\x2d\xe9\x50\x16\x74\x81\xa8"
+			  "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc"
+			  "\xca\x34\x83\x27\x10\x5b\x68\x45"
+			  "\x8f\x52\x22\x0c\x55\x3d\x29\x7c"
+			  "\xe3\xc0\x66\x05\x42\x91\x5f\x58"
+			  "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19"
+			  "\x04\xa9\x08\x4b\x57\xfc\x67\x53"
+			  "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f"
+			  "\x92\xd6\x41\x7c\x5b\x2a\x00\x79"
+			  "\x72",
+		.ctext	= "\x3a\x92\xee\x53\x31\xaf\x2b\x60"
+			  "\x5f\x55\x8d\x00\x5d\xfc\x74\x97"
+			  "\x28\x54\xf4\xa5\x75\xf1\x9b\x25"
+			  "\x62\x1c\xc0\xe0\x13\xc8\x87\x53"
+			  "\xd0\xf3\xa7\x97\x1f\x3b\x1e\xea"
+			  "\xe0\xe5\x2a\xd1\xdd\xa4\x3b\x50"
+			  "\x45\xa3\x0d\x7e\x1b\xc9\xa0\xad"
+			  "\xb9\x2c\x54\xa6\xc7\x55\x16\xd0"
+			  "\xc5\x2e\x02\x44\x35\xd0\x7e\x67"
+			  "\xf2\xc4\x9b\xcd\x95\x10\xcc\x29"
+			  "\x4b\xfa\x86\x87\xbe\x40\x36\xbe"
+			  "\xe1\xa3\x52\x89\x55\x20\x9b\xc2"
+			  "\xab\xf2\x31\x34\x16\xad\xc8\x17"
+			  "\x65\x24\xc0\xff\x12\x37\xfe\x5a"
+			  "\x62\x3b\x59\x47\x6c\x5f\x3a\x8e"
+			  "\x3b\xd9\x30\xc8\x7f\x2f\x88\xda"
+			  "\x80\xfd\x02\xda\x7f\x9a\x7a\x73"
+			  "\x59\xc5\x34\x09\x9a\x11\xcb\xa7"
+			  "\xfc\xf6\xa1\xa0\x60\xfb\x43\xbb"
+			  "\xf1\xe9\xd7\xc6\x79\x27\x4e\xff"
+			  "\x22\xb4\x24\xbf\x76\xee\x47\xb9"
+			  "\x6d\x3f\x8b\xb0\x9c\x3c\x43\xdd"
+			  "\xff\x25\x2e\x6d\xa4\x2b\xfb\x5d"
+			  "\x1b\x97\x6c\x55\x0a\x82\x7a\x7b"
+			  "\x94\x34\xc2\xdb\x2f\x1f\xc1\xea"
+			  "\xd4\x4d\x17\x46\x3b\x51\x69\x09"
+			  "\xe4\x99\x32\x25\xfd\x94\xaf\xfb"
+			  "\x10\xf7\x4f\xdd\x0b\x3c\x8b\x41"
+			  "\xb3\x6a\xb7\xd1\x33\xa8\x0c\x2f"
+			  "\x62\x4c\x72\x11\xd7\x74\xe1\x3b"
+			  "\x38\x43\x66\x7b\x6c\x36\x48\xe7"
+			  "\xe3\xe7\x9d\xb9\x42\x73\x7a\x2a"
+			  "\x89\x20\x1a\x41\x80\x03\xf7\x8f"
+			  "\x61\x78\x13\xbf\xfe\x50\xf5\x04"
+			  "\x52\xf9\xac\x47\xf8\x62\x4b\xb2"
+			  "\x24\xa9\xbf\x64\xb0\x18\x69\xd2"
+			  "\xf5\xe4\xce\xc8\xb1\x87\x75\xd6"
+			  "\x2c\x24\x79\x00\x7d\x26\xfb\x44"
+			  "\xe7\x45\x7a\xee\x58\xa5\x83\xc1"
+			  "\xb4\x24\xab\x23\x2f\x4d\xd7\x4f"
+			  "\x1c\xc7\xaa\xa9\x50\xf4\xa3\x07"
+			  "\x12\x13\x89\x74\xdc\x31\x6a\xb2"
+			  "\xf5\x0f\x13\x8b\xb9\xdb\x85\x1f"
+			  "\xf5\xbc\x88\xd9\x95\xea\x31\x6c"
+			  "\x36\x60\xb6\x49\xdc\xc4\xf7\x55"
+			  "\x3f\x21\xc1\xb5\x92\x18\x5e\xbc"
+			  "\x9f\x87\x7f\xe7\x79\x25\x40\x33"
+			  "\xd6\xb9\x33\xd5\x50\xb3\xc7\x89"
+			  "\x1b\x12\xa0\x46\xdd\xa7\xd8\x3e"
+			  "\x71\xeb\x6f\x66\xa1\x26\x0c\x67"
+			  "\xab\xb2\x38\x58\x17\xd8\x44\x3b"
+			  "\x16\xf0\x8e\x62\x8d\x16\x10\x00"
+			  "\x32\x8b\xef\xb9\x28\xd3\xc5\xad"
+			  "\x0a\x19\xa2\xe4\x03\x27\x7d\x94"
+			  "\x06\x18\xcd\xd6\x27\x00\xf9\x1f"
+			  "\xb6\xb3\xfe\x96\x35\x5f\xc4\x1c"
+			  "\x07\x62\x10\x79\x68\x50\xf1\x7e"
+			  "\x29\xe7\xc4\xc4\xe7\xee\x54\xd6"
+			  "\x58\x76\x84\x6d\x8d\xe4\x59\x31"
+			  "\xe9\xf4\xdc\xa1\x1f\xe5\x1a\xd6"
+			  "\xe6\x64\x46\xf5\x77\x9c\x60\x7a"
+			  "\x5e\x62\xe3\x0a\xd4\x9f\x7a\x2d"
+			  "\x7a\xa5\x0a\x7b\x29\x86\x7a\x74"
+			  "\x74\x71\x6b\xca\x7d\x1d\xaa\xba"
+			  "\x39\x84\x43\x76\x35\xfe\x4f\x9b"
+			  "\xbb\xbb\xb5\x6a\x32\xb5\x5d\x41"
+			  "\x51\xf0\x5b\x68\x03\x47\x4b\x8a"
+			  "\xca\x88\xf6\x37\xbd\x73\x51\x70"
+			  "\x66\xfe\x9e\x5f\x21\x9c\xf3\xdd"
+			  "\xc3\xea\x27\xf9\x64\x94\xe1\x19"
+			  "\xa0\xa9\xab\x60\xe0\x0e\xf7\x78"
+			  "\x70\x86\xeb\xe0\xd1\x5c\x05\xd3"
+			  "\xd7\xca\xe0\xc0\x47\x47\x34\xee"
+			  "\x11\xa3\xa3\x54\x98\xb7\x49\x8e"
+			  "\x84\x28\x70\x2c\x9e\xfb\x55\x54"
+			  "\x4d\xf8\x86\xf7\x85\x7c\xbd\xf3"
+			  "\x17\xd8\x47\xcb\xac\xf4\x20\x85"
+			  "\x34\x66\xad\x37\x2d\x5e\x52\xda"
+			  "\x8a\xfe\x98\x55\x30\xe7\x2d\x2b"
+			  "\x19\x10\x8e\x7b\x66\x5e\xdc\xe0"
+			  "\x45\x1f\x7b\xb4\x08\xfb\x8f\xf6"
+			  "\x8c\x89\x21\x34\x55\x27\xb2\x76"
+			  "\xb2\x07\xd9\xd6\x68\x9b\xea\x6b"
+			  "\x2d\xb4\xc4\x35\xdd\xd2\x79\xae"
+			  "\xc7\xd6\x26\x7f\x12\x01\x8c\xa7"
+			  "\xe3\xdb\xa8\xf4\xf7\x2b\xec\x99"
+			  "\x11\x00\xf1\x35\x8c\xcf\xd5\xc9"
+			  "\xbd\x91\x36\x39\x70\xcf\x7d\x70"
+			  "\x47\x1a\xfc\x6b\x56\xe0\x3f\x9c"
+			  "\x60\x49\x01\x72\xa9\xaf\x2c\x9c"
+			  "\xe8\xab\xda\x8c\x14\x19\xf3\x75"
+			  "\x07\x17\x9d\x44\x67\x7a\x2e\xef"
+			  "\xb7\x83\x35\x4a\xd1\x3d\x1c\x84"
+			  "\x32\xdd\xaa\xea\xca\x1d\xdc\x72"
+			  "\x2c\xcc\x43\xcd\x5d\xe3\x21\xa4"
+			  "\xd0\x8a\x4b\x20\x12\xa3\xd5\x86"
+			  "\x76\x96\xff\x5f\x04\x57\x0f\xe6"
+			  "\xba\xe8\x76\x50\x0c\x64\x1d\x83"
+			  "\x9c\x9b\x9a\x9a\x58\x97\x9c\x5c"
+			  "\xb4\xa4\xa6\x3e\x19\xeb\x8f\x5a"
+			  "\x61\xb2\x03\x7b\x35\x19\xbe\xa7"
+			  "\x63\x0c\xfd\xdd\xf9\x90\x6c\x08"
+			  "\x19\x11\xd3\x65\x4a\xf5\x96\x92"
+			  "\x59\xaa\x9c\x61\x0c\x29\xa7\xf8"
+			  "\x14\x39\x37\xbf\x3c\xf2\x16\x72"
+			  "\x02\xfa\xa2\xf3\x18\x67\x5d\xcb"
+			  "\xdc\x4d\xbb\x96\xff\x70\x08\x2d"
+			  "\xc2\xa8\x52\xe1\x34\x5f\x72\xfe"
+			  "\x64\xbf\xca\xa7\x74\x38\xfb\x74"
+			  "\x55\x9c\xfa\x8a\xed\xfb\x98\xeb"
+			  "\x58\x2e\x6c\xe1\x52\x76\x86\xd7"
+			  "\xcf\xa1\xa4\xfc\xb2\x47\x41\x28"
+			  "\xa3\xc1\xe5\xfd\x53\x19\x28\x2b"
+			  "\x37\x04\x65\x96\x99\x7a\x28\x0f"
+			  "\x07\x68\x4b\xc7\x52\x0a\x55\x35"
+			  "\x40\x19\x95\x61\xe8\x59\x40\x1f"
+			  "\x9d\xbf\x78\x7d\x8f\x84\xff\x6f"
+			  "\xd0\xd5\x63\xd2\x22\xbd\xc8\x4e"
+			  "\xfb\xe7\x9f\x06\xe6\xe7\x39\x6d"
+			  "\x6a\x96\x9f\xf0\x74\x7e\xc9\x35"
+			  "\xb7\x26\xb8\x1c\x0a\xa6\x27\x2c"
+			  "\xa2\x2b\xfe\xbe\x0f\x07\x73\xae"
+			  "\x7f\x7f\x54\xf5\x7c\x6a\x0a\x56"
+			  "\x49\xd4\x81\xe5\x85\x53\x99\x1f"
+			  "\x95\x05\x13\x58\x8d\x0e\x1b\x90"
+			  "\xc3\x75\x48\x64\x58\x98\x67\x84"
+			  "\xae\xe2\x21\xa2\x8a\x04\x0a\x0b"
+			  "\x61\xaa\xb0\xd4\x28\x60\x7a\xf8"
+			  "\xbc\x52\xfb\x24\x7f\xed\x0d\x2a"
+			  "\x0a\xb2\xf9\xc6\x95\xb5\x11\xc9"
+			  "\xf4\x0f\x26\x11\xcf\x2a\x57\x87"
+			  "\x7a\xf3\xe7\x94\x65\xc2\xb5\xb3"
+			  "\xab\x98\xe3\xc1\x2b\x59\x19\x7c"
+			  "\xd6\xf3\xf9\xbf\xff\x6d\xc6\x82"
+			  "\x13\x2f\x4a\x2e\xcd\x26\xfe\x2d"
+			  "\x01\x70\xf4\xc2\x7f\x1f\x4c\xcb"
+			  "\x47\x77\x0c\xa0\xa3\x03\xec\xda"
+			  "\xa9\xbf\x0d\x2d\xae\xe4\xb8\x7b"
+			  "\xa9\xbc\x08\xb4\x68\x2e\xc5\x60"
+			  "\x8d\x87\x41\x2b\x0f\x69\xf0\xaf"
+			  "\x5f\xba\x72\x20\x0f\x33\xcd\x6d"
+			  "\x36\x7d\x7b\xd5\x05\xf1\x4b\x05"
+			  "\xc4\xfc\x7f\x80\xb9\x4d\xbd\xf7"
+			  "\x7c\x84\x07\x01\xc2\x40\x66\x5b"
+			  "\x98\xc7\x2c\xe3\x97\xfa\xdf\x87"
+			  "\xa0\x1f\xe9\x21\x42\x0f\x3b\xeb"
+			  "\x89\x1c\x3b\xca\x83\x61\x77\x68"
+			  "\x84\xbb\x60\x87\x38\x2e\x25\xd5"
+			  "\x9e\x04\x41\x70\xac\xda\xc0\x9c"
+			  "\x9c\x69\xea\x8d\x4e\x55\x2a\x29"
+			  "\xed\x05\x4b\x7b\x73\x71\x90\x59"
+			  "\x4d\xc8\xd8\x44\xf0\x4c\xe1\x5e"
+			  "\x84\x47\x55\xcc\x32\x3f\xe7\x97"
+			  "\x42\xc6\x32\xac\x40\xe5\xa5\xc7"
+			  "\x8b\xed\xdb\xf7\x83\xd6\xb1\xc2"
+			  "\x52\x5e\x34\xb7\xeb\x6e\xd9\xfc"
+			  "\xe5\x93\x9a\x97\x3e\xb0\xdc\xd9"
+			  "\xd7\x06\x10\xb6\x1d\x80\x59\xdd"
+			  "\x0d\xfe\x64\x35\xcd\x5d\xec\xf0"
+			  "\xba\xd0\x34\xc9\x2d\x91\xc5\x17"
+			  "\x11",
+		.len	= 1281,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 1200, 1, 80 },
+	},
+};
+
 /*
  * CTS (Cipher Text Stealing) mode tests
  */
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index 56073814eef0..c24b4ac03b85 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha20.h
@@ -1,6 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * Common values for the ChaCha20 algorithm
+ * Common values and helper functions for the ChaCha20 and XChaCha20 algorithms.
+ *
+ * XChaCha20 extends ChaCha20's nonce to 192 bits, while provably retaining
+ * ChaCha20's security.  Here they share the same key size, tfm context, and
+ * setkey function; only their IV size and encrypt/decrypt function differ.
  */
 
 #ifndef _CRYPTO_CHACHA20_H
@@ -10,11 +14,16 @@
 #include <linux/types.h>
 #include <linux/crypto.h>
 
+/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
 #define CHACHA20_IV_SIZE	16
+
 #define CHACHA20_KEY_SIZE	32
 #define CHACHA20_BLOCK_SIZE	64
 #define CHACHAPOLY_IV_SIZE	12
 
+/* 192-bit nonce, then 64-bit stream position */
+#define XCHACHA20_IV_SIZE	32
+
 struct chacha20_ctx {
 	u32 key[8];
 };
@@ -23,8 +32,11 @@ void chacha20_block(u32 *state, u8 *stream);
 void hchacha20_block(const u32 *in, u32 *out);
 
 void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
+
 int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 			   unsigned int keysize);
+
 int crypto_chacha20_crypt(struct skcipher_request *req);
+int crypto_xchacha20_crypt(struct skcipher_request *req);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 1ca1b917940c24ca3d1f490118c5474168622953 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:21 -0800
Subject: crypto: chacha20-generic - refactor to allow varying number of rounds

In preparation for adding XChaCha12 support, rename/refactor
chacha20-generic to support different numbers of rounds.  The
justification for needing XChaCha12 support is explained in more detail
in the patch "crypto: chacha - add XChaCha12 support".

The only difference between ChaCha{8,12,20} are the number of rounds
itself; all other parts of the algorithm are the same.  Therefore,
remove the "20" from all definitions, structures, functions, files, etc.
that will be shared by all ChaCha versions.

Also make ->setkey() store the round count in the chacha_ctx (previously
chacha20_ctx).  The generic code then passes the round count through to
chacha_block().  There will be a ->setkey() function for each explicitly
allowed round count; the encrypt/decrypt functions will be the same.  I
decided not to do it the opposite way (same ->setkey() function for all
round counts, with different encrypt/decrypt functions) because that
would have required more boilerplate code in architecture-specific
implementations of ChaCha and XChaCha.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/chacha20-neon-glue.c   |  40 +++----
 arch/arm64/crypto/chacha20-neon-glue.c |  40 +++----
 arch/x86/crypto/chacha20_glue.c        |  48 ++++----
 crypto/Makefile                        |   2 +-
 crypto/chacha20_generic.c              | 185 -------------------------------
 crypto/chacha20poly1305.c              |  10 +-
 crypto/chacha_generic.c                | 193 +++++++++++++++++++++++++++++++++
 drivers/char/random.c                  |  51 +++++----
 drivers/crypto/caam/caamalg.c          |   2 +-
 drivers/crypto/caam/caamalg_qi2.c      |   8 +-
 drivers/crypto/caam/compat.h           |   2 +-
 include/crypto/chacha.h                |  47 ++++++++
 include/crypto/chacha20.h              |  42 -------
 lib/Makefile                           |   2 +-
 lib/chacha.c                           | 117 ++++++++++++++++++++
 lib/chacha20.c                         | 112 -------------------
 16 files changed, 459 insertions(+), 442 deletions(-)
 delete mode 100644 crypto/chacha20_generic.c
 create mode 100644 crypto/chacha_generic.c
 create mode 100644 include/crypto/chacha.h
 delete mode 100644 include/crypto/chacha20.h
 create mode 100644 lib/chacha.c
 delete mode 100644 lib/chacha20.c

diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
index 59a7be08e80c..7386eb1c1889 100644
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ b/arch/arm/crypto/chacha20-neon-glue.c
@@ -19,7 +19,7 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -34,20 +34,20 @@ asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
 static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
-	u8 buf[CHACHA20_BLOCK_SIZE];
+	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
 		chacha20_4block_xor_neon(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
+	while (bytes >= CHACHA_BLOCK_SIZE) {
 		chacha20_block_xor_neon(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
+		bytes -= CHACHA_BLOCK_SIZE;
+		src += CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
 		state[12]++;
 	}
 	if (bytes) {
@@ -60,17 +60,17 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 static int chacha20_neon(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	struct skcipher_walk walk;
 	u32 state[16];
 	int err;
 
-	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(req);
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
 
 	err = skcipher_walk_virt(&walk, req, true);
 
-	crypto_chacha20_init(state, ctx, walk.iv);
+	crypto_chacha_init(state, ctx, walk.iv);
 
 	kernel_neon_begin();
 	while (walk.nbytes > 0) {
@@ -93,14 +93,14 @@ static struct skcipher_alg alg = {
 	.base.cra_driver_name	= "chacha20-neon",
 	.base.cra_priority	= 300,
 	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
 	.base.cra_module	= THIS_MODULE,
 
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
+	.min_keysize		= CHACHA_KEY_SIZE,
+	.max_keysize		= CHACHA_KEY_SIZE,
+	.ivsize			= CHACHA_IV_SIZE,
+	.chunksize		= CHACHA_BLOCK_SIZE,
+	.walksize		= 4 * CHACHA_BLOCK_SIZE,
 	.setkey			= crypto_chacha20_setkey,
 	.encrypt		= chacha20_neon,
 	.decrypt		= chacha20_neon,
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
index 727579c93ded..96e0cfb8c3f5 100644
--- a/arch/arm64/crypto/chacha20-neon-glue.c
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -19,7 +19,7 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -34,15 +34,15 @@ asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
 static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
-	u8 buf[CHACHA20_BLOCK_SIZE];
+	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
 		kernel_neon_begin();
 		chacha20_4block_xor_neon(state, dst, src);
 		kernel_neon_end();
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
 
@@ -50,11 +50,11 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 		return;
 
 	kernel_neon_begin();
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
+	while (bytes >= CHACHA_BLOCK_SIZE) {
 		chacha20_block_xor_neon(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
+		bytes -= CHACHA_BLOCK_SIZE;
+		src += CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
 		state[12]++;
 	}
 	if (bytes) {
@@ -68,17 +68,17 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 static int chacha20_neon(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	struct skcipher_walk walk;
 	u32 state[16];
 	int err;
 
-	if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE)
-		return crypto_chacha20_crypt(req);
+	if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
+		return crypto_chacha_crypt(req);
 
 	err = skcipher_walk_virt(&walk, req, false);
 
-	crypto_chacha20_init(state, ctx, walk.iv);
+	crypto_chacha_init(state, ctx, walk.iv);
 
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
@@ -99,14 +99,14 @@ static struct skcipher_alg alg = {
 	.base.cra_driver_name	= "chacha20-neon",
 	.base.cra_priority	= 300,
 	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
 	.base.cra_module	= THIS_MODULE,
 
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
+	.min_keysize		= CHACHA_KEY_SIZE,
+	.max_keysize		= CHACHA_KEY_SIZE,
+	.ivsize			= CHACHA_IV_SIZE,
+	.chunksize		= CHACHA_BLOCK_SIZE,
+	.walksize		= 4 * CHACHA_BLOCK_SIZE,
 	.setkey			= crypto_chacha20_setkey,
 	.encrypt		= chacha20_neon,
 	.decrypt		= chacha20_neon,
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 9fd84fe6ec09..1e9e66509226 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -10,7 +10,7 @@
  */
 
 #include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -35,8 +35,8 @@ static bool chacha20_use_avx2;
 
 static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
 {
-	len = min(len, maxblocks * CHACHA20_BLOCK_SIZE);
-	return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
 }
 
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
@@ -44,38 +44,38 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 {
 #ifdef CONFIG_AS_AVX2
 	if (chacha20_use_avx2) {
-		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha20_8block_xor_avx2(state, dst, src, bytes);
-			bytes -= CHACHA20_BLOCK_SIZE * 8;
-			src += CHACHA20_BLOCK_SIZE * 8;
-			dst += CHACHA20_BLOCK_SIZE * 8;
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
 			state[12] += 8;
 		}
-		if (bytes > CHACHA20_BLOCK_SIZE * 4) {
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
 			chacha20_8block_xor_avx2(state, dst, src, bytes);
 			state[12] += chacha20_advance(bytes, 8);
 			return;
 		}
-		if (bytes > CHACHA20_BLOCK_SIZE * 2) {
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
 			chacha20_4block_xor_avx2(state, dst, src, bytes);
 			state[12] += chacha20_advance(bytes, 4);
 			return;
 		}
-		if (bytes > CHACHA20_BLOCK_SIZE) {
+		if (bytes > CHACHA_BLOCK_SIZE) {
 			chacha20_2block_xor_avx2(state, dst, src, bytes);
 			state[12] += chacha20_advance(bytes, 2);
 			return;
 		}
 	}
 #endif
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
 		chacha20_4block_xor_ssse3(state, dst, src, bytes);
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-	if (bytes > CHACHA20_BLOCK_SIZE) {
+	if (bytes > CHACHA_BLOCK_SIZE) {
 		chacha20_4block_xor_ssse3(state, dst, src, bytes);
 		state[12] += chacha20_advance(bytes, 4);
 		return;
@@ -89,7 +89,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 static int chacha20_simd(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	u32 *state, state_buf[16 + 2] __aligned(8);
 	struct skcipher_walk walk;
 	int err;
@@ -97,12 +97,12 @@ static int chacha20_simd(struct skcipher_request *req)
 	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
 	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
 
-	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(req);
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
 
 	err = skcipher_walk_virt(&walk, req, true);
 
-	crypto_chacha20_init(state, ctx, walk.iv);
+	crypto_chacha_init(state, ctx, walk.iv);
 
 	kernel_fpu_begin();
 
@@ -128,13 +128,13 @@ static struct skcipher_alg alg = {
 	.base.cra_driver_name	= "chacha20-simd",
 	.base.cra_priority	= 300,
 	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
 	.base.cra_module	= THIS_MODULE,
 
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.min_keysize		= CHACHA_KEY_SIZE,
+	.max_keysize		= CHACHA_KEY_SIZE,
+	.ivsize			= CHACHA_IV_SIZE,
+	.chunksize		= CHACHA_BLOCK_SIZE,
 	.setkey			= crypto_chacha20_setkey,
 	.encrypt		= chacha20_simd,
 	.decrypt		= chacha20_simd,
diff --git a/crypto/Makefile b/crypto/Makefile
index abbd86fdbad2..102e8525814f 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -117,7 +117,7 @@ obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o
 obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
 obj-$(CONFIG_CRYPTO_SEED) += seed.o
 obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
-obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o
+obj-$(CONFIG_CRYPTO_CHACHA20) += chacha_generic.o
 obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o
 obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
 obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
deleted file mode 100644
index 4305b1b62b16..000000000000
--- a/crypto/chacha20_generic.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms
- *
- * Copyright (C) 2015 Martin Willi
- * Copyright (C) 2018 Google LLC
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <asm/unaligned.h>
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/module.h>
-
-static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
-			     unsigned int bytes)
-{
-	/* aligned to potentially speed up crypto_xor() */
-	u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long));
-
-	if (dst != src)
-		memcpy(dst, src, bytes);
-
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block(state, stream);
-		crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-	}
-	if (bytes) {
-		chacha20_block(state, stream);
-		crypto_xor(dst, stream, bytes);
-	}
-}
-
-static int chacha20_stream_xor(struct skcipher_request *req,
-			       struct chacha20_ctx *ctx, u8 *iv)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	crypto_chacha20_init(state, ctx, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
-				 nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
-{
-	state[0]  = 0x61707865; /* "expa" */
-	state[1]  = 0x3320646e; /* "nd 3" */
-	state[2]  = 0x79622d32; /* "2-by" */
-	state[3]  = 0x6b206574; /* "te k" */
-	state[4]  = ctx->key[0];
-	state[5]  = ctx->key[1];
-	state[6]  = ctx->key[2];
-	state[7]  = ctx->key[3];
-	state[8]  = ctx->key[4];
-	state[9]  = ctx->key[5];
-	state[10] = ctx->key[6];
-	state[11] = ctx->key[7];
-	state[12] = get_unaligned_le32(iv +  0);
-	state[13] = get_unaligned_le32(iv +  4);
-	state[14] = get_unaligned_le32(iv +  8);
-	state[15] = get_unaligned_le32(iv + 12);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_init);
-
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			   unsigned int keysize)
-{
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int i;
-
-	if (keysize != CHACHA20_KEY_SIZE)
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
-		ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
-
-int crypto_chacha20_crypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return chacha20_stream_xor(req, ctx, req->iv);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
-
-int crypto_xchacha20_crypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha20_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	/* Compute the subkey given the original key and first 128 nonce bits */
-	crypto_chacha20_init(state, ctx, req->iv);
-	hchacha20_block(state, subctx.key);
-
-	/* Build the real IV */
-	memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */
-	memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */
-
-	/* Generate the stream and XOR it with the data */
-	return chacha20_stream_xor(req, &subctx, real_iv);
-}
-EXPORT_SYMBOL_GPL(crypto_xchacha20_crypt);
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-generic",
-		.base.cra_priority	= 100,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA20_KEY_SIZE,
-		.max_keysize		= CHACHA20_KEY_SIZE,
-		.ivsize			= CHACHA20_IV_SIZE,
-		.chunksize		= CHACHA20_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= crypto_chacha20_crypt,
-		.decrypt		= crypto_chacha20_crypt,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-generic",
-		.base.cra_priority	= 100,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA20_KEY_SIZE,
-		.max_keysize		= CHACHA20_KEY_SIZE,
-		.ivsize			= XCHACHA20_IV_SIZE,
-		.chunksize		= CHACHA20_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= crypto_xchacha20_crypt,
-		.decrypt		= crypto_xchacha20_crypt,
-	}
-};
-
-static int __init chacha20_generic_mod_init(void)
-{
-	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha20_generic_mod_fini(void)
-{
-	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha20_generic_mod_init);
-module_exit(chacha20_generic_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha20 and XChaCha20 stream ciphers (generic)");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-generic");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-generic");
diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index f9dd5453046a..fef11446ab1b 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -13,7 +13,7 @@
 #include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include <crypto/poly1305.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -49,7 +49,7 @@ struct poly_req {
 };
 
 struct chacha_req {
-	u8 iv[CHACHA20_IV_SIZE];
+	u8 iv[CHACHA_IV_SIZE];
 	struct scatterlist src[1];
 	struct skcipher_request req; /* must be last member */
 };
@@ -89,7 +89,7 @@ static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb)
 	memcpy(iv, &leicb, sizeof(leicb));
 	memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen);
 	memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv,
-	       CHACHA20_IV_SIZE - sizeof(leicb) - ctx->saltlen);
+	       CHACHA_IV_SIZE - sizeof(leicb) - ctx->saltlen);
 }
 
 static int poly_verify_tag(struct aead_request *req)
@@ -492,7 +492,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
 	struct chachapoly_ctx *ctx = crypto_aead_ctx(aead);
 	int err;
 
-	if (keylen != ctx->saltlen + CHACHA20_KEY_SIZE)
+	if (keylen != ctx->saltlen + CHACHA_KEY_SIZE)
 		return -EINVAL;
 
 	keylen -= ctx->saltlen;
@@ -637,7 +637,7 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 
 	err = -EINVAL;
 	/* Need 16-byte IV size, including Initial Block Counter value */
-	if (crypto_skcipher_alg_ivsize(chacha) != CHACHA20_IV_SIZE)
+	if (crypto_skcipher_alg_ivsize(chacha) != CHACHA_IV_SIZE)
 		goto out_drop_chacha;
 	/* Not a stream cipher? */
 	if (chacha->base.cra_blocksize != 1)
diff --git a/crypto/chacha_generic.c b/crypto/chacha_generic.c
new file mode 100644
index 000000000000..438f15a14054
--- /dev/null
+++ b/crypto/chacha_generic.c
@@ -0,0 +1,193 @@
+/*
+ * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms
+ *
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2018 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/module.h>
+
+static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
+			   unsigned int bytes, int nrounds)
+{
+	/* aligned to potentially speed up crypto_xor() */
+	u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+	if (dst != src)
+		memcpy(dst, src, bytes);
+
+	while (bytes >= CHACHA_BLOCK_SIZE) {
+		chacha_block(state, stream, nrounds);
+		crypto_xor(dst, stream, CHACHA_BLOCK_SIZE);
+		bytes -= CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
+	}
+	if (bytes) {
+		chacha_block(state, stream, nrounds);
+		crypto_xor(dst, stream, bytes);
+	}
+}
+
+static int chacha_stream_xor(struct skcipher_request *req,
+			     struct chacha_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+			       nbytes, ctx->nrounds);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv)
+{
+	state[0]  = 0x61707865; /* "expa" */
+	state[1]  = 0x3320646e; /* "nd 3" */
+	state[2]  = 0x79622d32; /* "2-by" */
+	state[3]  = 0x6b206574; /* "te k" */
+	state[4]  = ctx->key[0];
+	state[5]  = ctx->key[1];
+	state[6]  = ctx->key[2];
+	state[7]  = ctx->key[3];
+	state[8]  = ctx->key[4];
+	state[9]  = ctx->key[5];
+	state[10] = ctx->key[6];
+	state[11] = ctx->key[7];
+	state[12] = get_unaligned_le32(iv +  0);
+	state[13] = get_unaligned_le32(iv +  4);
+	state[14] = get_unaligned_le32(iv +  8);
+	state[15] = get_unaligned_le32(iv + 12);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha_init);
+
+static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			 unsigned int keysize, int nrounds)
+{
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int i;
+
+	if (keysize != CHACHA_KEY_SIZE)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+		ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+	ctx->nrounds = nrounds;
+	return 0;
+}
+
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize)
+{
+	return chacha_setkey(tfm, key, keysize, 20);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
+
+int crypto_chacha_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	return chacha_stream_xor(req, ctx, req->iv);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
+
+int crypto_xchacha_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	/* Compute the subkey given the original key and first 128 nonce bits */
+	crypto_chacha_init(state, ctx, req->iv);
+	hchacha_block(state, subctx.key, ctx->nrounds);
+	subctx.nrounds = ctx->nrounds;
+
+	/* Build the real IV */
+	memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */
+	memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */
+
+	/* Generate the stream and XOR it with the data */
+	return chacha_stream_xor(req, &subctx, real_iv);
+}
+EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= crypto_chacha_crypt,
+		.decrypt		= crypto_chacha_crypt,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= crypto_xchacha_crypt,
+		.decrypt		= crypto_xchacha_crypt,
+	}
+};
+
+static int __init chacha_generic_mod_init(void)
+{
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_generic_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_generic_mod_init);
+module_exit(chacha_generic_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (generic)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-generic");
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2eb70e76ed35..38c6d1af6d1c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -265,7 +265,7 @@
 #include <linux/syscalls.h>
 #include <linux/completion.h>
 #include <linux/uuid.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 
 #include <asm/processor.h>
 #include <linux/uaccess.h>
@@ -431,11 +431,10 @@ static int crng_init = 0;
 #define crng_ready() (likely(crng_init > 1))
 static int crng_init_cnt = 0;
 static unsigned long crng_global_init_time = 0;
-#define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE)
-static void _extract_crng(struct crng_state *crng,
-			  __u8 out[CHACHA20_BLOCK_SIZE]);
+#define CRNG_INIT_CNT_THRESH (2*CHACHA_KEY_SIZE)
+static void _extract_crng(struct crng_state *crng, __u8 out[CHACHA_BLOCK_SIZE]);
 static void _crng_backtrack_protect(struct crng_state *crng,
-				    __u8 tmp[CHACHA20_BLOCK_SIZE], int used);
+				    __u8 tmp[CHACHA_BLOCK_SIZE], int used);
 static void process_random_ready_list(void);
 static void _get_random_bytes(void *buf, int nbytes);
 
@@ -863,7 +862,7 @@ static int crng_fast_load(const char *cp, size_t len)
 	}
 	p = (unsigned char *) &primary_crng.state[4];
 	while (len > 0 && crng_init_cnt < CRNG_INIT_CNT_THRESH) {
-		p[crng_init_cnt % CHACHA20_KEY_SIZE] ^= *cp;
+		p[crng_init_cnt % CHACHA_KEY_SIZE] ^= *cp;
 		cp++; crng_init_cnt++; len--;
 	}
 	spin_unlock_irqrestore(&primary_crng.lock, flags);
@@ -895,7 +894,7 @@ static int crng_slow_load(const char *cp, size_t len)
 	unsigned long		flags;
 	static unsigned char	lfsr = 1;
 	unsigned char		tmp;
-	unsigned		i, max = CHACHA20_KEY_SIZE;
+	unsigned		i, max = CHACHA_KEY_SIZE;
 	const char *		src_buf = cp;
 	char *			dest_buf = (char *) &primary_crng.state[4];
 
@@ -913,8 +912,8 @@ static int crng_slow_load(const char *cp, size_t len)
 		lfsr >>= 1;
 		if (tmp & 1)
 			lfsr ^= 0xE1;
-		tmp = dest_buf[i % CHACHA20_KEY_SIZE];
-		dest_buf[i % CHACHA20_KEY_SIZE] ^= src_buf[i % len] ^ lfsr;
+		tmp = dest_buf[i % CHACHA_KEY_SIZE];
+		dest_buf[i % CHACHA_KEY_SIZE] ^= src_buf[i % len] ^ lfsr;
 		lfsr += (tmp << 3) | (tmp >> 5);
 	}
 	spin_unlock_irqrestore(&primary_crng.lock, flags);
@@ -926,7 +925,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 	unsigned long	flags;
 	int		i, num;
 	union {
-		__u8	block[CHACHA20_BLOCK_SIZE];
+		__u8	block[CHACHA_BLOCK_SIZE];
 		__u32	key[8];
 	} buf;
 
@@ -937,7 +936,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 	} else {
 		_extract_crng(&primary_crng, buf.block);
 		_crng_backtrack_protect(&primary_crng, buf.block,
-					CHACHA20_KEY_SIZE);
+					CHACHA_KEY_SIZE);
 	}
 	spin_lock_irqsave(&crng->lock, flags);
 	for (i = 0; i < 8; i++) {
@@ -973,7 +972,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 }
 
 static void _extract_crng(struct crng_state *crng,
-			  __u8 out[CHACHA20_BLOCK_SIZE])
+			  __u8 out[CHACHA_BLOCK_SIZE])
 {
 	unsigned long v, flags;
 
@@ -990,7 +989,7 @@ static void _extract_crng(struct crng_state *crng,
 	spin_unlock_irqrestore(&crng->lock, flags);
 }
 
-static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+static void extract_crng(__u8 out[CHACHA_BLOCK_SIZE])
 {
 	struct crng_state *crng = NULL;
 
@@ -1008,14 +1007,14 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
  * enough) to mutate the CRNG key to provide backtracking protection.
  */
 static void _crng_backtrack_protect(struct crng_state *crng,
-				    __u8 tmp[CHACHA20_BLOCK_SIZE], int used)
+				    __u8 tmp[CHACHA_BLOCK_SIZE], int used)
 {
 	unsigned long	flags;
 	__u32		*s, *d;
 	int		i;
 
 	used = round_up(used, sizeof(__u32));
-	if (used + CHACHA20_KEY_SIZE > CHACHA20_BLOCK_SIZE) {
+	if (used + CHACHA_KEY_SIZE > CHACHA_BLOCK_SIZE) {
 		extract_crng(tmp);
 		used = 0;
 	}
@@ -1027,7 +1026,7 @@ static void _crng_backtrack_protect(struct crng_state *crng,
 	spin_unlock_irqrestore(&crng->lock, flags);
 }
 
-static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
+static void crng_backtrack_protect(__u8 tmp[CHACHA_BLOCK_SIZE], int used)
 {
 	struct crng_state *crng = NULL;
 
@@ -1042,8 +1041,8 @@ static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
 
 static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
 {
-	ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE;
-	__u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4);
+	ssize_t ret = 0, i = CHACHA_BLOCK_SIZE;
+	__u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
 	int large_request = (nbytes > 256);
 
 	while (nbytes) {
@@ -1057,7 +1056,7 @@ static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
 		}
 
 		extract_crng(tmp);
-		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
+		i = min_t(int, nbytes, CHACHA_BLOCK_SIZE);
 		if (copy_to_user(buf, tmp, i)) {
 			ret = -EFAULT;
 			break;
@@ -1622,14 +1621,14 @@ static void _warn_unseeded_randomness(const char *func_name, void *caller,
  */
 static void _get_random_bytes(void *buf, int nbytes)
 {
-	__u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4);
+	__u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
 
 	trace_get_random_bytes(nbytes, _RET_IP_);
 
-	while (nbytes >= CHACHA20_BLOCK_SIZE) {
+	while (nbytes >= CHACHA_BLOCK_SIZE) {
 		extract_crng(buf);
-		buf += CHACHA20_BLOCK_SIZE;
-		nbytes -= CHACHA20_BLOCK_SIZE;
+		buf += CHACHA_BLOCK_SIZE;
+		nbytes -= CHACHA_BLOCK_SIZE;
 	}
 
 	if (nbytes > 0) {
@@ -1637,7 +1636,7 @@ static void _get_random_bytes(void *buf, int nbytes)
 		memcpy(buf, tmp, nbytes);
 		crng_backtrack_protect(tmp, nbytes);
 	} else
-		crng_backtrack_protect(tmp, CHACHA20_BLOCK_SIZE);
+		crng_backtrack_protect(tmp, CHACHA_BLOCK_SIZE);
 	memzero_explicit(tmp, sizeof(tmp));
 }
 
@@ -2208,8 +2207,8 @@ struct ctl_table random_table[] = {
 
 struct batched_entropy {
 	union {
-		u64 entropy_u64[CHACHA20_BLOCK_SIZE / sizeof(u64)];
-		u32 entropy_u32[CHACHA20_BLOCK_SIZE / sizeof(u32)];
+		u64 entropy_u64[CHACHA_BLOCK_SIZE / sizeof(u64)];
+		u32 entropy_u32[CHACHA_BLOCK_SIZE / sizeof(u32)];
 	};
 	unsigned int position;
 };
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 523565ce0060..92e593e2069a 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -559,7 +559,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
 	unsigned int ivsize = crypto_aead_ivsize(aead);
 	unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize;
 
-	if (keylen != CHACHA20_KEY_SIZE + saltlen) {
+	if (keylen != CHACHA_KEY_SIZE + saltlen) {
 		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c
index 2598640aa98b..425d5d974613 100644
--- a/drivers/crypto/caam/caamalg_qi2.c
+++ b/drivers/crypto/caam/caamalg_qi2.c
@@ -591,7 +591,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
 	unsigned int ivsize = crypto_aead_ivsize(aead);
 	unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize;
 
-	if (keylen != CHACHA20_KEY_SIZE + saltlen) {
+	if (keylen != CHACHA_KEY_SIZE + saltlen) {
 		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
@@ -1577,9 +1577,9 @@ static struct caam_skcipher_alg driver_algs[] = {
 			.setkey = skcipher_setkey,
 			.encrypt = skcipher_encrypt,
 			.decrypt = skcipher_decrypt,
-			.min_keysize = CHACHA20_KEY_SIZE,
-			.max_keysize = CHACHA20_KEY_SIZE,
-			.ivsize = CHACHA20_IV_SIZE,
+			.min_keysize = CHACHA_KEY_SIZE,
+			.max_keysize = CHACHA_KEY_SIZE,
+			.ivsize = CHACHA_IV_SIZE,
 		},
 		.caam.class1_alg_type = OP_ALG_ALGSEL_CHACHA20,
 	},
diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h
index 8bde903f9f4a..87d9efe4c7aa 100644
--- a/drivers/crypto/caam/compat.h
+++ b/drivers/crypto/caam/compat.h
@@ -36,7 +36,7 @@
 #include <crypto/gcm.h>
 #include <crypto/sha.h>
 #include <crypto/md5.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include <crypto/poly1305.h>
 #include <crypto/internal/aead.h>
 #include <crypto/authenc.h>
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
new file mode 100644
index 000000000000..b722a23e54bb
--- /dev/null
+++ b/include/crypto/chacha.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
+ *
+ * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
+ * security.  Here they share the same key size, tfm context, and setkey
+ * function; only their IV size and encrypt/decrypt function differ.
+ */
+
+#ifndef _CRYPTO_CHACHA_H
+#define _CRYPTO_CHACHA_H
+
+#include <crypto/skcipher.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
+#define CHACHA_IV_SIZE		16
+
+#define CHACHA_KEY_SIZE		32
+#define CHACHA_BLOCK_SIZE	64
+#define CHACHAPOLY_IV_SIZE	12
+
+/* 192-bit nonce, then 64-bit stream position */
+#define XCHACHA_IV_SIZE		32
+
+struct chacha_ctx {
+	u32 key[8];
+	int nrounds;
+};
+
+void chacha_block(u32 *state, u8 *stream, int nrounds);
+static inline void chacha20_block(u32 *state, u8 *stream)
+{
+	chacha_block(state, stream, 20);
+}
+void hchacha_block(const u32 *in, u32 *out, int nrounds);
+
+void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv);
+
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize);
+
+int crypto_chacha_crypt(struct skcipher_request *req);
+int crypto_xchacha_crypt(struct skcipher_request *req);
+
+#endif /* _CRYPTO_CHACHA_H */
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
deleted file mode 100644
index c24b4ac03b85..000000000000
--- a/include/crypto/chacha20.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Common values and helper functions for the ChaCha20 and XChaCha20 algorithms.
- *
- * XChaCha20 extends ChaCha20's nonce to 192 bits, while provably retaining
- * ChaCha20's security.  Here they share the same key size, tfm context, and
- * setkey function; only their IV size and encrypt/decrypt function differ.
- */
-
-#ifndef _CRYPTO_CHACHA20_H
-#define _CRYPTO_CHACHA20_H
-
-#include <crypto/skcipher.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-
-/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
-#define CHACHA20_IV_SIZE	16
-
-#define CHACHA20_KEY_SIZE	32
-#define CHACHA20_BLOCK_SIZE	64
-#define CHACHAPOLY_IV_SIZE	12
-
-/* 192-bit nonce, then 64-bit stream position */
-#define XCHACHA20_IV_SIZE	32
-
-struct chacha20_ctx {
-	u32 key[8];
-};
-
-void chacha20_block(u32 *state, u8 *stream);
-void hchacha20_block(const u32 *in, u32 *out);
-
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
-
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			   unsigned int keysize);
-
-int crypto_chacha20_crypt(struct skcipher_request *req);
-int crypto_xchacha20_crypt(struct skcipher_request *req);
-
-#endif
diff --git a/lib/Makefile b/lib/Makefile
index db06d1237898..4c2b6fc5cde9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -20,7 +20,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o timerqueue.o xarray.o \
 	 idr.o int_sqrt.o extable.o \
-	 sha1.o chacha20.o irq_regs.o argv_split.o \
+	 sha1.o chacha.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
diff --git a/lib/chacha.c b/lib/chacha.c
new file mode 100644
index 000000000000..1bdc688c18df
--- /dev/null
+++ b/lib/chacha.c
@@ -0,0 +1,117 @@
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+#include <crypto/chacha.h>
+
+static void chacha_permute(u32 *x, int nrounds)
+{
+	int i;
+
+	/* whitelist the allowed round counts */
+	WARN_ON_ONCE(nrounds != 20);
+
+	for (i = 0; i < nrounds; i += 2) {
+		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
+		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
+		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
+		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
+
+		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
+		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
+		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
+		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
+
+		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
+		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
+		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
+		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
+
+		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
+		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
+		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
+		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
+
+		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
+		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
+		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
+		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
+
+		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
+		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
+		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
+		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
+
+		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
+		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
+		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
+		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
+
+		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
+		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
+		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
+		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
+	}
+}
+
+/**
+ * chacha_block - generate one keystream block and increment block counter
+ * @state: input state matrix (16 32-bit words)
+ * @stream: output keystream block (64 bytes)
+ * @nrounds: number of rounds (currently must be 20)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input.  This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block(u32 *state, u8 *stream, int nrounds)
+{
+	u32 x[16];
+	int i;
+
+	memcpy(x, state, 64);
+
+	chacha_permute(x, nrounds);
+
+	for (i = 0; i < ARRAY_SIZE(x); i++)
+		put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
+
+	state[12]++;
+}
+EXPORT_SYMBOL(chacha_block);
+
+/**
+ * hchacha_block - abbreviated ChaCha core, for XChaCha
+ * @in: input state matrix (16 32-bit words)
+ * @out: output (8 32-bit words)
+ * @nrounds: number of rounds (currently must be 20)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state.  It should not be used for streaming directly.
+ */
+void hchacha_block(const u32 *in, u32 *out, int nrounds)
+{
+	u32 x[16];
+
+	memcpy(x, in, 64);
+
+	chacha_permute(x, nrounds);
+
+	memcpy(&out[0], &x[0], 16);
+	memcpy(&out[4], &x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block);
diff --git a/lib/chacha20.c b/lib/chacha20.c
deleted file mode 100644
index 6a484e16171d..000000000000
--- a/lib/chacha20.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * The "hash function" used as the core of the ChaCha20 stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/cryptohash.h>
-#include <asm/unaligned.h>
-#include <crypto/chacha20.h>
-
-static void chacha20_permute(u32 *x)
-{
-	int i;
-
-	for (i = 0; i < 20; i += 2) {
-		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
-		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
-		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
-		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
-
-		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
-		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
-		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
-		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
-
-		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
-		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
-		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
-		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
-
-		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
-		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
-		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
-		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
-
-		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
-		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
-		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
-		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
-
-		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
-		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
-		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
-		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
-
-		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
-		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
-		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
-		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
-
-		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
-		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
-		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
-		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
-	}
-}
-
-/**
- * chacha20_block - generate one keystream block and increment block counter
- * @state: input state matrix (16 32-bit words)
- * @stream: output keystream block (64 bytes)
- *
- * This is the ChaCha20 core, a function from 64-byte strings to 64-byte
- * strings.  The caller has already converted the endianness of the input.  This
- * function also handles incrementing the block counter in the input matrix.
- */
-void chacha20_block(u32 *state, u8 *stream)
-{
-	u32 x[16];
-	int i;
-
-	memcpy(x, state, 64);
-
-	chacha20_permute(x);
-
-	for (i = 0; i < ARRAY_SIZE(x); i++)
-		put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
-
-	state[12]++;
-}
-EXPORT_SYMBOL(chacha20_block);
-
-/**
- * hchacha20_block - abbreviated ChaCha20 core, for XChaCha20
- * @in: input state matrix (16 32-bit words)
- * @out: output (8 32-bit words)
- *
- * HChaCha20 is the ChaCha equivalent of HSalsa20 and is an intermediate step
- * towards XChaCha20 (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).
- * HChaCha20 skips the final addition of the initial state, and outputs only
- * certain words of the state.  It should not be used for streaming directly.
- */
-void hchacha20_block(const u32 *in, u32 *out)
-{
-	u32 x[16];
-
-	memcpy(x, in, 64);
-
-	chacha20_permute(x);
-
-	memcpy(&out[0], &x[0], 16);
-	memcpy(&out[4], &x[12], 16);
-}
-EXPORT_SYMBOL(hchacha20_block);
-- 
cgit v1.2.3-59-g8ed1b


From aa7624093cb7fbf4fea95e612580d8d29a819f67 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:22 -0800
Subject: crypto: chacha - add XChaCha12 support

Now that the generic implementation of ChaCha20 has been refactored to
allow varying the number of rounds, add support for XChaCha12, which is
the XSalsa construction applied to ChaCha12.  ChaCha12 is one of the
three ciphers specified by the original ChaCha paper
(https://cr.yp.to/chacha/chacha-20080128.pdf: "ChaCha, a variant of
Salsa20"), alongside ChaCha8 and ChaCha20.  ChaCha12 is faster than
ChaCha20 but has a lower, but still large, security margin.

We need XChaCha12 support so that it can be used in the Adiantum
encryption mode, which enables disk/file encryption on low-end mobile
devices where AES-XTS is too slow as the CPUs lack AES instructions.

We'd prefer XChaCha20 (the more popular variant), but it's too slow on
some of our target devices, so at least in some cases we do need the
XChaCha12-based version.  In more detail, the problem is that Adiantum
is still much slower than we're happy with, and encryption still has a
quite noticeable effect on the feel of low-end devices.  Users and
vendors push back hard against encryption that degrades the user
experience, which always risks encryption being disabled entirely.  So
we need to choose the fastest option that gives us a solid margin of
security, and here that's XChaCha12.  The best known attack on ChaCha
breaks only 7 rounds and has 2^235 time complexity, so ChaCha12's
security margin is still better than AES-256's.  Much has been learned
about cryptanalysis of ARX ciphers since Salsa20 was originally designed
in 2005, and it now seems we can be comfortable with a smaller number of
rounds.  The eSTREAM project also suggests the 12-round version of
Salsa20 as providing the best balance among the different variants:
combining very good performance with a "comfortable margin of security".

Note that it would be trivial to add vanilla ChaCha12 in addition to
XChaCha12.  However, it's unneeded for now and therefore is omitted.

As discussed in the patch that introduced XChaCha20 support, I
considered splitting the code into separate chacha-common, chacha20,
xchacha20, and xchacha12 modules, so that these algorithms could be
enabled/disabled independently.  However, since nearly all the code is
shared anyway, I ultimately decided there would have been little benefit
to the added complexity.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig          |   8 +-
 crypto/chacha_generic.c |  26 ++-
 crypto/testmgr.c        |   6 +
 crypto/testmgr.h        | 578 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/crypto/chacha.h |   7 +
 lib/chacha.c            |   6 +-
 6 files changed, 625 insertions(+), 6 deletions(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 75ebd1a2746c..4431c0db56b7 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1403,10 +1403,10 @@ config CRYPTO_SALSA20
 	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
 
 config CRYPTO_CHACHA20
-	tristate "ChaCha20 stream cipher algorithms"
+	tristate "ChaCha stream cipher algorithms"
 	select CRYPTO_BLKCIPHER
 	help
-	  The ChaCha20 and XChaCha20 stream cipher algorithms.
+	  The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
 
 	  ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
 	  Bernstein and further specified in RFC7539 for use in IETF protocols.
@@ -1419,6 +1419,10 @@ config CRYPTO_CHACHA20
 	  while provably retaining ChaCha20's security.  See also:
 	  <https://cr.yp.to/snuffle/xsalsa-20081128.pdf>
 
+	  XChaCha12 is XChaCha20 reduced to 12 rounds, with correspondingly
+	  reduced security margin but increased performance.  It can be needed
+	  in some performance-sensitive scenarios.
+
 config CRYPTO_CHACHA20_X86_64
 	tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
 	depends on X86 && 64BIT
diff --git a/crypto/chacha_generic.c b/crypto/chacha_generic.c
index 438f15a14054..35b583101f4f 100644
--- a/crypto/chacha_generic.c
+++ b/crypto/chacha_generic.c
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 (RFC7539) and XChaCha20 stream cipher algorithms
+ * ChaCha and XChaCha stream ciphers, including ChaCha20 (RFC7539)
  *
  * Copyright (C) 2015 Martin Willi
  * Copyright (C) 2018 Google LLC
@@ -106,6 +106,13 @@ int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
 
+int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize)
+{
+	return chacha_setkey(tfm, key, keysize, 12);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
+
 int crypto_chacha_crypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -168,6 +175,21 @@ static struct skcipher_alg algs[] = {
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= crypto_xchacha_crypt,
 		.decrypt		= crypto_xchacha_crypt,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= crypto_xchacha_crypt,
+		.decrypt		= crypto_xchacha_crypt,
 	}
 };
 
@@ -191,3 +213,5 @@ MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-generic");
 MODULE_ALIAS_CRYPTO("xchacha20");
 MODULE_ALIAS_CRYPTO("xchacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-generic");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 11f5c8b0f4dc..6ff60c3745f1 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3576,6 +3576,12 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(aes_xcbc128_tv_template)
 		}
+	}, {
+		.alg = "xchacha12",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(xchacha12_tv_template)
+		},
 	}, {
 		.alg = "xchacha20",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index df0dc44a9b7b..a23dca2b11d0 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -31571,6 +31571,584 @@ static const struct cipher_testvec xchacha20_tv_template[] = {
 	},
 };
 
+/*
+ * Same as XChaCha20 test vectors above, but recomputed the ciphertext with
+ * XChaCha12, using a modified libsodium.
+ */
+static const struct cipher_testvec xchacha12_tv_template[] = {
+	{
+		.key	= "\x79\xc9\x97\x98\xac\x67\x30\x0b"
+			  "\xbb\x27\x04\xc9\x5c\x34\x1e\x32"
+			  "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e"
+			  "\x52\xff\x45\xb2\x4f\x30\x4f\xc4",
+		.klen	= 32,
+		.iv	= "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf"
+			  "\xbc\x9a\xee\x49\x41\x76\x88\xa0"
+			  "\xa2\x55\x4f\x8d\x95\x38\x94\x19"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00",
+		.ctext	= "\x1b\x78\x7f\xd7\xa1\x41\x68\xab"
+			  "\x3d\x3f\xd1\x7b\x69\x56\xb2\xd5"
+			  "\x43\xce\xeb\xaf\x36\xf0\x29\x9d"
+			  "\x3a\xfb\x18\xae\x1b",
+		.len	= 29,
+	}, {
+		.key	= "\x9d\x23\xbd\x41\x49\xcb\x97\x9c"
+			  "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98"
+			  "\x08\xcb\x0e\x50\xcd\x0f\x67\x81"
+			  "\x22\x35\xea\xaf\x60\x1d\x62\x32",
+		.klen	= 32,
+		.iv	= "\xc0\x47\x54\x82\x66\xb7\xc3\x70"
+			  "\xd3\x35\x66\xa2\x42\x5c\xbf\x30"
+			  "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00",
+		.ctext	= "\xfb\x32\x09\x1d\x83\x05\xae\x4c"
+			  "\x13\x1f\x12\x71\xf2\xca\xb2\xeb"
+			  "\x5b\x83\x14\x7d\x83\xf6\x57\x77"
+			  "\x2e\x40\x1f\x92\x2c\xf9\xec\x35"
+			  "\x34\x1f\x93\xdf\xfb\x30\xd7\x35"
+			  "\x03\x05\x78\xc1\x20\x3b\x7a\xe3"
+			  "\x62\xa3\x89\xdc\x11\x11\x45\xa8"
+			  "\x82\x89\xa0\xf1\x4e\xc7\x0f\x11"
+			  "\x69\xdd\x0c\x84\x2b\x89\x5c\xdc"
+			  "\xf0\xde\x01\xef\xc5\x65\x79\x23"
+			  "\x87\x67\xd6\x50\xd9\x8d\xd9\x92"
+			  "\x54\x5b\x0e",
+		.len	= 91,
+	}, {
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x67\xc6\x69\x73"
+			  "\x51\xff\x4a\xec\x29\xcd\xba\xab"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ctext	= "\xdf\x2d\xc6\x21\x2a\x9d\xa1\xbb"
+			  "\xc2\x77\x66\x0c\x5c\x46\xef\xa7"
+			  "\x79\x1b\xb9\xdf\x55\xe2\xf9\x61"
+			  "\x4c\x7b\xa4\x52\x24\xaf\xa2\xda"
+			  "\xd1\x8f\x8f\xa2\x9e\x53\x4d\xc4"
+			  "\xb8\x55\x98\x08\x7c\x08\xd4\x18"
+			  "\x67\x8f\xef\x50\xb1\x5f\xa5\x77"
+			  "\x4c\x25\xe7\x86\x26\x42\xca\x44",
+		.len	= 64,
+	}, {
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x01",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\xf2\xfb\xe3\x46"
+			  "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d"
+			  "\x01\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x41\x6e\x79\x20\x73\x75\x62\x6d"
+			  "\x69\x73\x73\x69\x6f\x6e\x20\x74"
+			  "\x6f\x20\x74\x68\x65\x20\x49\x45"
+			  "\x54\x46\x20\x69\x6e\x74\x65\x6e"
+			  "\x64\x65\x64\x20\x62\x79\x20\x74"
+			  "\x68\x65\x20\x43\x6f\x6e\x74\x72"
+			  "\x69\x62\x75\x74\x6f\x72\x20\x66"
+			  "\x6f\x72\x20\x70\x75\x62\x6c\x69"
+			  "\x63\x61\x74\x69\x6f\x6e\x20\x61"
+			  "\x73\x20\x61\x6c\x6c\x20\x6f\x72"
+			  "\x20\x70\x61\x72\x74\x20\x6f\x66"
+			  "\x20\x61\x6e\x20\x49\x45\x54\x46"
+			  "\x20\x49\x6e\x74\x65\x72\x6e\x65"
+			  "\x74\x2d\x44\x72\x61\x66\x74\x20"
+			  "\x6f\x72\x20\x52\x46\x43\x20\x61"
+			  "\x6e\x64\x20\x61\x6e\x79\x20\x73"
+			  "\x74\x61\x74\x65\x6d\x65\x6e\x74"
+			  "\x20\x6d\x61\x64\x65\x20\x77\x69"
+			  "\x74\x68\x69\x6e\x20\x74\x68\x65"
+			  "\x20\x63\x6f\x6e\x74\x65\x78\x74"
+			  "\x20\x6f\x66\x20\x61\x6e\x20\x49"
+			  "\x45\x54\x46\x20\x61\x63\x74\x69"
+			  "\x76\x69\x74\x79\x20\x69\x73\x20"
+			  "\x63\x6f\x6e\x73\x69\x64\x65\x72"
+			  "\x65\x64\x20\x61\x6e\x20\x22\x49"
+			  "\x45\x54\x46\x20\x43\x6f\x6e\x74"
+			  "\x72\x69\x62\x75\x74\x69\x6f\x6e"
+			  "\x22\x2e\x20\x53\x75\x63\x68\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x63\x6c\x75"
+			  "\x64\x65\x20\x6f\x72\x61\x6c\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x20\x49\x45"
+			  "\x54\x46\x20\x73\x65\x73\x73\x69"
+			  "\x6f\x6e\x73\x2c\x20\x61\x73\x20"
+			  "\x77\x65\x6c\x6c\x20\x61\x73\x20"
+			  "\x77\x72\x69\x74\x74\x65\x6e\x20"
+			  "\x61\x6e\x64\x20\x65\x6c\x65\x63"
+			  "\x74\x72\x6f\x6e\x69\x63\x20\x63"
+			  "\x6f\x6d\x6d\x75\x6e\x69\x63\x61"
+			  "\x74\x69\x6f\x6e\x73\x20\x6d\x61"
+			  "\x64\x65\x20\x61\x74\x20\x61\x6e"
+			  "\x79\x20\x74\x69\x6d\x65\x20\x6f"
+			  "\x72\x20\x70\x6c\x61\x63\x65\x2c"
+			  "\x20\x77\x68\x69\x63\x68\x20\x61"
+			  "\x72\x65\x20\x61\x64\x64\x72\x65"
+			  "\x73\x73\x65\x64\x20\x74\x6f",
+		.ctext	= "\xe4\xa6\xc8\x30\xc4\x23\x13\xd6"
+			  "\x08\x4d\xc9\xb7\xa5\x64\x7c\xb9"
+			  "\x71\xe2\xab\x3e\xa8\x30\x8a\x1c"
+			  "\x4a\x94\x6d\x9b\xe0\xb3\x6f\xf1"
+			  "\xdc\xe3\x1b\xb3\xa9\x6d\x0d\xd6"
+			  "\xd0\xca\x12\xef\xe7\x5f\xd8\x61"
+			  "\x3c\x82\xd3\x99\x86\x3c\x6f\x66"
+			  "\x02\x06\xdc\x55\xf9\xed\xdf\x38"
+			  "\xb4\xa6\x17\x00\x7f\xef\xbf\x4f"
+			  "\xf8\x36\xf1\x60\x7e\x47\xaf\xdb"
+			  "\x55\x9b\x12\xcb\x56\x44\xa7\x1f"
+			  "\xd3\x1a\x07\x3b\x00\xec\xe6\x4c"
+			  "\xa2\x43\x27\xdf\x86\x19\x4f\x16"
+			  "\xed\xf9\x4a\xf3\x63\x6f\xfa\x7f"
+			  "\x78\x11\xf6\x7d\x97\x6f\xec\x6f"
+			  "\x85\x0f\x5c\x36\x13\x8d\x87\xe0"
+			  "\x80\xb1\x69\x0b\x98\x89\x9c\x4e"
+			  "\xf8\xdd\xee\x5c\x0a\x85\xce\xd4"
+			  "\xea\x1b\x48\xbe\x08\xf8\xe2\xa8"
+			  "\xa5\xb0\x3c\x79\xb1\x15\xb4\xb9"
+			  "\x75\x10\x95\x35\x81\x7e\x26\xe6"
+			  "\x78\xa4\x88\xcf\xdb\x91\x34\x18"
+			  "\xad\xd7\x8e\x07\x7d\xab\x39\xf9"
+			  "\xa3\x9e\xa5\x1d\xbb\xed\x61\xfd"
+			  "\xdc\xb7\x5a\x27\xfc\xb5\xc9\x10"
+			  "\xa8\xcc\x52\x7f\x14\x76\x90\xe7"
+			  "\x1b\x29\x60\x74\xc0\x98\x77\xbb"
+			  "\xe0\x54\xbb\x27\x49\x59\x1e\x62"
+			  "\x3d\xaf\x74\x06\xa4\x42\x6f\xc6"
+			  "\x52\x97\xc4\x1d\xc4\x9f\xe2\xe5"
+			  "\x38\x57\x91\xd1\xa2\x28\xcc\x40"
+			  "\xcc\x70\x59\x37\xfc\x9f\x4b\xda"
+			  "\xa0\xeb\x97\x9a\x7d\xed\x14\x5c"
+			  "\x9c\xb7\x93\x26\x41\xa8\x66\xdd"
+			  "\x87\x6a\xc0\xd3\xc2\xa9\x3e\xae"
+			  "\xe9\x72\xfe\xd1\xb3\xac\x38\xea"
+			  "\x4d\x15\xa9\xd5\x36\x61\xe9\x96"
+			  "\x6c\x23\xf8\x43\xe4\x92\x29\xd9"
+			  "\x8b\x78\xf7\x0a\x52\xe0\x19\x5b"
+			  "\x59\x69\x5b\x5d\xa1\x53\xc4\x68"
+			  "\xe1\xbb\xac\x89\x14\xe2\xe2\x85"
+			  "\x41\x18\xf5\xb3\xd1\xfa\x68\x19"
+			  "\x44\x78\xdc\xcf\xe7\x88\x2d\x52"
+			  "\x5f\x40\xb5\x7e\xf8\x88\xa2\xae"
+			  "\x4a\xb2\x07\x35\x9d\x9b\x07\x88"
+			  "\xb7\x00\xd0\x0c\xb6\xa0\x47\x59"
+			  "\xda\x4e\xc9\xab\x9b\x8a\x7b",
+
+		.len	= 375,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 375 - 20, 4, 16 },
+
+	}, {
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\x76\x5a\x2e\x63"
+			  "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7"
+			  "\x2a\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x27\x54\x77\x61\x73\x20\x62\x72"
+			  "\x69\x6c\x6c\x69\x67\x2c\x20\x61"
+			  "\x6e\x64\x20\x74\x68\x65\x20\x73"
+			  "\x6c\x69\x74\x68\x79\x20\x74\x6f"
+			  "\x76\x65\x73\x0a\x44\x69\x64\x20"
+			  "\x67\x79\x72\x65\x20\x61\x6e\x64"
+			  "\x20\x67\x69\x6d\x62\x6c\x65\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x77"
+			  "\x61\x62\x65\x3a\x0a\x41\x6c\x6c"
+			  "\x20\x6d\x69\x6d\x73\x79\x20\x77"
+			  "\x65\x72\x65\x20\x74\x68\x65\x20"
+			  "\x62\x6f\x72\x6f\x67\x6f\x76\x65"
+			  "\x73\x2c\x0a\x41\x6e\x64\x20\x74"
+			  "\x68\x65\x20\x6d\x6f\x6d\x65\x20"
+			  "\x72\x61\x74\x68\x73\x20\x6f\x75"
+			  "\x74\x67\x72\x61\x62\x65\x2e",
+		.ctext	= "\xb9\x68\xbc\x6a\x24\xbc\xcc\xd8"
+			  "\x9b\x2a\x8d\x5b\x96\xaf\x56\xe3"
+			  "\x11\x61\xe7\xa7\x9b\xce\x4e\x7d"
+			  "\x60\x02\x48\xac\xeb\xd5\x3a\x26"
+			  "\x9d\x77\x3b\xb5\x32\x13\x86\x8e"
+			  "\x20\x82\x26\x72\xae\x64\x1b\x7e"
+			  "\x2e\x01\x68\xb4\x87\x45\xa1\x24"
+			  "\xe4\x48\x40\xf0\xaa\xac\xee\xa9"
+			  "\xfc\x31\xad\x9d\x89\xa3\xbb\xd2"
+			  "\xe4\x25\x13\xad\x0f\x5e\xdf\x3c"
+			  "\x27\xab\xb8\x62\x46\x22\x30\x48"
+			  "\x55\x2c\x4e\x84\x78\x1d\x0d\x34"
+			  "\x8d\x3c\x91\x0a\x7f\x5b\x19\x9f"
+			  "\x97\x05\x4c\xa7\x62\x47\x8b\xc5"
+			  "\x44\x2e\x20\x33\xdd\xa0\x82\xa9"
+			  "\x25\x76\x37\xe6\x3c\x67\x5b",
+		.len	= 127,
+	}, {
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x01\x31\x58\xa3\x5a"
+			  "\x25\x5d\x05\x17\x58\xe9\x5e\xd4"
+			  "\x1c\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x49\xee\xe0\xdc\x24\x90\x40\xcd"
+			  "\xc5\x40\x8f\x47\x05\xbc\xdd\x81"
+			  "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb"
+			  "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8"
+			  "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4"
+			  "\x19\x4b\x01\x0f\x4e\xa4\x43\xce"
+			  "\x01\xc6\x67\xda\x03\x91\x18\x90"
+			  "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac"
+			  "\x74\x92\xd3\x53\x47\xc8\xdd\x25"
+			  "\x53\x6c\x02\x03\x87\x0d\x11\x0c"
+			  "\x58\xe3\x12\x18\xfd\x2a\x5b\x40"
+			  "\x0c\x30\xf0\xb8\x3f\x43\xce\xae"
+			  "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc"
+			  "\x33\x97\xc3\x77\xba\xc5\x70\xde"
+			  "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f"
+			  "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c"
+			  "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c"
+			  "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe"
+			  "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6"
+			  "\x79\x49\x41\xf4\x58\x18\xcb\x86"
+			  "\x7f\x30\x0e\xf8\x7d\x44\x36\xea"
+			  "\x75\xeb\x88\x84\x40\x3c\xad\x4f"
+			  "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5"
+			  "\x21\x66\xe9\xa7\xe3\xb2\x15\x88"
+			  "\x78\xf6\x79\xa1\x59\x47\x12\x4e"
+			  "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08"
+			  "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b"
+			  "\xdd\x60\x71\xf7\x47\x8c\x61\xc3"
+			  "\xda\x8a\x78\x1e\x16\xfa\x1e\x86"
+			  "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7"
+			  "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4"
+			  "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6"
+			  "\xf4\xe6\x33\x43\x84\x93\xa5\x67"
+			  "\x9b\x16\x58\x58\x80\x0f\x2b\x5c"
+			  "\x24\x74\x75\x7f\x95\x81\xb7\x30"
+			  "\x7a\x33\xa7\xf7\x94\x87\x32\x27"
+			  "\x10\x5d\x14\x4c\x43\x29\xdd\x26"
+			  "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10"
+			  "\xea\x6b\x64\xfd\x73\xc6\xed\xec"
+			  "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07"
+			  "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5"
+			  "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1"
+			  "\xec\xca\x60\x09\x4c\x6a\xd5\x09"
+			  "\x49\x46\x00\x88\x22\x8d\xce\xea"
+			  "\xb1\x17\x11\xde\x42\xd2\x23\xc1"
+			  "\x72\x11\xf5\x50\x73\x04\x40\x47"
+			  "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0"
+			  "\x3f\x58\xc1\x52\xab\x12\x67\x9d"
+			  "\x3f\x43\x4b\x68\xd4\x9c\x68\x38"
+			  "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b"
+			  "\xf9\xe5\x31\x69\x22\xf9\xa6\x69"
+			  "\xc6\x9c\x96\x9a\x12\x35\x95\x1d"
+			  "\x95\xd5\xdd\xbe\xbf\x93\x53\x24"
+			  "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00"
+			  "\x6f\x88\xc4\x37\x18\x69\x7c\xd7"
+			  "\x41\x92\x55\x4c\x03\xa1\x9a\x4b"
+			  "\x15\xe5\xdf\x7f\x37\x33\x72\xc1"
+			  "\x8b\x10\x67\xa3\x01\x57\x94\x25"
+			  "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73"
+			  "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda"
+			  "\x58\xb1\x47\x90\xfe\x42\x21\x72"
+			  "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd"
+			  "\xc6\x84\x6e\xca\xae\xe3\x68\xb4"
+			  "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b"
+			  "\x03\xa1\x31\xd9\xde\x8d\xf5\x22"
+			  "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76"
+			  "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe"
+			  "\x54\xf7\x27\x1b\xf4\xde\x02\xf5"
+			  "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e"
+			  "\x4b\x6e\xed\x46\x23\xdc\x65\xb2"
+			  "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7"
+			  "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8"
+			  "\x65\x69\x8a\x45\x29\xef\x74\x85"
+			  "\xde\x79\xc7\x08\xae\x30\xb0\xf4"
+			  "\xa3\x1d\x51\x41\xab\xce\xcb\xf6"
+			  "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3"
+			  "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7"
+			  "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9"
+			  "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a"
+			  "\x9f\xd7\xb9\x6c\x65\x14\x22\x45"
+			  "\x6e\x45\x32\x3e\x7e\x60\x1a\x12"
+			  "\x97\x82\x14\xfb\xaa\x04\x22\xfa"
+			  "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d"
+			  "\x78\x33\x5a\x7c\xad\xdb\x29\xce"
+			  "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac"
+			  "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21"
+			  "\x83\x35\x7e\xad\x73\xc2\xb5\x6c"
+			  "\x10\x26\x38\x07\xe5\xc7\x36\x80"
+			  "\xe2\x23\x12\x61\xf5\x48\x4b\x2b"
+			  "\xc5\xdf\x15\xd9\x87\x01\xaa\xac"
+			  "\x1e\x7c\xad\x73\x78\x18\x63\xe0"
+			  "\x8b\x9f\x81\xd8\x12\x6a\x28\x10"
+			  "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c"
+			  "\x83\x66\x80\x47\x80\xe8\xfd\x35"
+			  "\x1c\x97\x6f\xae\x49\x10\x66\xcc"
+			  "\xc6\xd8\xcc\x3a\x84\x91\x20\x77"
+			  "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9"
+			  "\x25\x94\x10\x5f\x40\x00\x64\x99"
+			  "\xdc\xae\xd7\x21\x09\x78\x50\x15"
+			  "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39"
+			  "\x87\x6e\x6d\xab\xde\x08\x51\x16"
+			  "\xc7\x13\xe9\xea\xed\x06\x8e\x2c"
+			  "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43"
+			  "\xb6\x98\x37\xb2\x43\xed\xde\xdf"
+			  "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b"
+			  "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9"
+			  "\x80\x55\xc9\x34\x91\xd1\x59\xe8"
+			  "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06"
+			  "\x20\xa8\x5d\xfa\xd1\xde\x70\x56"
+			  "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8"
+			  "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5"
+			  "\x44\x4b\x9f\xc2\x93\x03\xea\x2b"
+			  "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23"
+			  "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee"
+			  "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab"
+			  "\x82\x6b\x37\x04\xeb\x74\xbe\x79"
+			  "\xb9\x83\x90\xef\x20\x59\x46\xff"
+			  "\xe9\x97\x3e\x2f\xee\xb6\x64\x18"
+			  "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a"
+			  "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4"
+			  "\xce\xd3\x91\x49\x88\xc7\xb8\x4d"
+			  "\xb1\xb9\x07\x6d\x16\x72\xae\x46"
+			  "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8"
+			  "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62"
+			  "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9"
+			  "\x94\x97\xea\xdd\x58\x9e\xae\x76"
+			  "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde"
+			  "\xf7\x32\x87\xcd\x93\xbf\x11\x56"
+			  "\x11\xbe\x08\x74\xe1\x69\xad\xe2"
+			  "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe"
+			  "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76"
+			  "\x35\xea\x5d\x85\x81\xaf\x85\xeb"
+			  "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc"
+			  "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a"
+			  "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9"
+			  "\x37\x1c\xeb\x46\x54\x3f\xa5\x91"
+			  "\xc2\xb5\x8c\xfe\x53\x08\x97\x32"
+			  "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc"
+			  "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1"
+			  "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c"
+			  "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd"
+			  "\x20\x4e\x7c\x51\xb0\x60\x73\xb8"
+			  "\x9c\xac\x91\x90\x7e\x01\xb0\xe1"
+			  "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a"
+			  "\x06\x52\x95\x52\xb2\xe9\x25\x2e"
+			  "\x4c\xe2\x5a\x00\xb2\x13\x81\x03"
+			  "\x77\x66\x0d\xa5\x99\xda\x4e\x8c"
+			  "\xac\xf3\x13\x53\x27\x45\xaf\x64"
+			  "\x46\xdc\xea\x23\xda\x97\xd1\xab"
+			  "\x7d\x6c\x30\x96\x1f\xbc\x06\x34"
+			  "\x18\x0b\x5e\x21\x35\x11\x8d\x4c"
+			  "\xe0\x2d\xe9\x50\x16\x74\x81\xa8"
+			  "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc"
+			  "\xca\x34\x83\x27\x10\x5b\x68\x45"
+			  "\x8f\x52\x22\x0c\x55\x3d\x29\x7c"
+			  "\xe3\xc0\x66\x05\x42\x91\x5f\x58"
+			  "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19"
+			  "\x04\xa9\x08\x4b\x57\xfc\x67\x53"
+			  "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f"
+			  "\x92\xd6\x41\x7c\x5b\x2a\x00\x79"
+			  "\x72",
+		.ctext	= "\xe1\xb6\x8b\x5c\x80\xb8\xcc\x08"
+			  "\x1b\x84\xb2\xd1\xad\xa4\x70\xac"
+			  "\x67\xa9\x39\x27\xac\xb4\x5b\xb7"
+			  "\x4c\x26\x77\x23\x1d\xce\x0a\xbe"
+			  "\x18\x9e\x42\x8b\xbd\x7f\xd6\xf1"
+			  "\xf1\x6b\xe2\x6d\x7f\x92\x0e\xcb"
+			  "\xb8\x79\xba\xb4\xac\x7e\x2d\xc0"
+			  "\x9e\x83\x81\x91\xd5\xea\xc3\x12"
+			  "\x8d\xa4\x26\x70\xa4\xf9\x71\x0b"
+			  "\xbd\x2e\xe1\xb3\x80\x42\x25\xb3"
+			  "\x0b\x31\x99\xe1\x0d\xde\xa6\x90"
+			  "\xf2\xa3\x10\xf7\xe5\xf3\x83\x1e"
+			  "\x2c\xfb\x4d\xf0\x45\x3d\x28\x3c"
+			  "\xb8\xf1\xcb\xbf\x67\xd8\x43\x5a"
+			  "\x9d\x7b\x73\x29\x88\x0f\x13\x06"
+			  "\x37\x50\x0d\x7c\xe6\x9b\x07\xdd"
+			  "\x7e\x01\x1f\x81\x90\x10\x69\xdb"
+			  "\xa4\xad\x8a\x5e\xac\x30\x72\xf2"
+			  "\x36\xcd\xe3\x23\x49\x02\x93\xfa"
+			  "\x3d\xbb\xe2\x98\x83\xeb\xe9\x8d"
+			  "\xb3\x8f\x11\xaa\x53\xdb\xaf\x2e"
+			  "\x95\x13\x99\x3d\x71\xbd\x32\x92"
+			  "\xdd\xfc\x9d\x5e\x6f\x63\x2c\xee"
+			  "\x91\x1f\x4c\x64\x3d\x87\x55\x0f"
+			  "\xcc\x3d\x89\x61\x53\x02\x57\x8f"
+			  "\xe4\x77\x29\x32\xaf\xa6\x2f\x0a"
+			  "\xae\x3c\x3f\x3f\xf4\xfb\x65\x52"
+			  "\xc5\xc1\x78\x78\x53\x28\xad\xed"
+			  "\xd1\x67\x37\xc7\x59\x70\xcd\x0a"
+			  "\xb8\x0f\x80\x51\x9f\xc0\x12\x5e"
+			  "\x06\x0a\x7e\xec\x24\x5f\x73\x00"
+			  "\xb1\x0b\x31\x47\x4f\x73\x8d\xb4"
+			  "\xce\xf3\x55\x45\x6c\x84\x27\xba"
+			  "\xb9\x6f\x03\x4a\xeb\x98\x88\x6e"
+			  "\x53\xed\x25\x19\x0d\x8f\xfe\xca"
+			  "\x60\xe5\x00\x93\x6e\x3c\xff\x19"
+			  "\xae\x08\x3b\x8a\xa6\x84\x05\xfe"
+			  "\x9b\x59\xa0\x8c\xc8\x05\x45\xf5"
+			  "\x05\x37\xdc\x45\x6f\x8b\x95\x8c"
+			  "\x4e\x11\x45\x7a\xce\x21\xa5\xf7"
+			  "\x71\x67\xb9\xce\xd7\xf9\xe9\x5e"
+			  "\x60\xf5\x53\x7a\xa8\x85\x14\x03"
+			  "\xa0\x92\xec\xf3\x51\x80\x84\xc4"
+			  "\xdc\x11\x9e\x57\xce\x4b\x45\xcf"
+			  "\x90\x95\x85\x0b\x96\xe9\xee\x35"
+			  "\x10\xb8\x9b\xf2\x59\x4a\xc6\x7e"
+			  "\x85\xe5\x6f\x38\x51\x93\x40\x0c"
+			  "\x99\xd7\x7f\x32\xa8\x06\x27\xd1"
+			  "\x2b\xd5\xb5\x3a\x1a\xe1\x5e\xda"
+			  "\xcd\x5a\x50\x30\x3c\xc7\xe7\x65"
+			  "\xa6\x07\x0b\x98\x91\xc6\x20\x27"
+			  "\x2a\x03\x63\x1b\x1e\x3d\xaf\xc8"
+			  "\x71\x48\x46\x6a\x64\x28\xf9\x3d"
+			  "\xd1\x1d\xab\xc8\x40\x76\xc2\x39"
+			  "\x4e\x00\x75\xd2\x0e\x82\x58\x8c"
+			  "\xd3\x73\x5a\xea\x46\x89\xbe\xfd"
+			  "\x4e\x2c\x0d\x94\xaa\x9b\x68\xac"
+			  "\x86\x87\x30\x7e\xa9\x16\xcd\x59"
+			  "\xd2\xa6\xbe\x0a\xd8\xf5\xfd\x2d"
+			  "\x49\x69\xd2\x1a\x90\xd2\x1b\xed"
+			  "\xff\x71\x04\x87\x87\x21\xc4\xb8"
+			  "\x1f\x5b\x51\x33\xd0\xd6\x59\x9a"
+			  "\x03\x0e\xd3\x8b\xfb\x57\x73\xfd"
+			  "\x5a\x52\x63\x82\xc8\x85\x2f\xcb"
+			  "\x74\x6d\x4e\xd9\x68\x37\x85\x6a"
+			  "\xd4\xfb\x94\xed\x8d\xd1\x1a\xaf"
+			  "\x76\xa7\xb7\x88\xd0\x2b\x4e\xda"
+			  "\xec\x99\x94\x27\x6f\x87\x8c\xdf"
+			  "\x4b\x5e\xa6\x66\xdd\xcb\x33\x7b"
+			  "\x64\x94\x31\xa8\x37\xa6\x1d\xdb"
+			  "\x0d\x5c\x93\xa4\x40\xf9\x30\x53"
+			  "\x4b\x74\x8d\xdd\xf6\xde\x3c\xac"
+			  "\x5c\x80\x01\x3a\xef\xb1\x9a\x02"
+			  "\x0c\x22\x8e\xe7\x44\x09\x74\x4c"
+			  "\xf2\x9a\x27\x69\x7f\x12\x32\x36"
+			  "\xde\x92\xdf\xde\x8f\x5b\x31\xab"
+			  "\x4a\x01\x26\xe0\xb1\xda\xe8\x37"
+			  "\x21\x64\xe8\xff\x69\xfc\x9e\x41"
+			  "\xd2\x96\x2d\x18\x64\x98\x33\x78"
+			  "\x24\x61\x73\x9b\x47\x29\xf1\xa7"
+			  "\xcb\x27\x0f\xf0\x85\x6d\x8c\x9d"
+			  "\x2c\x95\x9e\xe5\xb2\x8e\x30\x29"
+			  "\x78\x8a\x9d\x65\xb4\x8e\xde\x7b"
+			  "\xd9\x00\x50\xf5\x7f\x81\xc3\x1b"
+			  "\x25\x85\xeb\xc2\x8c\x33\x22\x1e"
+			  "\x68\x38\x22\x30\xd8\x2e\x00\x98"
+			  "\x85\x16\x06\x56\xb4\x81\x74\x20"
+			  "\x95\xdb\x1c\x05\x19\xe8\x23\x4d"
+			  "\x65\x5d\xcc\xd8\x7f\xc4\x2d\x0f"
+			  "\x57\x26\x71\x07\xad\xaa\x71\x9f"
+			  "\x19\x76\x2f\x25\x51\x88\xe4\xc0"
+			  "\x82\x6e\x08\x05\x37\x04\xee\x25"
+			  "\x23\x90\xe9\x4e\xce\x9b\x16\xc1"
+			  "\x31\xe7\x6e\x2c\x1b\xe1\x85\x9a"
+			  "\x0c\x8c\xbb\x12\x1e\x68\x7b\x93"
+			  "\xa9\x3c\x39\x56\x23\x3e\x6e\xc7"
+			  "\x77\x84\xd3\xe0\x86\x59\xaa\xb9"
+			  "\xd5\x53\x58\xc9\x0a\x83\x5f\x85"
+			  "\xd8\x47\x14\x67\x8a\x3c\x17\xe0"
+			  "\xab\x02\x51\xea\xf1\xf0\x4f\x30"
+			  "\x7d\xe0\x92\xc2\x5f\xfb\x19\x5a"
+			  "\x3f\xbd\xf4\x39\xa4\x31\x0c\x39"
+			  "\xd1\xae\x4e\xf7\x65\x7f\x1f\xce"
+			  "\xc2\x39\xd1\x84\xd4\xe5\x02\xe0"
+			  "\x58\xaa\xf1\x5e\x81\xaf\x7f\x72"
+			  "\x0f\x08\x99\x43\xb9\xd8\xac\x41"
+			  "\x35\x55\xf2\xb2\xd4\x98\xb8\x3b"
+			  "\x2b\x3c\x3e\x16\x06\x31\xfc\x79"
+			  "\x47\x38\x63\x51\xc5\xd0\x26\xd7"
+			  "\x43\xb4\x2b\xd9\xc5\x05\xf2\x9d"
+			  "\x18\xc9\x26\x82\x56\xd2\x11\x05"
+			  "\xb6\x89\xb4\x43\x9c\xb5\x9d\x11"
+			  "\x6c\x83\x37\x71\x27\x1c\xae\xbf"
+			  "\xcd\x57\xd2\xee\x0d\x5a\x15\x26"
+			  "\x67\x88\x80\x80\x1b\xdc\xc1\x62"
+			  "\xdd\x4c\xff\x92\x5c\x6c\xe1\xa0"
+			  "\xe3\x79\xa9\x65\x8c\x8c\x14\x42"
+			  "\xe5\x11\xd2\x1a\xad\xa9\x56\x6f"
+			  "\x98\xfc\x8a\x7b\x56\x1f\xc6\xc1"
+			  "\x52\x12\x92\x9b\x41\x0f\x4b\xae"
+			  "\x1b\x4a\xbc\xfe\x23\xb6\x94\x70"
+			  "\x04\x30\x9e\x69\x47\xbe\xb8\x8f"
+			  "\xca\x45\xd7\x8a\xf4\x78\x3e\xaa"
+			  "\x71\x17\xd8\x1e\xb8\x11\x8f\xbc"
+			  "\xc8\x1a\x65\x7b\x41\x89\x72\xc7"
+			  "\x5f\xbe\xc5\x2a\xdb\x5c\x54\xf9"
+			  "\x25\xa3\x7a\x80\x56\x9c\x8c\xab"
+			  "\x26\x19\x10\x36\xa6\xf3\x14\x79"
+			  "\x40\x98\x70\x68\xb7\x35\xd9\xb9"
+			  "\x27\xd4\xe7\x74\x5b\x3d\x97\xb4"
+			  "\xd9\xaa\xd9\xf2\xb5\x14\x84\x1f"
+			  "\xa9\xde\x12\x44\x5b\x00\xc0\xbc"
+			  "\xc8\x11\x25\x1b\x67\x7a\x15\x72"
+			  "\xa6\x31\x6f\xf4\x68\x7a\x86\x9d"
+			  "\x43\x1c\x5f\x16\xd3\xad\x2e\x52"
+			  "\xf3\xb4\xc3\xfa\x27\x2e\x68\x6c"
+			  "\x06\xe7\x4c\x4f\xa2\xe0\xe4\x21"
+			  "\x5d\x9e\x33\x58\x8d\xbf\xd5\x70"
+			  "\xf8\x80\xa5\xdd\xe7\x18\x79\xfa"
+			  "\x7b\xfd\x09\x69\x2c\x37\x32\xa8"
+			  "\x65\xfa\x8d\x8b\x5c\xcc\xe8\xf3"
+			  "\x37\xf6\xa6\xc6\x5c\xa2\x66\x79"
+			  "\xfa\x8a\xa7\xd1\x0b\x2e\x1b\x5e"
+			  "\x95\x35\x00\x76\xae\x42\xf7\x50"
+			  "\x51\x78\xfb\xb4\x28\x24\xde\x1a"
+			  "\x70\x8b\xed\xca\x3c\x5e\xe4\xbd"
+			  "\x28\xb5\xf3\x76\x4f\x67\x5d\x81"
+			  "\xb2\x60\x87\xd9\x7b\x19\x1a\xa7"
+			  "\x79\xa2\xfa\x3f\x9e\xa9\xd7\x25"
+			  "\x61\xe1\x74\x31\xa2\x77\xa0\x1b"
+			  "\xf6\xf7\xcb\xc5\xaa\x9e\xce\xf9"
+			  "\x9b\x96\xef\x51\xc3\x1a\x44\x96"
+			  "\xae\x17\x50\xab\x29\x08\xda\xcc"
+			  "\x1a\xb3\x12\xd0\x24\xe4\xe2\xe0"
+			  "\xc6\xe3\xcc\x82\xd0\xba\x47\x4c"
+			  "\x3f\x49\xd7\xe8\xb6\x61\xaa\x65"
+			  "\x25\x18\x40\x2d\x62\x25\x02\x71"
+			  "\x61\xa2\xc1\xb2\x13\xd2\x71\x3f"
+			  "\x43\x1a\xc9\x09\x92\xff\xd5\x57"
+			  "\xf0\xfc\x5e\x1c\xf1\xf5\xf9\xf3"
+			  "\x5b",
+		.len	= 1281,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 1200, 1, 80 },
+	},
+};
+
 /*
  * CTS (Cipher Text Stealing) mode tests
  */
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index b722a23e54bb..1fc70a69d550 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -5,6 +5,11 @@
  * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
  * security.  Here they share the same key size, tfm context, and setkey
  * function; only their IV size and encrypt/decrypt function differ.
+ *
+ * The ChaCha paper specifies 20, 12, and 8-round variants.  In general, it is
+ * recommended to use the 20-round variant ChaCha20.  However, the other
+ * variants can be needed in some performance-sensitive scenarios.  The generic
+ * ChaCha code currently allows only the 20 and 12-round variants.
  */
 
 #ifndef _CRYPTO_CHACHA_H
@@ -40,6 +45,8 @@ void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv);
 
 int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 			   unsigned int keysize);
+int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize);
 
 int crypto_chacha_crypt(struct skcipher_request *req);
 int crypto_xchacha_crypt(struct skcipher_request *req);
diff --git a/lib/chacha.c b/lib/chacha.c
index 1bdc688c18df..a46d2832dbab 100644
--- a/lib/chacha.c
+++ b/lib/chacha.c
@@ -21,7 +21,7 @@ static void chacha_permute(u32 *x, int nrounds)
 	int i;
 
 	/* whitelist the allowed round counts */
-	WARN_ON_ONCE(nrounds != 20);
+	WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
 
 	for (i = 0; i < nrounds; i += 2) {
 		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
@@ -70,7 +70,7 @@ static void chacha_permute(u32 *x, int nrounds)
  * chacha_block - generate one keystream block and increment block counter
  * @state: input state matrix (16 32-bit words)
  * @stream: output keystream block (64 bytes)
- * @nrounds: number of rounds (currently must be 20)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
  *
  * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
  * The caller has already converted the endianness of the input.  This function
@@ -96,7 +96,7 @@ EXPORT_SYMBOL(chacha_block);
  * hchacha_block - abbreviated ChaCha core, for XChaCha
  * @in: input state matrix (16 32-bit words)
  * @out: output (8 32-bit words)
- * @nrounds: number of rounds (currently must be 20)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
  *
  * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
  * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
-- 
cgit v1.2.3-59-g8ed1b


From be2830b15b60011845ad701076511e8b93b2fd76 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:23 -0800
Subject: crypto: arm/chacha20 - limit the preemption-disabled section

To improve responsivesess, disable preemption for each step of the walk
(which is at most PAGE_SIZE) rather than for the entire
encryption/decryption operation.

Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/chacha20-neon-glue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
index 7386eb1c1889..2bc035cb8f23 100644
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ b/arch/arm/crypto/chacha20-neon-glue.c
@@ -68,22 +68,22 @@ static int chacha20_neon(struct skcipher_request *req)
 	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
 		return crypto_chacha_crypt(req);
 
-	err = skcipher_walk_virt(&walk, req, true);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	crypto_chacha_init(state, ctx, walk.iv);
 
-	kernel_neon_begin();
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
 
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
+		kernel_neon_begin();
 		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
 				nbytes);
+		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
-	kernel_neon_end();
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From d97a94309d764ed907d4281da6246f5d935166f8 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:24 -0800
Subject: crypto: arm/chacha20 - add XChaCha20 support

Add an XChaCha20 implementation that is hooked up to the ARM NEON
implementation of ChaCha20.  This is needed for use in the Adiantum
encryption mode; see the generic code patch,
"crypto: chacha20-generic - add XChaCha20 support", for more details.

We also update the NEON code to support HChaCha20 on one block, so we
can use that in XChaCha20 rather than calling the generic HChaCha20.
This required factoring the permutation out into its own macro.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Kconfig              |   2 +-
 arch/arm/crypto/chacha20-neon-core.S |  70 ++++++++++++++++--------
 arch/arm/crypto/chacha20-neon-glue.c | 103 ++++++++++++++++++++++++++---------
 3 files changed, 126 insertions(+), 49 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 0473a8f68389..a08759c32cb9 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -126,7 +126,7 @@ config CRYPTO_CRC32_ARM_CE
 	select CRYPTO_HASH
 
 config CRYPTO_CHACHA20_NEON
-	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	tristate "NEON accelerated ChaCha20 stream cipher algorithms"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 50e7b9896818..2335e5055d2b 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -52,27 +52,16 @@
 	.fpu		neon
 	.align		5
 
-ENTRY(chacha20_block_xor_neon)
-	// r0: Input state matrix, s
-	// r1: 1 data block output, o
-	// r2: 1 data block input, i
-
-	//
-	// This function encrypts one ChaCha20 block by loading the state matrix
-	// in four NEON registers. It performs matrix operation on four words in
-	// parallel, but requireds shuffling to rearrange the words after each
-	// round.
-	//
-
-	// x0..3 = s0..3
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	vmov		q8, q0
-	vmov		q9, q1
-	vmov		q10, q2
-	vmov		q11, q3
+/*
+ * chacha20_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha20_permute:
 
 	adr		ip, .Lrol8_table
 	mov		r3, #10
@@ -142,6 +131,27 @@ ENTRY(chacha20_block_xor_neon)
 	subs		r3, r3, #1
 	bne		.Ldoubleround
 
+	bx		lr
+ENDPROC(chacha20_permute)
+
+ENTRY(chacha20_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+	push		{lr}
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	bl		chacha20_permute
+
 	add		ip, r2, #0x20
 	vld1.8		{q4-q5}, [r2]
 	vld1.8		{q6-q7}, [ip]
@@ -166,9 +176,25 @@ ENTRY(chacha20_block_xor_neon)
 	vst1.8		{q0-q1}, [r1]
 	vst1.8		{q2-q3}, [ip]
 
-	bx		lr
+	pop		{pc}
 ENDPROC(chacha20_block_xor_neon)
 
+ENTRY(hchacha20_block_neon)
+	// r0: Input state matrix, s
+	// r1: output (8 32-bit words)
+	push		{lr}
+
+	vld1.32		{q0-q1}, [r0]!
+	vld1.32		{q2-q3}, [r0]
+
+	bl		chacha20_permute
+
+	vst1.32		{q0}, [r1]!
+	vst1.32		{q3}, [r1]
+
+	pop		{pc}
+ENDPROC(hchacha20_block_neon)
+
 	.align		4
 .Lctrinc:	.word	0, 1, 2, 3
 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
index 2bc035cb8f23..f2d3b0f70a8d 100644
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ b/arch/arm/crypto/chacha20-neon-glue.c
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated
  *
  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
@@ -30,6 +30,7 @@
 
 asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
 asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
 
 static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
@@ -57,20 +58,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
-static int chacha20_neon(struct skcipher_request *req)
+static int chacha20_neon_stream_xor(struct skcipher_request *req,
+				    struct chacha_ctx *ctx, u8 *iv)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	struct skcipher_walk walk;
 	u32 state[16];
 	int err;
 
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha_crypt(req);
-
 	err = skcipher_walk_virt(&walk, req, false);
 
-	crypto_chacha_init(state, ctx, walk.iv);
+	crypto_chacha_init(state, ctx, iv);
 
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
@@ -88,22 +85,73 @@ static int chacha20_neon(struct skcipher_request *req)
 	return err;
 }
 
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-neon",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA_KEY_SIZE,
-	.max_keysize		= CHACHA_KEY_SIZE,
-	.ivsize			= CHACHA_IV_SIZE,
-	.chunksize		= CHACHA_BLOCK_SIZE,
-	.walksize		= 4 * CHACHA_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_neon,
-	.decrypt		= chacha20_neon,
+static int chacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
+
+	return chacha20_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_xchacha_crypt(req);
+
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_neon_begin();
+	hchacha20_block_neon(state, subctx.key);
+	kernel_neon_end();
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	return chacha20_neon_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha20_neon,
+		.decrypt		= chacha20_neon,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha20_neon,
+		.decrypt		= xchacha20_neon,
+	}
 };
 
 static int __init chacha20_simd_mod_init(void)
@@ -111,12 +159,12 @@ static int __init chacha20_simd_mod_init(void)
 	if (!(elf_hwcap & HWCAP_NEON))
 		return -ENODEV;
 
-	return crypto_register_skcipher(&alg);
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 static void __exit chacha20_simd_mod_fini(void)
 {
-	crypto_unregister_skcipher(&alg);
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 module_init(chacha20_simd_mod_init);
@@ -125,3 +173,6 @@ module_exit(chacha20_simd_mod_fini);
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
-- 
cgit v1.2.3-59-g8ed1b


From 3cc215198eac75cc4130729ddd94a5cdbdb4d300 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:25 -0800
Subject: crypto: arm/chacha20 - refactor to allow varying number of rounds

In preparation for adding XChaCha12 support, rename/refactor the NEON
implementation of ChaCha20 to support different numbers of rounds.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Makefile             |   4 +-
 arch/arm/crypto/chacha-neon-core.S   | 560 +++++++++++++++++++++++++++++++++++
 arch/arm/crypto/chacha-neon-glue.c   | 182 ++++++++++++
 arch/arm/crypto/chacha20-neon-core.S | 556 ----------------------------------
 arch/arm/crypto/chacha20-neon-glue.c | 178 -----------
 5 files changed, 744 insertions(+), 736 deletions(-)
 create mode 100644 arch/arm/crypto/chacha-neon-core.S
 create mode 100644 arch/arm/crypto/chacha-neon-glue.c
 delete mode 100644 arch/arm/crypto/chacha20-neon-core.S
 delete mode 100644 arch/arm/crypto/chacha20-neon-glue.c

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index bd5bceef0605..005482ff9504 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -52,7 +52,7 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
new file mode 100644
index 000000000000..eb22926d4912
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -0,0 +1,560 @@
+/*
+ * ChaCha/XChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
+  *
+  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
+  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
+  * (c)  vrev32.16			(16-bit rotations only)
+  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
+  *					 needs index vector)
+  *
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
+  *
+  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+  * and doesn't need a temporary register.
+  *
+  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
+  * is twice as fast as (a), even when doing (a) on multiple registers
+  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
+  * parallelizes better when temporary registers are scarce.
+  *
+  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+  * (a), so the need to load the rotation table actually makes the vtbl method
+  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
+  * seems to be a good compromise to get a more significant speed boost on some
+  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+  */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+	.align		5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+	adr		ip, .Lrol8_table
+	vld1.8		{d10}, [ip, :64]
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vext.8		q1, q1, q1, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vext.8		q3, q3, q3, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vext.8		q1, q1, q1, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vext.8		q3, q3, q3, #4
+
+	subs		r3, r3, #2
+	bne		.Ldoubleround
+
+	bx		lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+	// r3: nrounds
+	push		{lr}
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	bl		chacha_permute
+
+	add		ip, r2, #0x20
+	vld1.8		{q4-q5}, [r2]
+	vld1.8		{q6-q7}, [ip]
+
+	// o0 = i0 ^ (x0 + s0)
+	vadd.i32	q0, q0, q8
+	veor		q0, q0, q4
+
+	// o1 = i1 ^ (x1 + s1)
+	vadd.i32	q1, q1, q9
+	veor		q1, q1, q5
+
+	// o2 = i2 ^ (x2 + s2)
+	vadd.i32	q2, q2, q10
+	veor		q2, q2, q6
+
+	// o3 = i3 ^ (x3 + s3)
+	vadd.i32	q3, q3, q11
+	veor		q3, q3, q7
+
+	add		ip, r1, #0x20
+	vst1.8		{q0-q1}, [r1]
+	vst1.8		{q2-q3}, [ip]
+
+	pop		{pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+	// r0: Input state matrix, s
+	// r1: output (8 32-bit words)
+	// r2: nrounds
+	push		{lr}
+
+	vld1.32		{q0-q1}, [r0]!
+	vld1.32		{q2-q3}, [r0]
+
+	mov		r3, r2
+	bl		chacha_permute
+
+	vst1.32		{q0}, [r1]!
+	vst1.32		{q3}, [r1]
+
+	pop		{pc}
+ENDPROC(hchacha_block_neon)
+
+	.align		4
+.Lctrinc:	.word	0, 1, 2, 3
+.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
+
+	.align		5
+ENTRY(chacha_4block_xor_neon)
+	push		{r4-r5}
+	mov		r4, sp			// preserve the stack pointer
+	sub		ip, sp, #0x20		// allocate a 32 byte buffer
+	bic		ip, ip, #0x1f		// aligned to 32 bytes
+	mov		sp, ip
+
+	// r0: Input state matrix, s
+	// r1: 4 data blocks output, o
+	// r2: 4 data blocks input, i
+	// r3: nrounds
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. The words are re-interleaved before the
+	// final addition of the original state and the XORing step.
+	//
+
+	// x0..15[0-3] = s0..15[0-3]
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	adr		r5, .Lctrinc
+	vdup.32		q15, d7[1]
+	vdup.32		q14, d7[0]
+	vld1.32		{q4}, [r5, :128]
+	vdup.32		q13, d6[1]
+	vdup.32		q12, d6[0]
+	vdup.32		q11, d5[1]
+	vdup.32		q10, d5[0]
+	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
+	vdup.32		q9, d4[1]
+	vdup.32		q8, d4[0]
+	vdup.32		q7, d3[1]
+	vdup.32		q6, d3[0]
+	vdup.32		q5, d2[1]
+	vdup.32		q4, d2[0]
+	vdup.32		q3, d1[1]
+	vdup.32		q2, d1[0]
+	vdup.32		q1, d0[1]
+	vdup.32		q0, d0[0]
+
+	adr		ip, .Lrol8_table
+	b		1f
+
+.Ldoubleround4:
+	vld1.32		{q8-q9}, [sp, :256]
+1:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+	vrev32.16	q15, q15
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #12
+	vshl.u32	q5, q9, #12
+	vsri.u32	q4, q8, #20
+	vsri.u32	q5, q9, #20
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #12
+	vshl.u32	q7, q9, #12
+	vsri.u32	q6, q8, #20
+	vsri.u32	q7, q9, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #7
+	vshl.u32	q5, q9, #7
+	vsri.u32	q4, q8, #25
+	vsri.u32	q5, q9, #25
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #7
+	vshl.u32	q7, q9, #7
+	vsri.u32	q6, q8, #25
+	vsri.u32	q7, q9, #25
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vrev32.16	q15, q15
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #12
+	vshl.u32	q4, q9, #12
+	vsri.u32	q7, q8, #20
+	vsri.u32	q4, q9, #20
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #12
+	vshl.u32	q6, q9, #12
+	vsri.u32	q5, q8, #20
+	vsri.u32	q6, q9, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #7
+	vshl.u32	q4, q9, #7
+	vsri.u32	q7, q8, #25
+	vsri.u32	q4, q9, #25
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #7
+	vshl.u32	q6, q9, #7
+	vsri.u32	q5, q8, #25
+	vsri.u32	q6, q9, #25
+
+	subs		r3, r3, #2
+	bne		.Ldoubleround4
+
+	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+	// x8..9[0-3] are on the stack.
+
+	// Re-interleave the words in the first two rows of each block (x0..7).
+	// Also add the counter values 0-3 to x12[0-3].
+	  vld1.32	{q8}, [r5, :128]	// load counter values 0-3
+	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
+	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
+	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
+	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
+	  vadd.u32	q12, q8			// x12 += counter values 0-3
+	vswp		d1, d4
+	vswp		d3, d6
+	  vld1.32	{q8-q9}, [r0]!		// load s0..7
+	vswp		d9, d12
+	vswp		d11, d14
+
+	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+	// after XORing the first 32 bytes.
+	vswp		q1, q4
+
+	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
+	vadd.u32	q0, q0, q8
+	vadd.u32	q2, q2, q8
+	vadd.u32	q4, q4, q8
+	vadd.u32	q3, q3, q8
+
+	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
+	vadd.u32	q1, q1, q9
+	vadd.u32	q6, q6, q9
+	vadd.u32	q5, q5, q9
+	vadd.u32	q7, q7, q9
+
+	// XOR first 32 bytes using keystream from first two rows of first block
+	vld1.8		{q8-q9}, [r2]!
+	veor		q8, q8, q0
+	veor		q9, q9, q1
+	vst1.8		{q8-q9}, [r1]!
+
+	// Re-interleave the words in the last two rows of each block (x8..15).
+	vld1.32		{q8-q9}, [sp, :256]
+	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
+	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
+	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
+	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
+	  vld1.32	{q0-q1}, [r0]	// load s8..15
+	vswp		d25, d28
+	vswp		d27, d30
+	vswp		d17, d20
+	vswp		d19, d22
+
+	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
+	vadd.u32	q8,  q8,  q0
+	vadd.u32	q10, q10, q0
+	vadd.u32	q9,  q9,  q0
+	vadd.u32	q11, q11, q0
+
+	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+	vadd.u32	q12, q12, q1
+	vadd.u32	q14, q14, q1
+	vadd.u32	q13, q13, q1
+	vadd.u32	q15, q15, q1
+
+	// XOR the rest of the data with the keystream
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q8
+	veor		q1, q1, q12
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q2
+	veor		q1, q1, q6
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q10
+	veor		q1, q1, q14
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q4
+	veor		q1, q1, q5
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q9
+	veor		q1, q1, q13
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q3
+	veor		q1, q1, q7
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]
+	  mov		sp, r4		// restore original stack pointer
+	veor		q0, q0, q11
+	veor		q1, q1, q15
+	vst1.8		{q0-q1}, [r1]
+
+	pop		{r4-r5}
+	bx		lr
+ENDPROC(chacha_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
new file mode 100644
index 000000000000..385557d38634
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -0,0 +1,182 @@
+/*
+ * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				       int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	u8 buf[CHACHA_BLOCK_SIZE];
+
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_neon(state, dst, src, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA_BLOCK_SIZE) {
+		chacha_block_xor_neon(state, dst, src, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE;
+		src += CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		kernel_neon_begin();
+		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
+		kernel_neon_end();
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
+
+	return chacha_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_xchacha_crypt(req);
+
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_neon_begin();
+	hchacha_block_neon(state, subctx.key, ctx->nrounds);
+	kernel_neon_end();
+	subctx.nrounds = ctx->nrounds;
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	return chacha_neon_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_neon,
+		.decrypt		= chacha_neon,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
+	}
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
deleted file mode 100644
index 2335e5055d2b..000000000000
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
- /*
-  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
-  *
-  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
-  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
-  * (c)  vrev32.16			(16-bit rotations only)
-  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
-  *					 needs index vector)
-  *
-  * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
-  * rotations, the only choices are (a) and (b).  We use (a) since it takes
-  * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
-  *
-  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
-  * and doesn't need a temporary register.
-  *
-  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
-  * is twice as fast as (a), even when doing (a) on multiple registers
-  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
-  * parallelizes better when temporary registers are scarce.
-  *
-  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
-  * (a), so the need to load the rotation table actually makes the vtbl method
-  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
-  * seems to be a good compromise to get a more significant speed boost on some
-  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
-  */
-
-#include <linux/linkage.h>
-
-	.text
-	.fpu		neon
-	.align		5
-
-/*
- * chacha20_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers q0-q3.  It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * Clobbers: r3, ip, q4-q5
- */
-chacha20_permute:
-
-	adr		ip, .Lrol8_table
-	mov		r3, #10
-	vld1.8		{d10}, [ip, :64]
-
-.Ldoubleround:
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vrev32.16	q3, q3
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #12
-	vsri.u32	q1, q4, #20
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vtbl.8		d6, {d6}, d10
-	vtbl.8		d7, {d7}, d10
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #7
-	vsri.u32	q1, q4, #25
-
-	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	vext.8		q1, q1, q1, #4
-	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vext.8		q2, q2, q2, #8
-	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	vext.8		q3, q3, q3, #12
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vrev32.16	q3, q3
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #12
-	vsri.u32	q1, q4, #20
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vtbl.8		d6, {d6}, d10
-	vtbl.8		d7, {d7}, d10
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #7
-	vsri.u32	q1, q4, #25
-
-	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	vext.8		q1, q1, q1, #12
-	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vext.8		q2, q2, q2, #8
-	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	vext.8		q3, q3, q3, #4
-
-	subs		r3, r3, #1
-	bne		.Ldoubleround
-
-	bx		lr
-ENDPROC(chacha20_permute)
-
-ENTRY(chacha20_block_xor_neon)
-	// r0: Input state matrix, s
-	// r1: 1 data block output, o
-	// r2: 1 data block input, i
-	push		{lr}
-
-	// x0..3 = s0..3
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	vmov		q8, q0
-	vmov		q9, q1
-	vmov		q10, q2
-	vmov		q11, q3
-
-	bl		chacha20_permute
-
-	add		ip, r2, #0x20
-	vld1.8		{q4-q5}, [r2]
-	vld1.8		{q6-q7}, [ip]
-
-	// o0 = i0 ^ (x0 + s0)
-	vadd.i32	q0, q0, q8
-	veor		q0, q0, q4
-
-	// o1 = i1 ^ (x1 + s1)
-	vadd.i32	q1, q1, q9
-	veor		q1, q1, q5
-
-	// o2 = i2 ^ (x2 + s2)
-	vadd.i32	q2, q2, q10
-	veor		q2, q2, q6
-
-	// o3 = i3 ^ (x3 + s3)
-	vadd.i32	q3, q3, q11
-	veor		q3, q3, q7
-
-	add		ip, r1, #0x20
-	vst1.8		{q0-q1}, [r1]
-	vst1.8		{q2-q3}, [ip]
-
-	pop		{pc}
-ENDPROC(chacha20_block_xor_neon)
-
-ENTRY(hchacha20_block_neon)
-	// r0: Input state matrix, s
-	// r1: output (8 32-bit words)
-	push		{lr}
-
-	vld1.32		{q0-q1}, [r0]!
-	vld1.32		{q2-q3}, [r0]
-
-	bl		chacha20_permute
-
-	vst1.32		{q0}, [r1]!
-	vst1.32		{q3}, [r1]
-
-	pop		{pc}
-ENDPROC(hchacha20_block_neon)
-
-	.align		4
-.Lctrinc:	.word	0, 1, 2, 3
-.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
-
-	.align		5
-ENTRY(chacha20_4block_xor_neon)
-	push		{r4-r5}
-	mov		r4, sp			// preserve the stack pointer
-	sub		ip, sp, #0x20		// allocate a 32 byte buffer
-	bic		ip, ip, #0x1f		// aligned to 32 bytes
-	mov		sp, ip
-
-	// r0: Input state matrix, s
-	// r1: 4 data blocks output, o
-	// r2: 4 data blocks input, i
-
-	//
-	// This function encrypts four consecutive ChaCha20 blocks by loading
-	// the state matrix in NEON registers four times. The algorithm performs
-	// each operation on the corresponding word of each state matrix, hence
-	// requires no word shuffling. The words are re-interleaved before the
-	// final addition of the original state and the XORing step.
-	//
-
-	// x0..15[0-3] = s0..15[0-3]
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	adr		r5, .Lctrinc
-	vdup.32		q15, d7[1]
-	vdup.32		q14, d7[0]
-	vld1.32		{q4}, [r5, :128]
-	vdup.32		q13, d6[1]
-	vdup.32		q12, d6[0]
-	vdup.32		q11, d5[1]
-	vdup.32		q10, d5[0]
-	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
-	vdup.32		q9, d4[1]
-	vdup.32		q8, d4[0]
-	vdup.32		q7, d3[1]
-	vdup.32		q6, d3[0]
-	vdup.32		q5, d2[1]
-	vdup.32		q4, d2[0]
-	vdup.32		q3, d1[1]
-	vdup.32		q2, d1[0]
-	vdup.32		q1, d0[1]
-	vdup.32		q0, d0[0]
-
-	adr		ip, .Lrol8_table
-	mov		r3, #10
-	b		1f
-
-.Ldoubleround4:
-	vld1.32		{q8-q9}, [sp, :256]
-1:
-	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	vadd.i32	q0, q0, q4
-	vadd.i32	q1, q1, q5
-	vadd.i32	q2, q2, q6
-	vadd.i32	q3, q3, q7
-
-	veor		q12, q12, q0
-	veor		q13, q13, q1
-	veor		q14, q14, q2
-	veor		q15, q15, q3
-
-	vrev32.16	q12, q12
-	vrev32.16	q13, q13
-	vrev32.16	q14, q14
-	vrev32.16	q15, q15
-
-	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	vadd.i32	q8, q8, q12
-	vadd.i32	q9, q9, q13
-	vadd.i32	q10, q10, q14
-	vadd.i32	q11, q11, q15
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q4, q8
-	veor		q9, q5, q9
-	vshl.u32	q4, q8, #12
-	vshl.u32	q5, q9, #12
-	vsri.u32	q4, q8, #20
-	vsri.u32	q5, q9, #20
-
-	veor		q8, q6, q10
-	veor		q9, q7, q11
-	vshl.u32	q6, q8, #12
-	vshl.u32	q7, q9, #12
-	vsri.u32	q6, q8, #20
-	vsri.u32	q7, q9, #20
-
-	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	vld1.8		{d16}, [ip, :64]
-	vadd.i32	q0, q0, q4
-	vadd.i32	q1, q1, q5
-	vadd.i32	q2, q2, q6
-	vadd.i32	q3, q3, q7
-
-	veor		q12, q12, q0
-	veor		q13, q13, q1
-	veor		q14, q14, q2
-	veor		q15, q15, q3
-
-	vtbl.8		d24, {d24}, d16
-	vtbl.8		d25, {d25}, d16
-	vtbl.8		d26, {d26}, d16
-	vtbl.8		d27, {d27}, d16
-	vtbl.8		d28, {d28}, d16
-	vtbl.8		d29, {d29}, d16
-	vtbl.8		d30, {d30}, d16
-	vtbl.8		d31, {d31}, d16
-
-	vld1.32		{q8-q9}, [sp, :256]
-
-	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	vadd.i32	q8, q8, q12
-	vadd.i32	q9, q9, q13
-	vadd.i32	q10, q10, q14
-	vadd.i32	q11, q11, q15
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q4, q8
-	veor		q9, q5, q9
-	vshl.u32	q4, q8, #7
-	vshl.u32	q5, q9, #7
-	vsri.u32	q4, q8, #25
-	vsri.u32	q5, q9, #25
-
-	veor		q8, q6, q10
-	veor		q9, q7, q11
-	vshl.u32	q6, q8, #7
-	vshl.u32	q7, q9, #7
-	vsri.u32	q6, q8, #25
-	vsri.u32	q7, q9, #25
-
-	vld1.32		{q8-q9}, [sp, :256]
-
-	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	vadd.i32	q0, q0, q5
-	vadd.i32	q1, q1, q6
-	vadd.i32	q2, q2, q7
-	vadd.i32	q3, q3, q4
-
-	veor		q15, q15, q0
-	veor		q12, q12, q1
-	veor		q13, q13, q2
-	veor		q14, q14, q3
-
-	vrev32.16	q15, q15
-	vrev32.16	q12, q12
-	vrev32.16	q13, q13
-	vrev32.16	q14, q14
-
-	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	vadd.i32	q10, q10, q15
-	vadd.i32	q11, q11, q12
-	vadd.i32	q8, q8, q13
-	vadd.i32	q9, q9, q14
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q7, q8
-	veor		q9, q4, q9
-	vshl.u32	q7, q8, #12
-	vshl.u32	q4, q9, #12
-	vsri.u32	q7, q8, #20
-	vsri.u32	q4, q9, #20
-
-	veor		q8, q5, q10
-	veor		q9, q6, q11
-	vshl.u32	q5, q8, #12
-	vshl.u32	q6, q9, #12
-	vsri.u32	q5, q8, #20
-	vsri.u32	q6, q9, #20
-
-	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	vld1.8		{d16}, [ip, :64]
-	vadd.i32	q0, q0, q5
-	vadd.i32	q1, q1, q6
-	vadd.i32	q2, q2, q7
-	vadd.i32	q3, q3, q4
-
-	veor		q15, q15, q0
-	veor		q12, q12, q1
-	veor		q13, q13, q2
-	veor		q14, q14, q3
-
-	vtbl.8		d30, {d30}, d16
-	vtbl.8		d31, {d31}, d16
-	vtbl.8		d24, {d24}, d16
-	vtbl.8		d25, {d25}, d16
-	vtbl.8		d26, {d26}, d16
-	vtbl.8		d27, {d27}, d16
-	vtbl.8		d28, {d28}, d16
-	vtbl.8		d29, {d29}, d16
-
-	vld1.32		{q8-q9}, [sp, :256]
-
-	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	vadd.i32	q10, q10, q15
-	vadd.i32	q11, q11, q12
-	vadd.i32	q8, q8, q13
-	vadd.i32	q9, q9, q14
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q7, q8
-	veor		q9, q4, q9
-	vshl.u32	q7, q8, #7
-	vshl.u32	q4, q9, #7
-	vsri.u32	q7, q8, #25
-	vsri.u32	q4, q9, #25
-
-	veor		q8, q5, q10
-	veor		q9, q6, q11
-	vshl.u32	q5, q8, #7
-	vshl.u32	q6, q9, #7
-	vsri.u32	q5, q8, #25
-	vsri.u32	q6, q9, #25
-
-	subs		r3, r3, #1
-	bne		.Ldoubleround4
-
-	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
-	// x8..9[0-3] are on the stack.
-
-	// Re-interleave the words in the first two rows of each block (x0..7).
-	// Also add the counter values 0-3 to x12[0-3].
-	  vld1.32	{q8}, [r5, :128]	// load counter values 0-3
-	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
-	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
-	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
-	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
-	  vadd.u32	q12, q8			// x12 += counter values 0-3
-	vswp		d1, d4
-	vswp		d3, d6
-	  vld1.32	{q8-q9}, [r0]!		// load s0..7
-	vswp		d9, d12
-	vswp		d11, d14
-
-	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
-	// after XORing the first 32 bytes.
-	vswp		q1, q4
-
-	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
-
-	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
-	vadd.u32	q0, q0, q8
-	vadd.u32	q2, q2, q8
-	vadd.u32	q4, q4, q8
-	vadd.u32	q3, q3, q8
-
-	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
-	vadd.u32	q1, q1, q9
-	vadd.u32	q6, q6, q9
-	vadd.u32	q5, q5, q9
-	vadd.u32	q7, q7, q9
-
-	// XOR first 32 bytes using keystream from first two rows of first block
-	vld1.8		{q8-q9}, [r2]!
-	veor		q8, q8, q0
-	veor		q9, q9, q1
-	vst1.8		{q8-q9}, [r1]!
-
-	// Re-interleave the words in the last two rows of each block (x8..15).
-	vld1.32		{q8-q9}, [sp, :256]
-	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
-	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
-	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
-	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
-	  vld1.32	{q0-q1}, [r0]	// load s8..15
-	vswp		d25, d28
-	vswp		d27, d30
-	vswp		d17, d20
-	vswp		d19, d22
-
-	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
-
-	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
-	vadd.u32	q8,  q8,  q0
-	vadd.u32	q10, q10, q0
-	vadd.u32	q9,  q9,  q0
-	vadd.u32	q11, q11, q0
-
-	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
-	vadd.u32	q12, q12, q1
-	vadd.u32	q14, q14, q1
-	vadd.u32	q13, q13, q1
-	vadd.u32	q15, q15, q1
-
-	// XOR the rest of the data with the keystream
-
-	vld1.8		{q0-q1}, [r2]!
-	veor		q0, q0, q8
-	veor		q1, q1, q12
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	veor		q0, q0, q2
-	veor		q1, q1, q6
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	veor		q0, q0, q10
-	veor		q1, q1, q14
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	veor		q0, q0, q4
-	veor		q1, q1, q5
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	veor		q0, q0, q9
-	veor		q1, q1, q13
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	veor		q0, q0, q3
-	veor		q1, q1, q7
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]
-	  mov		sp, r4		// restore original stack pointer
-	veor		q0, q0, q11
-	veor		q1, q1, q15
-	vst1.8		{q0-q1}, [r1]
-
-	pop		{r4-r5}
-	bx		lr
-ENDPROC(chacha20_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
deleted file mode 100644
index f2d3b0f70a8d..000000000000
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-	u8 buf[CHACHA_BLOCK_SIZE];
-
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_neon(state, dst, src);
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha20_block_xor_neon(state, dst, src);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_neon(state, buf, buf);
-		memcpy(dst, buf, bytes);
-	}
-}
-
-static int chacha20_neon_stream_xor(struct skcipher_request *req,
-				    struct chacha_ctx *ctx, u8 *iv)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	crypto_chacha_init(state, ctx, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		kernel_neon_begin();
-		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
-		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int chacha20_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha_crypt(req);
-
-	return chacha20_neon_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha20_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
-		return crypto_xchacha_crypt(req);
-
-	crypto_chacha_init(state, ctx, req->iv);
-
-	kernel_neon_begin();
-	hchacha20_block_neon(state, subctx.key);
-	kernel_neon_end();
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha20_neon_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= chacha20_neon,
-		.decrypt		= chacha20_neon,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= xchacha20_neon,
-		.decrypt		= xchacha20_neon,
-	}
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
-
-	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-neon");
-- 
cgit v1.2.3-59-g8ed1b


From bdb063a79f6da589af1de3f10a7c8f654fba9ae8 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:26 -0800
Subject: crypto: arm/chacha - add XChaCha12 support

Now that the 32-bit ARM NEON implementation of ChaCha20 and XChaCha20
has been refactored to support varying the number of rounds, add support
for XChaCha12.  This is identical to XChaCha20 except for the number of
rounds, which is 12 instead of 20.

XChaCha12 is faster than XChaCha20 but has a lower security margin,
though still greater than AES-256's since the best known attacks make it
through only 7 rounds.  See the patch "crypto: chacha - add XChaCha12
support" for more details about why we need XChaCha12 support.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Kconfig            |  2 +-
 arch/arm/crypto/chacha-neon-glue.c | 21 ++++++++++++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index a08759c32cb9..59c674cf08ef 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -126,7 +126,7 @@ config CRYPTO_CRC32_ARM_CE
 	select CRYPTO_HASH
 
 config CRYPTO_CHACHA20_NEON
-	tristate "NEON accelerated ChaCha20 stream cipher algorithms"
+	tristate "NEON accelerated ChaCha stream cipher algorithms"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
index 385557d38634..9d6fda81986d 100644
--- a/arch/arm/crypto/chacha-neon-glue.c
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -1,5 +1,6 @@
 /*
- * ChaCha20 (RFC7539) and XChaCha20 stream ciphers, NEON accelerated
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
  *
  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
@@ -154,6 +155,22 @@ static struct skcipher_alg algs[] = {
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= xchacha_neon,
 		.decrypt		= xchacha_neon,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
 	}
 };
 
@@ -180,3 +197,5 @@ MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-neon");
 MODULE_ALIAS_CRYPTO("xchacha20");
 MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
-- 
cgit v1.2.3-59-g8ed1b


From 878afc35cd28bcd93cd3c5e1985ef39a104a4d45 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:27 -0800
Subject: crypto: poly1305 - use structures for key and accumulator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for exposing a low-level Poly1305 API which implements
the ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305
MAC and supports block-aligned inputs only, create structures
poly1305_key and poly1305_state which hold the limbs of the Poly1305
"r" key and accumulator, respectively.

These structures could actually have the same type (e.g. poly1305_val),
but different types are preferable, to prevent misuse.

Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/poly1305_glue.c | 20 +++++++++-------
 crypto/poly1305_generic.c       | 52 ++++++++++++++++++++---------------------
 include/crypto/poly1305.h       | 12 ++++++++--
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f012b7e28ad1..88cc01506c84 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
 	if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
 		if (unlikely(!sctx->wset)) {
 			if (!sctx->uset) {
-				memcpy(sctx->u, dctx->r, sizeof(sctx->u));
-				poly1305_simd_mult(sctx->u, dctx->r);
+				memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+				poly1305_simd_mult(sctx->u, dctx->r.r);
 				sctx->uset = true;
 			}
 			memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u + 5, dctx->r);
+			poly1305_simd_mult(sctx->u + 5, dctx->r.r);
 			memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u + 10, dctx->r);
+			poly1305_simd_mult(sctx->u + 10, dctx->r.r);
 			sctx->wset = true;
 		}
 		blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
-		poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+		poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+				     sctx->u);
 		src += POLY1305_BLOCK_SIZE * 4 * blocks;
 		srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
 	}
 #endif
 	if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
 		if (unlikely(!sctx->uset)) {
-			memcpy(sctx->u, dctx->r, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u, dctx->r);
+			memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+			poly1305_simd_mult(sctx->u, dctx->r.r);
 			sctx->uset = true;
 		}
 		blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
-		poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+		poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+				     sctx->u);
 		src += POLY1305_BLOCK_SIZE * 2 * blocks;
 		srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
 	}
 	if (srclen >= POLY1305_BLOCK_SIZE) {
-		poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+		poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
 		srclen -= POLY1305_BLOCK_SIZE;
 	}
 	return srclen;
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 47d3a6b83931..a23173f351b7 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -38,7 +38,7 @@ int crypto_poly1305_init(struct shash_desc *desc)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	memset(dctx->h, 0, sizeof(dctx->h));
+	memset(dctx->h.h, 0, sizeof(dctx->h.h));
 	dctx->buflen = 0;
 	dctx->rset = false;
 	dctx->sset = false;
@@ -50,11 +50,11 @@ EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	dctx->r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
-	dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
-	dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
-	dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
+	dctx->r.r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
+	dctx->r.r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
+	dctx->r.r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
+	dctx->r.r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
+	dctx->r.r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
 }
 
 static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
@@ -107,22 +107,22 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 		srclen = datalen;
 	}
 
-	r0 = dctx->r[0];
-	r1 = dctx->r[1];
-	r2 = dctx->r[2];
-	r3 = dctx->r[3];
-	r4 = dctx->r[4];
+	r0 = dctx->r.r[0];
+	r1 = dctx->r.r[1];
+	r2 = dctx->r.r[2];
+	r3 = dctx->r.r[3];
+	r4 = dctx->r.r[4];
 
 	s1 = r1 * 5;
 	s2 = r2 * 5;
 	s3 = r3 * 5;
 	s4 = r4 * 5;
 
-	h0 = dctx->h[0];
-	h1 = dctx->h[1];
-	h2 = dctx->h[2];
-	h3 = dctx->h[3];
-	h4 = dctx->h[4];
+	h0 = dctx->h.h[0];
+	h1 = dctx->h.h[1];
+	h2 = dctx->h.h[2];
+	h3 = dctx->h.h[3];
+	h4 = dctx->h.h[4];
 
 	while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
 
@@ -157,11 +157,11 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 		srclen -= POLY1305_BLOCK_SIZE;
 	}
 
-	dctx->h[0] = h0;
-	dctx->h[1] = h1;
-	dctx->h[2] = h2;
-	dctx->h[3] = h3;
-	dctx->h[4] = h4;
+	dctx->h.h[0] = h0;
+	dctx->h.h[1] = h1;
+	dctx->h.h[2] = h2;
+	dctx->h.h[3] = h3;
+	dctx->h.h[4] = h4;
 
 	return srclen;
 }
@@ -220,11 +220,11 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 	}
 
 	/* fully carry h */
-	h0 = dctx->h[0];
-	h1 = dctx->h[1];
-	h2 = dctx->h[2];
-	h3 = dctx->h[3];
-	h4 = dctx->h[4];
+	h0 = dctx->h.h[0];
+	h1 = dctx->h.h[1];
+	h2 = dctx->h.h[2];
+	h3 = dctx->h.h[3];
+	h4 = dctx->h.h[4];
 
 	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
 	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index f718a19da82f..493244c46664 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -13,13 +13,21 @@
 #define POLY1305_KEY_SIZE	32
 #define POLY1305_DIGEST_SIZE	16
 
+struct poly1305_key {
+	u32 r[5];	/* key, base 2^26 */
+};
+
+struct poly1305_state {
+	u32 h[5];	/* accumulator, base 2^26 */
+};
+
 struct poly1305_desc_ctx {
 	/* key */
-	u32 r[5];
+	struct poly1305_key r;
 	/* finalize key */
 	u32 s[4];
 	/* accumulator */
-	u32 h[5];
+	struct poly1305_state h;
 	/* partial buffer */
 	u8 buf[POLY1305_BLOCK_SIZE];
 	/* bytes used in partial buffer */
-- 
cgit v1.2.3-59-g8ed1b


From 1b6fd3d5d18bbc1b1abf3b0cbc4b95a9a63d407b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:28 -0800
Subject: crypto: poly1305 - add Poly1305 core API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expose a low-level Poly1305 API which implements the
ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305 MAC
and supports block-aligned inputs only.

This is needed for Adiantum hashing, which builds an εA∆U hash function
from NH and a polynomial evaluation in GF(2^{130}-5); this polynomial
evaluation is identical to the one the Poly1305 MAC does.  However, the
crypto_shash Poly1305 API isn't very appropriate for this because its
calling convention assumes it is used as a MAC, with a 32-byte "one-time
key" provided for every digest.

But by design, in Adiantum hashing the performance of the polynomial
evaluation isn't nearly as critical as NH.  So it suffices to just have
some C helper functions.  Thus, this patch adds such functions.

Acked-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/poly1305_generic.c | 174 ++++++++++++++++++++++++++--------------------
 include/crypto/poly1305.h |  16 +++++
 2 files changed, 115 insertions(+), 75 deletions(-)

diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index a23173f351b7..2a06874204e8 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -38,7 +38,7 @@ int crypto_poly1305_init(struct shash_desc *desc)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	memset(dctx->h.h, 0, sizeof(dctx->h.h));
+	poly1305_core_init(&dctx->h);
 	dctx->buflen = 0;
 	dctx->rset = false;
 	dctx->sset = false;
@@ -47,23 +47,16 @@ int crypto_poly1305_init(struct shash_desc *desc)
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 
-static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	dctx->r.r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
-	dctx->r.r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
-	dctx->r.r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r.r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
-	dctx->r.r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
-}
-
-static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
-{
-	dctx->s[0] = get_unaligned_le32(key +  0);
-	dctx->s[1] = get_unaligned_le32(key +  4);
-	dctx->s[2] = get_unaligned_le32(key +  8);
-	dctx->s[3] = get_unaligned_le32(key + 12);
+	key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
+	key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
+	key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
+	key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
+	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
 }
+EXPORT_SYMBOL_GPL(poly1305_core_setkey);
 
 /*
  * Poly1305 requires a unique key for each tag, which implies that we can't set
@@ -75,13 +68,16 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 {
 	if (!dctx->sset) {
 		if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
-			poly1305_setrkey(dctx, src);
+			poly1305_core_setkey(&dctx->r, src);
 			src += POLY1305_BLOCK_SIZE;
 			srclen -= POLY1305_BLOCK_SIZE;
 			dctx->rset = true;
 		}
 		if (srclen >= POLY1305_BLOCK_SIZE) {
-			poly1305_setskey(dctx, src);
+			dctx->s[0] = get_unaligned_le32(src +  0);
+			dctx->s[1] = get_unaligned_le32(src +  4);
+			dctx->s[2] = get_unaligned_le32(src +  8);
+			dctx->s[3] = get_unaligned_le32(src + 12);
 			src += POLY1305_BLOCK_SIZE;
 			srclen -= POLY1305_BLOCK_SIZE;
 			dctx->sset = true;
@@ -91,41 +87,37 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
 
-static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
-				    const u8 *src, unsigned int srclen,
-				    u32 hibit)
+static void poly1305_blocks_internal(struct poly1305_state *state,
+				     const struct poly1305_key *key,
+				     const void *src, unsigned int nblocks,
+				     u32 hibit)
 {
 	u32 r0, r1, r2, r3, r4;
 	u32 s1, s2, s3, s4;
 	u32 h0, h1, h2, h3, h4;
 	u64 d0, d1, d2, d3, d4;
-	unsigned int datalen;
 
-	if (unlikely(!dctx->sset)) {
-		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
-		src += srclen - datalen;
-		srclen = datalen;
-	}
+	if (!nblocks)
+		return;
 
-	r0 = dctx->r.r[0];
-	r1 = dctx->r.r[1];
-	r2 = dctx->r.r[2];
-	r3 = dctx->r.r[3];
-	r4 = dctx->r.r[4];
+	r0 = key->r[0];
+	r1 = key->r[1];
+	r2 = key->r[2];
+	r3 = key->r[3];
+	r4 = key->r[4];
 
 	s1 = r1 * 5;
 	s2 = r2 * 5;
 	s3 = r3 * 5;
 	s4 = r4 * 5;
 
-	h0 = dctx->h.h[0];
-	h1 = dctx->h.h[1];
-	h2 = dctx->h.h[2];
-	h3 = dctx->h.h[3];
-	h4 = dctx->h.h[4];
-
-	while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
 
+	do {
 		/* h += m[i] */
 		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
 		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
@@ -154,16 +146,36 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
 
 		src += POLY1305_BLOCK_SIZE;
-		srclen -= POLY1305_BLOCK_SIZE;
-	}
+	} while (--nblocks);
 
-	dctx->h.h[0] = h0;
-	dctx->h.h[1] = h1;
-	dctx->h.h[2] = h2;
-	dctx->h.h[3] = h3;
-	dctx->h.h[4] = h4;
+	state->h[0] = h0;
+	state->h[1] = h1;
+	state->h[2] = h2;
+	state->h[3] = h3;
+	state->h[4] = h4;
+}
 
-	return srclen;
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_key *key,
+			  const void *src, unsigned int nblocks)
+{
+	poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+
+static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
+			    const u8 *src, unsigned int srclen, u32 hibit)
+{
+	unsigned int datalen;
+
+	if (unlikely(!dctx->sset)) {
+		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+		src += srclen - datalen;
+		srclen = datalen;
+	}
+
+	poly1305_blocks_internal(&dctx->h, &dctx->r,
+				 src, srclen / POLY1305_BLOCK_SIZE, hibit);
 }
 
 int crypto_poly1305_update(struct shash_desc *desc,
@@ -187,9 +199,9 @@ int crypto_poly1305_update(struct shash_desc *desc,
 	}
 
 	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-		bytes = poly1305_blocks(dctx, src, srclen, 1 << 24);
-		src += srclen - bytes;
-		srclen = bytes;
+		poly1305_blocks(dctx, src, srclen, 1 << 24);
+		src += srclen - (srclen % POLY1305_BLOCK_SIZE);
+		srclen %= POLY1305_BLOCK_SIZE;
 	}
 
 	if (unlikely(srclen)) {
@@ -201,30 +213,18 @@ int crypto_poly1305_update(struct shash_desc *desc,
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_update);
 
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+void poly1305_core_emit(const struct poly1305_state *state, void *dst)
 {
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 	u32 h0, h1, h2, h3, h4;
 	u32 g0, g1, g2, g3, g4;
 	u32 mask;
-	u64 f = 0;
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
 
 	/* fully carry h */
-	h0 = dctx->h.h[0];
-	h1 = dctx->h.h[1];
-	h2 = dctx->h.h[2];
-	h3 = dctx->h.h[3];
-	h4 = dctx->h.h[4];
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
 
 	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
 	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
@@ -254,16 +254,40 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 	h4 = (h4 & mask) | g4;
 
 	/* h = h % (2^128) */
-	h0 = (h0 >>  0) | (h1 << 26);
-	h1 = (h1 >>  6) | (h2 << 20);
-	h2 = (h2 >> 12) | (h3 << 14);
-	h3 = (h3 >> 18) | (h4 <<  8);
+	put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
+	put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
+	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
+	put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_emit);
+
+int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+	__le32 digest[4];
+	u64 f = 0;
+
+	if (unlikely(!dctx->sset))
+		return -ENOKEY;
+
+	if (unlikely(dctx->buflen)) {
+		dctx->buf[dctx->buflen++] = 1;
+		memset(dctx->buf + dctx->buflen, 0,
+		       POLY1305_BLOCK_SIZE - dctx->buflen);
+		poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+	}
+
+	poly1305_core_emit(&dctx->h, digest);
 
 	/* mac = (h + s) % (2^128) */
-	f = (f >> 32) + h0 + dctx->s[0]; put_unaligned_le32(f, dst +  0);
-	f = (f >> 32) + h1 + dctx->s[1]; put_unaligned_le32(f, dst +  4);
-	f = (f >> 32) + h2 + dctx->s[2]; put_unaligned_le32(f, dst +  8);
-	f = (f >> 32) + h3 + dctx->s[3]; put_unaligned_le32(f, dst + 12);
+	f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
+	put_unaligned_le32(f, dst + 0);
+	f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
+	put_unaligned_le32(f, dst + 4);
+	f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
+	put_unaligned_le32(f, dst + 8);
+	f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
+	put_unaligned_le32(f, dst + 12);
 
 	return 0;
 }
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index 493244c46664..34317ed2071e 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -38,6 +38,22 @@ struct poly1305_desc_ctx {
 	bool sset;
 };
 
+/*
+ * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
+ * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+ * ("s key") at the end.  They also only support block-aligned inputs.
+ */
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+static inline void poly1305_core_init(struct poly1305_state *state)
+{
+	memset(state->h, 0, sizeof(state->h));
+}
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_key *key,
+			  const void *src, unsigned int nblocks);
+void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+
+/* Crypto API helper functions for the Poly1305 MAC */
 int crypto_poly1305_init(struct shash_desc *desc);
 unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 					const u8 *src, unsigned int srclen);
-- 
cgit v1.2.3-59-g8ed1b


From 26609a21a9460145e37d90947ad957b358a05288 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:29 -0800
Subject: crypto: nhpoly1305 - add NHPoly1305 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a generic implementation of NHPoly1305, an ε-almost-∆-universal hash
function used in the Adiantum encryption mode.

CONFIG_NHPOLY1305 is not selectable by itself since there won't be any
real reason to enable it without also enabling Adiantum support.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig              |    5 +
 crypto/Makefile             |    1 +
 crypto/nhpoly1305.c         |  254 +++++++++
 crypto/testmgr.c            |    6 +
 crypto/testmgr.h            | 1240 ++++++++++++++++++++++++++++++++++++++++++-
 include/crypto/nhpoly1305.h |   74 +++
 6 files changed, 1576 insertions(+), 4 deletions(-)
 create mode 100644 crypto/nhpoly1305.c
 create mode 100644 include/crypto/nhpoly1305.h

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 4431c0db56b7..eaeb8a986b7d 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -496,6 +496,11 @@ config CRYPTO_KEYWRAP
 	  Support for key wrapping (NIST SP800-38F / RFC3394) without
 	  padding.
 
+config CRYPTO_NHPOLY1305
+	tristate
+	select CRYPTO_HASH
+	select CRYPTO_POLY1305
+
 comment "Hash modes"
 
 config CRYPTO_CMAC
diff --git a/crypto/Makefile b/crypto/Makefile
index 102e8525814f..c3310c85f09f 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_CRYPTO_LRW) += lrw.o
 obj-$(CONFIG_CRYPTO_XTS) += xts.o
 obj-$(CONFIG_CRYPTO_CTR) += ctr.o
 obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o
 obj-$(CONFIG_CRYPTO_GCM) += gcm.o
 obj-$(CONFIG_CRYPTO_CCM) += ccm.o
 obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c
new file mode 100644
index 000000000000..c8385853f699
--- /dev/null
+++ b/crypto/nhpoly1305.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ *
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * "NHPoly1305" is the main component of Adiantum hashing.
+ * Specifically, it is the calculation
+ *
+ *	H_M ← Poly1305_{K_M}(NH_{K_N}(pad_{128}(M)))
+ *
+ * from the procedure in section A.5 of the Adiantum paper [1].  It is an
+ * ε-almost-∆-universal (εA∆U) hash function for equal-length inputs over
+ * Z/(2^{128}Z), where the "∆" operation is addition.  It hashes 1024-byte
+ * chunks of the input with the NH hash function [2], reducing the input length
+ * by 32x.  The resulting NH digests are evaluated as a polynomial in
+ * GF(2^{130}-5), like in the Poly1305 MAC [3].  Note that the polynomial
+ * evaluation by itself would suffice to achieve the εA∆U property; NH is used
+ * for performance since it's over twice as fast as Poly1305.
+ *
+ * This is *not* a cryptographic hash function; do not use it as such!
+ *
+ * [1] Adiantum: length-preserving encryption for entry-level processors
+ *     (https://eprint.iacr.org/2018/720.pdf)
+ * [2] UMAC: Fast and Secure Message Authentication
+ *     (https://fastcrypto.org/umac/umac_proc.pdf)
+ * [3] The Poly1305-AES message-authentication code
+ *     (https://cr.yp.to/mac/poly1305-20050329.pdf)
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static void nh_generic(const u32 *key, const u8 *message, size_t message_len,
+		       __le64 hash[NH_NUM_PASSES])
+{
+	u64 sums[4] = { 0, 0, 0, 0 };
+
+	BUILD_BUG_ON(NH_PAIR_STRIDE != 2);
+	BUILD_BUG_ON(NH_NUM_PASSES != 4);
+
+	while (message_len) {
+		u32 m0 = get_unaligned_le32(message + 0);
+		u32 m1 = get_unaligned_le32(message + 4);
+		u32 m2 = get_unaligned_le32(message + 8);
+		u32 m3 = get_unaligned_le32(message + 12);
+
+		sums[0] += (u64)(u32)(m0 + key[ 0]) * (u32)(m2 + key[ 2]);
+		sums[1] += (u64)(u32)(m0 + key[ 4]) * (u32)(m2 + key[ 6]);
+		sums[2] += (u64)(u32)(m0 + key[ 8]) * (u32)(m2 + key[10]);
+		sums[3] += (u64)(u32)(m0 + key[12]) * (u32)(m2 + key[14]);
+		sums[0] += (u64)(u32)(m1 + key[ 1]) * (u32)(m3 + key[ 3]);
+		sums[1] += (u64)(u32)(m1 + key[ 5]) * (u32)(m3 + key[ 7]);
+		sums[2] += (u64)(u32)(m1 + key[ 9]) * (u32)(m3 + key[11]);
+		sums[3] += (u64)(u32)(m1 + key[13]) * (u32)(m3 + key[15]);
+		key += NH_MESSAGE_UNIT / sizeof(key[0]);
+		message += NH_MESSAGE_UNIT;
+		message_len -= NH_MESSAGE_UNIT;
+	}
+
+	hash[0] = cpu_to_le64(sums[0]);
+	hash[1] = cpu_to_le64(sums[1]);
+	hash[2] = cpu_to_le64(sums[2]);
+	hash[3] = cpu_to_le64(sums[3]);
+}
+
+/* Pass the next NH hash value through Poly1305 */
+static void process_nh_hash_value(struct nhpoly1305_state *state,
+				  const struct nhpoly1305_key *key)
+{
+	BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
+
+	poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
+			     NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
+}
+
+/*
+ * Feed the next portion of the source data, as a whole number of 16-byte
+ * "NH message units", through NH and Poly1305.  Each NH hash is taken over
+ * 1024 bytes, except possibly the final one which is taken over a multiple of
+ * 16 bytes up to 1024.  Also, in the case where data is passed in misaligned
+ * chunks, we combine partial hashes; the end result is the same either way.
+ */
+static void nhpoly1305_units(struct nhpoly1305_state *state,
+			     const struct nhpoly1305_key *key,
+			     const u8 *src, unsigned int srclen, nh_t nh_fn)
+{
+	do {
+		unsigned int bytes;
+
+		if (state->nh_remaining == 0) {
+			/* Starting a new NH message */
+			bytes = min_t(unsigned int, srclen, NH_MESSAGE_BYTES);
+			nh_fn(key->nh_key, src, bytes, state->nh_hash);
+			state->nh_remaining = NH_MESSAGE_BYTES - bytes;
+		} else {
+			/* Continuing a previous NH message */
+			__le64 tmp_hash[NH_NUM_PASSES];
+			unsigned int pos;
+			int i;
+
+			pos = NH_MESSAGE_BYTES - state->nh_remaining;
+			bytes = min(srclen, state->nh_remaining);
+			nh_fn(&key->nh_key[pos / 4], src, bytes, tmp_hash);
+			for (i = 0; i < NH_NUM_PASSES; i++)
+				le64_add_cpu(&state->nh_hash[i],
+					     le64_to_cpu(tmp_hash[i]));
+			state->nh_remaining -= bytes;
+		}
+		if (state->nh_remaining == 0)
+			process_nh_hash_value(state, key);
+		src += bytes;
+		srclen -= bytes;
+	} while (srclen);
+}
+
+int crypto_nhpoly1305_setkey(struct crypto_shash *tfm,
+			     const u8 *key, unsigned int keylen)
+{
+	struct nhpoly1305_key *ctx = crypto_shash_ctx(tfm);
+	int i;
+
+	if (keylen != NHPOLY1305_KEY_SIZE)
+		return -EINVAL;
+
+	poly1305_core_setkey(&ctx->poly_key, key);
+	key += POLY1305_BLOCK_SIZE;
+
+	for (i = 0; i < NH_KEY_WORDS; i++)
+		ctx->nh_key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_setkey);
+
+int crypto_nhpoly1305_init(struct shash_desc *desc)
+{
+	struct nhpoly1305_state *state = shash_desc_ctx(desc);
+
+	poly1305_core_init(&state->poly_state);
+	state->buflen = 0;
+	state->nh_remaining = 0;
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_init);
+
+int crypto_nhpoly1305_update_helper(struct shash_desc *desc,
+				    const u8 *src, unsigned int srclen,
+				    nh_t nh_fn)
+{
+	struct nhpoly1305_state *state = shash_desc_ctx(desc);
+	const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm);
+	unsigned int bytes;
+
+	if (state->buflen) {
+		bytes = min(srclen, (int)NH_MESSAGE_UNIT - state->buflen);
+		memcpy(&state->buffer[state->buflen], src, bytes);
+		state->buflen += bytes;
+		if (state->buflen < NH_MESSAGE_UNIT)
+			return 0;
+		nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT,
+				 nh_fn);
+		state->buflen = 0;
+		src += bytes;
+		srclen -= bytes;
+	}
+
+	if (srclen >= NH_MESSAGE_UNIT) {
+		bytes = round_down(srclen, NH_MESSAGE_UNIT);
+		nhpoly1305_units(state, key, src, bytes, nh_fn);
+		src += bytes;
+		srclen -= bytes;
+	}
+
+	if (srclen) {
+		memcpy(state->buffer, src, srclen);
+		state->buflen = srclen;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_update_helper);
+
+int crypto_nhpoly1305_update(struct shash_desc *desc,
+			     const u8 *src, unsigned int srclen)
+{
+	return crypto_nhpoly1305_update_helper(desc, src, srclen, nh_generic);
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_update);
+
+int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn)
+{
+	struct nhpoly1305_state *state = shash_desc_ctx(desc);
+	const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm);
+
+	if (state->buflen) {
+		memset(&state->buffer[state->buflen], 0,
+		       NH_MESSAGE_UNIT - state->buflen);
+		nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT,
+				 nh_fn);
+	}
+
+	if (state->nh_remaining)
+		process_nh_hash_value(state, key);
+
+	poly1305_core_emit(&state->poly_state, dst);
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
+
+int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst)
+{
+	return crypto_nhpoly1305_final_helper(desc, dst, nh_generic);
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_final);
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-generic",
+	.base.cra_priority	= 100,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= crypto_nhpoly1305_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-generic");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 6ff60c3745f1..665911c24786 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3311,6 +3311,12 @@ static const struct alg_test_desc alg_test_descs[] = {
 				.dec = __VECS(morus640_dec_tv_template),
 			}
 		}
+	}, {
+		.alg = "nhpoly1305",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(nhpoly1305_tv_template)
+		}
 	}, {
 		.alg = "ofb(aes)",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index a23dca2b11d0..50ea1f2705c7 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -27,7 +27,7 @@
 #define MAX_DIGEST_SIZE		64
 #define MAX_TAP			8
 
-#define MAX_KEYLEN		160
+#define MAX_KEYLEN		1088
 #define MAX_IVLEN		32
 
 struct hash_testvec {
@@ -35,10 +35,10 @@ struct hash_testvec {
 	const char *key;
 	const char *plaintext;
 	const char *digest;
-	unsigned char tap[MAX_TAP];
+	unsigned short tap[MAX_TAP];
+	unsigned short np;
 	unsigned short psize;
-	unsigned char np;
-	unsigned char ksize;
+	unsigned short ksize;
 };
 
 /*
@@ -5709,6 +5709,1238 @@ static const struct hash_testvec poly1305_tv_template[] = {
 	},
 };
 
+/* NHPoly1305 test vectors from https://github.com/google/adiantum */
+static const struct hash_testvec nhpoly1305_tv_template[] = {
+	{
+		.key	= "\xd2\x5d\x4c\xdd\x8d\x2b\x7f\x7a"
+			  "\xd9\xbe\x71\xec\xd1\x83\x52\xe3"
+			  "\xe1\xad\xd7\x5c\x0a\x75\x9d\xec"
+			  "\x1d\x13\x7e\x5d\x71\x07\xc9\xe4"
+			  "\x57\x2d\x44\x68\xcf\xd8\xd6\xc5"
+			  "\x39\x69\x7d\x32\x75\x51\x4f\x7e"
+			  "\xb2\x4c\xc6\x90\x51\x6e\xd9\xd6"
+			  "\xa5\x8b\x2d\xf1\x94\xf9\xf7\x5e"
+			  "\x2c\x84\x7b\x41\x0f\x88\x50\x89"
+			  "\x30\xd9\xa1\x38\x46\x6c\xc0\x4f"
+			  "\xe8\xdf\xdc\x66\xab\x24\x43\x41"
+			  "\x91\x55\x29\x65\x86\x28\x5e\x45"
+			  "\xd5\x2d\xb7\x80\x08\x9a\xc3\xd4"
+			  "\x9a\x77\x0a\xd4\xef\x3e\xe6\x3f"
+			  "\x6f\x2f\x9b\x3a\x7d\x12\x1e\x80"
+			  "\x6c\x44\xa2\x25\xe1\xf6\x60\xe9"
+			  "\x0d\xaf\xc5\x3c\xa5\x79\xae\x64"
+			  "\xbc\xa0\x39\xa3\x4d\x10\xe5\x4d"
+			  "\xd5\xe7\x89\x7a\x13\xee\x06\x78"
+			  "\xdc\xa4\xdc\x14\x27\xe6\x49\x38"
+			  "\xd0\xe0\x45\x25\x36\xc5\xf4\x79"
+			  "\x2e\x9a\x98\x04\xe4\x2b\x46\x52"
+			  "\x7c\x33\xca\xe2\x56\x51\x50\xe2"
+			  "\xa5\x9a\xae\x18\x6a\x13\xf8\xd2"
+			  "\x21\x31\x66\x02\xe2\xda\x8d\x7e"
+			  "\x41\x19\xb2\x61\xee\x48\x8f\xf1"
+			  "\x65\x24\x2e\x1e\x68\xce\x05\xd9"
+			  "\x2a\xcf\xa5\x3a\x57\xdd\x35\x91"
+			  "\x93\x01\xca\x95\xfc\x2b\x36\x04"
+			  "\xe6\x96\x97\x28\xf6\x31\xfe\xa3"
+			  "\x9d\xf6\x6a\x1e\x80\x8d\xdc\xec"
+			  "\xaf\x66\x11\x13\x02\x88\xd5\x27"
+			  "\x33\xb4\x1a\xcd\xa3\xf6\xde\x31"
+			  "\x8e\xc0\x0e\x6c\xd8\x5a\x97\x5e"
+			  "\xdd\xfd\x60\x69\x38\x46\x3f\x90"
+			  "\x5e\x97\xd3\x32\x76\xc7\x82\x49"
+			  "\xfe\xba\x06\x5f\x2f\xa2\xfd\xff"
+			  "\x80\x05\x40\xe4\x33\x03\xfb\x10"
+			  "\xc0\xde\x65\x8c\xc9\x8d\x3a\x9d"
+			  "\xb5\x7b\x36\x4b\xb5\x0c\xcf\x00"
+			  "\x9c\x87\xe4\x49\xad\x90\xda\x4a"
+			  "\xdd\xbd\xff\xe2\x32\x57\xd6\x78"
+			  "\x36\x39\x6c\xd3\x5b\x9b\x88\x59"
+			  "\x2d\xf0\x46\xe4\x13\x0e\x2b\x35"
+			  "\x0d\x0f\x73\x8a\x4f\x26\x84\x75"
+			  "\x88\x3c\xc5\x58\x66\x18\x1a\xb4"
+			  "\x64\x51\x34\x27\x1b\xa4\x11\xc9"
+			  "\x6d\x91\x8a\xfa\x32\x60\x9d\xd7"
+			  "\x87\xe5\xaa\x43\x72\xf8\xda\xd1"
+			  "\x48\x44\x13\x61\xdc\x8c\x76\x17"
+			  "\x0c\x85\x4e\xf3\xdd\xa2\x42\xd2"
+			  "\x74\xc1\x30\x1b\xeb\x35\x31\x29"
+			  "\x5b\xd7\x4c\x94\x46\x35\xa1\x23"
+			  "\x50\xf2\xa2\x8e\x7e\x4f\x23\x4f"
+			  "\x51\xff\xe2\xc9\xa3\x7d\x56\x8b"
+			  "\x41\xf2\xd0\xc5\x57\x7e\x59\xac"
+			  "\xbb\x65\xf3\xfe\xf7\x17\xef\x63"
+			  "\x7c\x6f\x23\xdd\x22\x8e\xed\x84"
+			  "\x0e\x3b\x09\xb3\xf3\xf4\x8f\xcd"
+			  "\x37\xa8\xe1\xa7\x30\xdb\xb1\xa2"
+			  "\x9c\xa2\xdf\x34\x17\x3e\x68\x44"
+			  "\xd0\xde\x03\x50\xd1\x48\x6b\x20"
+			  "\xe2\x63\x45\xa5\xea\x87\xc2\x42"
+			  "\x95\x03\x49\x05\xed\xe0\x90\x29"
+			  "\x1a\xb8\xcf\x9b\x43\xcf\x29\x7a"
+			  "\x63\x17\x41\x9f\xe0\xc9\x10\xfd"
+			  "\x2c\x56\x8c\x08\x55\xb4\xa9\x27"
+			  "\x0f\x23\xb1\x05\x6a\x12\x46\xc7"
+			  "\xe1\xfe\x28\x93\x93\xd7\x2f\xdc"
+			  "\x98\x30\xdb\x75\x8a\xbe\x97\x7a"
+			  "\x02\xfb\x8c\xba\xbe\x25\x09\xbe"
+			  "\xce\xcb\xa2\xef\x79\x4d\x0e\x9d"
+			  "\x1b\x9d\xb6\x39\x34\x38\xfa\x07"
+			  "\xec\xe8\xfc\x32\x85\x1d\xf7\x85"
+			  "\x63\xc3\x3c\xc0\x02\x75\xd7\x3f"
+			  "\xb2\x68\x60\x66\x65\x81\xc6\xb1"
+			  "\x42\x65\x4b\x4b\x28\xd7\xc7\xaa"
+			  "\x9b\xd2\xdc\x1b\x01\xe0\x26\x39"
+			  "\x01\xc1\x52\x14\xd1\x3f\xb7\xe6"
+			  "\x61\x41\xc7\x93\xd2\xa2\x67\xc6"
+			  "\xf7\x11\xb5\xf5\xea\xdd\x19\xfb"
+			  "\x4d\x21\x12\xd6\x7d\xf1\x10\xb0"
+			  "\x89\x07\xc7\x5a\x52\x73\x70\x2f"
+			  "\x32\xef\x65\x2b\x12\xb2\xf0\xf5"
+			  "\x20\xe0\x90\x59\x7e\x64\xf1\x4c"
+			  "\x41\xb3\xa5\x91\x08\xe6\x5e\x5f"
+			  "\x05\x56\x76\xb4\xb0\xcd\x70\x53"
+			  "\x10\x48\x9c\xff\xc2\x69\x55\x24"
+			  "\x87\xef\x84\xea\xfb\xa7\xbf\xa0"
+			  "\x91\x04\xad\x4f\x8b\x57\x54\x4b"
+			  "\xb6\xe9\xd1\xac\x37\x2f\x1d\x2e"
+			  "\xab\xa5\xa4\xe8\xff\xfb\xd9\x39"
+			  "\x2f\xb7\xac\xd1\xfe\x0b\x9a\x80"
+			  "\x0f\xb6\xf4\x36\x39\x90\x51\xe3"
+			  "\x0a\x2f\xb6\x45\x76\x89\xcd\x61"
+			  "\xfe\x48\x5f\x75\x1d\x13\x00\x62"
+			  "\x80\x24\x47\xe7\xbc\x37\xd7\xe3"
+			  "\x15\xe8\x68\x22\xaf\x80\x6f\x4b"
+			  "\xa8\x9f\x01\x10\x48\x14\xc3\x02"
+			  "\x52\xd2\xc7\x75\x9b\x52\x6d\x30"
+			  "\xac\x13\x85\xc8\xf7\xa3\x58\x4b"
+			  "\x49\xf7\x1c\x45\x55\x8c\x39\x9a"
+			  "\x99\x6d\x97\x27\x27\xe6\xab\xdd"
+			  "\x2c\x42\x1b\x35\xdd\x9d\x73\xbb"
+			  "\x6c\xf3\x64\xf1\xfb\xb9\xf7\xe6"
+			  "\x4a\x3c\xc0\x92\xc0\x2e\xb7\x1a"
+			  "\xbe\xab\xb3\x5a\xe5\xea\xb1\x48"
+			  "\x58\x13\x53\x90\xfd\xc3\x8e\x54"
+			  "\xf9\x18\x16\x73\xe8\xcb\x6d\x39"
+			  "\x0e\xd7\xe0\xfe\xb6\x9f\x43\x97"
+			  "\xe8\xd0\x85\x56\x83\x3e\x98\x68"
+			  "\x7f\xbd\x95\xa8\x9a\x61\x21\x8f"
+			  "\x06\x98\x34\xa6\xc8\xd6\x1d\xf3"
+			  "\x3d\x43\xa4\x9a\x8c\xe5\xd3\x5a"
+			  "\x32\xa2\x04\x22\xa4\x19\x1a\x46"
+			  "\x42\x7e\x4d\xe5\xe0\xe6\x0e\xca"
+			  "\xd5\x58\x9d\x2c\xaf\xda\x33\x5c"
+			  "\xb0\x79\x9e\xc9\xfc\xca\xf0\x2f"
+			  "\xa8\xb2\x77\xeb\x7a\xa2\xdd\x37"
+			  "\x35\x83\x07\xd6\x02\x1a\xb6\x6c"
+			  "\x24\xe2\x59\x08\x0e\xfd\x3e\x46"
+			  "\xec\x40\x93\xf4\x00\x26\x4f\x2a"
+			  "\xff\x47\x2f\xeb\x02\x92\x26\x5b"
+			  "\x53\x17\xc2\x8d\x2a\xc7\xa3\x1b"
+			  "\xcd\xbc\xa7\xe8\xd1\x76\xe3\x80"
+			  "\x21\xca\x5d\x3b\xe4\x9c\x8f\xa9"
+			  "\x5b\x7f\x29\x7f\x7c\xd8\xed\x6d"
+			  "\x8c\xb2\x86\x85\xe7\x77\xf2\x85"
+			  "\xab\x38\xa9\x9d\xc1\x4e\xc5\x64"
+			  "\x33\x73\x8b\x59\x03\xad\x05\xdf"
+			  "\x25\x98\x31\xde\xef\x13\xf1\x9b"
+			  "\x3c\x91\x9d\x7b\xb1\xfa\xe6\xbf"
+			  "\x5b\xed\xa5\x55\xe6\xea\x6c\x74"
+			  "\xf4\xb9\xe4\x45\x64\x72\x81\xc2"
+			  "\x4c\x28\xd4\xcd\xac\xe2\xde\xf9"
+			  "\xeb\x5c\xeb\x61\x60\x5a\xe5\x28",
+		.ksize	= 1088,
+		.plaintext	= "",
+		.psize	= 0,
+		.digest	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+	}, {
+		.key	= "\x29\x21\x43\xcb\xcb\x13\x07\xde"
+			  "\xbf\x48\xdf\x8a\x7f\xa2\x84\xde"
+			  "\x72\x23\x9d\xf5\xf0\x07\xf2\x4c"
+			  "\x20\x3a\x93\xb9\xcd\x5d\xfe\xcb"
+			  "\x99\x2c\x2b\x58\xc6\x50\x5f\x94"
+			  "\x56\xc3\x7c\x0d\x02\x3f\xb8\x5e"
+			  "\x7b\xc0\x6c\x51\x34\x76\xc0\x0e"
+			  "\xc6\x22\xc8\x9e\x92\xa0\x21\xc9"
+			  "\x85\x5c\x7c\xf8\xe2\x64\x47\xc9"
+			  "\xe4\xa2\x57\x93\xf8\xa2\x69\xcd"
+			  "\x62\x98\x99\xf4\xd7\x7b\x14\xb1"
+			  "\xd8\x05\xff\x04\x15\xc9\xe1\x6e"
+			  "\x9b\xe6\x50\x6b\x0b\x3f\x22\x1f"
+			  "\x08\xde\x0c\x5b\x08\x7e\xc6\x2f"
+			  "\x6c\xed\xd6\xb2\x15\xa4\xb3\xf9"
+			  "\xa7\x46\x38\x2a\xea\x69\xa5\xde"
+			  "\x02\xc3\x96\x89\x4d\x55\x3b\xed"
+			  "\x3d\x3a\x85\x77\xbf\x97\x45\x5c"
+			  "\x9e\x02\x69\xe2\x1b\x68\xbe\x96"
+			  "\xfb\x64\x6f\x0f\xf6\x06\x40\x67"
+			  "\xfa\x04\xe3\x55\xfa\xbe\xa4\x60"
+			  "\xef\x21\x66\x97\xe6\x9d\x5c\x1f"
+			  "\x62\x37\xaa\x31\xde\xe4\x9c\x28"
+			  "\x95\xe0\x22\x86\xf4\x4d\xf3\x07"
+			  "\xfd\x5f\x3a\x54\x2c\x51\x80\x71"
+			  "\xba\x78\x69\x5b\x65\xab\x1f\x81"
+			  "\xed\x3b\xff\x34\xa3\xfb\xbc\x73"
+			  "\x66\x7d\x13\x7f\xdf\x6e\xe2\xe2"
+			  "\xeb\x4f\x6c\xda\x7d\x33\x57\xd0"
+			  "\xd3\x7c\x95\x4f\x33\x58\x21\xc7"
+			  "\xc0\xe5\x6f\x42\x26\xc6\x1f\x5e"
+			  "\x85\x1b\x98\x9a\xa2\x1e\x55\x77"
+			  "\x23\xdf\x81\x5e\x79\x55\x05\xfc"
+			  "\xfb\xda\xee\xba\x5a\xba\xf7\x77"
+			  "\x7f\x0e\xd3\xe1\x37\xfe\x8d\x2b"
+			  "\xd5\x3f\xfb\xd0\xc0\x3c\x0b\x3f"
+			  "\xcf\x3c\x14\xcf\xfb\x46\x72\x4c"
+			  "\x1f\x39\xe2\xda\x03\x71\x6d\x23"
+			  "\xef\x93\xcd\x39\xd9\x37\x80\x4d"
+			  "\x65\x61\xd1\x2c\x03\xa9\x47\x72"
+			  "\x4d\x1e\x0e\x16\x33\x0f\x21\x17"
+			  "\xec\x92\xea\x6f\x37\x22\xa4\xd8"
+			  "\x03\x33\x9e\xd8\x03\x69\x9a\xe8"
+			  "\xb2\x57\xaf\x78\x99\x05\x12\xab"
+			  "\x48\x90\x80\xf0\x12\x9b\x20\x64"
+			  "\x7a\x1d\x47\x5f\xba\x3c\xf9\xc3"
+			  "\x0a\x0d\x8d\xa1\xf9\x1b\x82\x13"
+			  "\x3e\x0d\xec\x0a\x83\xc0\x65\xe1"
+			  "\xe9\x95\xff\x97\xd6\xf2\xe4\xd5"
+			  "\x86\xc0\x1f\x29\x27\x63\xd7\xde"
+			  "\xb7\x0a\x07\x99\x04\x2d\xa3\x89"
+			  "\xa2\x43\xcf\xf3\xe1\x43\xac\x4a"
+			  "\x06\x97\xd0\x05\x4f\x87\xfa\xf9"
+			  "\x9b\xbf\x52\x70\xbd\xbc\x6c\xf3"
+			  "\x03\x13\x60\x41\x28\x09\xec\xcc"
+			  "\xb1\x1a\xec\xd6\xfb\x6f\x2a\x89"
+			  "\x5d\x0b\x53\x9c\x59\xc1\x84\x21"
+			  "\x33\x51\x47\x19\x31\x9c\xd4\x0a"
+			  "\x4d\x04\xec\x50\x90\x61\xbd\xbc"
+			  "\x7e\xc8\xd9\x6c\x98\x1d\x45\x41"
+			  "\x17\x5e\x97\x1c\xc5\xa8\xe8\xea"
+			  "\x46\x58\x53\xf7\x17\xd5\xad\x11"
+			  "\xc8\x54\xf5\x7a\x33\x90\xf5\x19"
+			  "\xba\x36\xb4\xfc\x52\xa5\x72\x3d"
+			  "\x14\xbb\x55\xa7\xe9\xe3\x12\xf7"
+			  "\x1c\x30\xa2\x82\x03\xbf\x53\x91"
+			  "\x2e\x60\x41\x9f\x5b\x69\x39\xf6"
+			  "\x4d\xc8\xf8\x46\x7a\x7f\xa4\x98"
+			  "\x36\xff\x06\xcb\xca\xe7\x33\xf2"
+			  "\xc0\x4a\xf4\x3c\x14\x44\x5f\x6b"
+			  "\x75\xef\x02\x36\x75\x08\x14\xfd"
+			  "\x10\x8e\xa5\x58\xd0\x30\x46\x49"
+			  "\xaf\x3a\xf8\x40\x3d\x35\xdb\x84"
+			  "\x11\x2e\x97\x6a\xb7\x87\x7f\xad"
+			  "\xf1\xfa\xa5\x63\x60\xd8\x5e\xbf"
+			  "\x41\x78\x49\xcf\x77\xbb\x56\xbb"
+			  "\x7d\x01\x67\x05\x22\xc8\x8f\x41"
+			  "\xba\x81\xd2\xca\x2c\x38\xac\x76"
+			  "\x06\xc1\x1a\xc2\xce\xac\x90\x67"
+			  "\x57\x3e\x20\x12\x5b\xd9\x97\x58"
+			  "\x65\x05\xb7\x04\x61\x7e\xd8\x3a"
+			  "\xbf\x55\x3b\x13\xe9\x34\x5a\x37"
+			  "\x36\xcb\x94\x45\xc5\x32\xb3\xa0"
+			  "\x0c\x3e\x49\xc5\xd3\xed\xa7\xf0"
+			  "\x1c\x69\xcc\xea\xcc\x83\xc9\x16"
+			  "\x95\x72\x4b\xf4\x89\xd5\xb9\x10"
+			  "\xf6\x2d\x60\x15\xea\x3c\x06\x66"
+			  "\x9f\x82\xad\x17\xce\xd2\xa4\x48"
+			  "\x7c\x65\xd9\xf8\x02\x4d\x9b\x4c"
+			  "\x89\x06\x3a\x34\x85\x48\x89\x86"
+			  "\xf9\x24\xa9\x54\x72\xdb\x44\x95"
+			  "\xc7\x44\x1c\x19\x11\x4c\x04\xdc"
+			  "\x13\xb9\x67\xc8\xc3\x3a\x6a\x50"
+			  "\xfa\xd1\xfb\xe1\x88\xb6\xf1\xa3"
+			  "\xc5\x3b\xdc\x38\x45\x16\x26\x02"
+			  "\x3b\xb8\x8f\x8b\x58\x7d\x23\x04"
+			  "\x50\x6b\x81\x9f\xae\x66\xac\x6f"
+			  "\xcf\x2a\x9d\xf1\xfd\x1d\x57\x07"
+			  "\xbe\x58\xeb\x77\x0c\xe3\xc2\x19"
+			  "\x14\x74\x1b\x51\x1c\x4f\x41\xf3"
+			  "\x32\x89\xb3\xe7\xde\x62\xf6\x5f"
+			  "\xc7\x6a\x4a\x2a\x5b\x0f\x5f\x87"
+			  "\x9c\x08\xb9\x02\x88\xc8\x29\xb7"
+			  "\x94\x52\xfa\x52\xfe\xaa\x50\x10"
+			  "\xba\x48\x75\x5e\x11\x1b\xe6\x39"
+			  "\xd7\x82\x2c\x87\xf1\x1e\xa4\x38"
+			  "\x72\x3e\x51\xe7\xd8\x3e\x5b\x7b"
+			  "\x31\x16\x89\xba\xd6\xad\x18\x5e"
+			  "\xba\xf8\x12\xb3\xf4\x6c\x47\x30"
+			  "\xc0\x38\x58\xb3\x10\x8d\x58\x5d"
+			  "\xb4\xfb\x19\x7e\x41\xc3\x66\xb8"
+			  "\xd6\x72\x84\xe1\x1a\xc2\x71\x4c"
+			  "\x0d\x4a\x21\x7a\xab\xa2\xc0\x36"
+			  "\x15\xc5\xe9\x46\xd7\x29\x17\x76"
+			  "\x5e\x47\x36\x7f\x72\x05\xa7\xcc"
+			  "\x36\x63\xf9\x47\x7d\xe6\x07\x3c"
+			  "\x8b\x79\x1d\x96\x61\x8d\x90\x65"
+			  "\x7c\xf5\xeb\x4e\x6e\x09\x59\x6d"
+			  "\x62\x50\x1b\x0f\xe0\xdc\x78\xf2"
+			  "\x5b\x83\x1a\xa1\x11\x75\xfd\x18"
+			  "\xd7\xe2\x8d\x65\x14\x21\xce\xbe"
+			  "\xb5\x87\xe3\x0a\xda\x24\x0a\x64"
+			  "\xa9\x9f\x03\x8d\x46\x5d\x24\x1a"
+			  "\x8a\x0c\x42\x01\xca\xb1\x5f\x7c"
+			  "\xa5\xac\x32\x4a\xb8\x07\x91\x18"
+			  "\x6f\xb0\x71\x3c\xc9\xb1\xa8\xf8"
+			  "\x5f\x69\xa5\xa1\xca\x9e\x7a\xaa"
+			  "\xac\xe9\xc7\x47\x41\x75\x25\xc3"
+			  "\x73\xe2\x0b\xdd\x6d\x52\x71\xbe"
+			  "\xc5\xdc\xb4\xe7\x01\x26\x53\x77"
+			  "\x86\x90\x85\x68\x6b\x7b\x03\x53"
+			  "\xda\x52\x52\x51\x68\xc8\xf3\xec"
+			  "\x6c\xd5\x03\x7a\xa3\x0e\xb4\x02"
+			  "\x5f\x1a\xab\xee\xca\x67\x29\x7b"
+			  "\xbd\x96\x59\xb3\x8b\x32\x7a\x92"
+			  "\x9f\xd8\x25\x2b\xdf\xc0\x4c\xda",
+		.ksize	= 1088,
+		.plaintext	= "\xbc\xda\x81\xa8\x78\x79\x1c\xbf"
+			  "\x77\x53\xba\x4c\x30\x5b\xb8\x33",
+		.psize	= 16,
+		.digest	= "\x04\xbf\x7f\x6a\xce\x72\xea\x6a"
+			  "\x79\xdb\xb0\xc9\x60\xf6\x12\xcc",
+		.np	= 6,
+		.tap	= { 4, 4, 1, 1, 1, 5 },
+	}, {
+		.key	= "\x65\x4d\xe3\xf8\xd2\x4c\xac\x28"
+			  "\x68\xf5\xb3\x81\x71\x4b\xa1\xfa"
+			  "\x04\x0e\xd3\x81\x36\xbe\x0c\x81"
+			  "\x5e\xaf\xbc\x3a\xa4\xc0\x8e\x8b"
+			  "\x55\x63\xd3\x52\x97\x88\xd6\x19"
+			  "\xbc\x96\xdf\x49\xff\x04\x63\xf5"
+			  "\x0c\x11\x13\xaa\x9e\x1f\x5a\xf7"
+			  "\xdd\xbd\x37\x80\xc3\xd0\xbe\xa7"
+			  "\x05\xc8\x3c\x98\x1e\x05\x3c\x84"
+			  "\x39\x61\xc4\xed\xed\x71\x1b\xc4"
+			  "\x74\x45\x2c\xa1\x56\x70\x97\xfd"
+			  "\x44\x18\x07\x7d\xca\x60\x1f\x73"
+			  "\x3b\x6d\x21\xcb\x61\x87\x70\x25"
+			  "\x46\x21\xf1\x1f\x21\x91\x31\x2d"
+			  "\x5d\xcc\xb7\xd1\x84\x3e\x3d\xdb"
+			  "\x03\x53\x2a\x82\xa6\x9a\x95\xbc"
+			  "\x1a\x1e\x0a\x5e\x07\x43\xab\x43"
+			  "\xaf\x92\x82\x06\x91\x04\x09\xf4"
+			  "\x17\x0a\x9a\x2c\x54\xdb\xb8\xf4"
+			  "\xd0\xf0\x10\x66\x24\x8d\xcd\xda"
+			  "\xfe\x0e\x45\x9d\x6f\xc4\x4e\xf4"
+			  "\x96\xaf\x13\xdc\xa9\xd4\x8c\xc4"
+			  "\xc8\x57\x39\x3c\xc2\xd3\x0a\x76"
+			  "\x4a\x1f\x75\x83\x44\xc7\xd1\x39"
+			  "\xd8\xb5\x41\xba\x73\x87\xfa\x96"
+			  "\xc7\x18\x53\xfb\x9b\xda\xa0\x97"
+			  "\x1d\xee\x60\x85\x9e\x14\xc3\xce"
+			  "\xc4\x05\x29\x3b\x95\x30\xa3\xd1"
+			  "\x9f\x82\x6a\x04\xf5\xa7\x75\x57"
+			  "\x82\x04\xfe\x71\x51\x71\xb1\x49"
+			  "\x50\xf8\xe0\x96\xf1\xfa\xa8\x88"
+			  "\x3f\xa0\x86\x20\xd4\x60\x79\x59"
+			  "\x17\x2d\xd1\x09\xf4\xec\x05\x57"
+			  "\xcf\x62\x7e\x0e\x7e\x60\x78\xe6"
+			  "\x08\x60\x29\xd8\xd5\x08\x1a\x24"
+			  "\xc4\x6c\x24\xe7\x92\x08\x3d\x8a"
+			  "\x98\x7a\xcf\x99\x0a\x65\x0e\xdc"
+			  "\x8c\x8a\xbe\x92\x82\x91\xcc\x62"
+			  "\x30\xb6\xf4\x3f\xc6\x8a\x7f\x12"
+			  "\x4a\x8a\x49\xfa\x3f\x5c\xd4\x5a"
+			  "\xa6\x82\xa3\xe6\xaa\x34\x76\xb2"
+			  "\xab\x0a\x30\xef\x6c\x77\x58\x3f"
+			  "\x05\x6b\xcc\x5c\xae\xdc\xd7\xb9"
+			  "\x51\x7e\x8d\x32\x5b\x24\x25\xbe"
+			  "\x2b\x24\x01\xcf\x80\xda\x16\xd8"
+			  "\x90\x72\x2c\xad\x34\x8d\x0c\x74"
+			  "\x02\xcb\xfd\xcf\x6e\xef\x97\xb5"
+			  "\x4c\xf2\x68\xca\xde\x43\x9e\x8a"
+			  "\xc5\x5f\x31\x7f\x14\x71\x38\xec"
+			  "\xbd\x98\xe5\x71\xc4\xb5\xdb\xef"
+			  "\x59\xd2\xca\xc0\xc1\x86\x75\x01"
+			  "\xd4\x15\x0d\x6f\xa4\xf7\x7b\x37"
+			  "\x47\xda\x18\x93\x63\xda\xbe\x9e"
+			  "\x07\xfb\xb2\x83\xd5\xc4\x34\x55"
+			  "\xee\x73\xa1\x42\x96\xf9\x66\x41"
+			  "\xa4\xcc\xd2\x93\x6e\xe1\x0a\xbb"
+			  "\xd2\xdd\x18\x23\xe6\x6b\x98\x0b"
+			  "\x8a\x83\x59\x2c\xc3\xa6\x59\x5b"
+			  "\x01\x22\x59\xf7\xdc\xb0\x87\x7e"
+			  "\xdb\x7d\xf4\x71\x41\xab\xbd\xee"
+			  "\x79\xbe\x3c\x01\x76\x0b\x2d\x0a"
+			  "\x42\xc9\x77\x8c\xbb\x54\x95\x60"
+			  "\x43\x2e\xe0\x17\x52\xbd\x90\xc9"
+			  "\xc2\x2c\xdd\x90\x24\x22\x76\x40"
+			  "\x5c\xb9\x41\xc9\xa1\xd5\xbd\xe3"
+			  "\x44\xe0\xa4\xab\xcc\xb8\xe2\x32"
+			  "\x02\x15\x04\x1f\x8c\xec\x5d\x14"
+			  "\xac\x18\xaa\xef\x6e\x33\x19\x6e"
+			  "\xde\xfe\x19\xdb\xeb\x61\xca\x18"
+			  "\xad\xd8\x3d\xbf\x09\x11\xc7\xa5"
+			  "\x86\x0b\x0f\xe5\x3e\xde\xe8\xd9"
+			  "\x0a\x69\x9e\x4c\x20\xff\xf9\xc5"
+			  "\xfa\xf8\xf3\x7f\xa5\x01\x4b\x5e"
+			  "\x0f\xf0\x3b\x68\xf0\x46\x8c\x2a"
+			  "\x7a\xc1\x8f\xa0\xfe\x6a\x5b\x44"
+			  "\x70\x5c\xcc\x92\x2c\x6f\x0f\xbd"
+			  "\x25\x3e\xb7\x8e\x73\x58\xda\xc9"
+			  "\xa5\xaa\x9e\xf3\x9b\xfd\x37\x3e"
+			  "\xe2\x88\xa4\x7b\xc8\x5c\xa8\x93"
+			  "\x0e\xe7\x9a\x9c\x2e\x95\x18\x9f"
+			  "\xc8\x45\x0c\x88\x9e\x53\x4f\x3a"
+			  "\x76\xc1\x35\xfa\x17\xd8\xac\xa0"
+			  "\x0c\x2d\x47\x2e\x4f\x69\x9b\xf7"
+			  "\xd0\xb6\x96\x0c\x19\xb3\x08\x01"
+			  "\x65\x7a\x1f\xc7\x31\x86\xdb\xc8"
+			  "\xc1\x99\x8f\xf8\x08\x4a\x9d\x23"
+			  "\x22\xa8\xcf\x27\x01\x01\x88\x93"
+			  "\x9c\x86\x45\xbd\xe0\x51\xca\x52"
+			  "\x84\xba\xfe\x03\xf7\xda\xc5\xce"
+			  "\x3e\x77\x75\x86\xaf\x84\xc8\x05"
+			  "\x44\x01\x0f\x02\xf3\x58\xb0\x06"
+			  "\x5a\xd7\x12\x30\x8d\xdf\x1f\x1f"
+			  "\x0a\xe6\xd2\xea\xf6\x3a\x7a\x99"
+			  "\x63\xe8\xd2\xc1\x4a\x45\x8b\x40"
+			  "\x4d\x0a\xa9\x76\x92\xb3\xda\x87"
+			  "\x36\x33\xf0\x78\xc3\x2f\x5f\x02"
+			  "\x1a\x6a\x2c\x32\xcd\x76\xbf\xbd"
+			  "\x5a\x26\x20\x28\x8c\x8c\xbc\x52"
+			  "\x3d\x0a\xc9\xcb\xab\xa4\x21\xb0"
+			  "\x54\x40\x81\x44\xc7\xd6\x1c\x11"
+			  "\x44\xc6\x02\x92\x14\x5a\xbf\x1a"
+			  "\x09\x8a\x18\xad\xcd\x64\x3d\x53"
+			  "\x4a\xb6\xa5\x1b\x57\x0e\xef\xe0"
+			  "\x8c\x44\x5f\x7d\xbd\x6c\xfd\x60"
+			  "\xae\x02\x24\xb6\x99\xdd\x8c\xaf"
+			  "\x59\x39\x75\x3c\xd1\x54\x7b\x86"
+			  "\xcc\x99\xd9\x28\x0c\xb0\x94\x62"
+			  "\xf9\x51\xd1\x19\x96\x2d\x66\xf5"
+			  "\x55\xcf\x9e\x59\xe2\x6b\x2c\x08"
+			  "\xc0\x54\x48\x24\x45\xc3\x8c\x73"
+			  "\xea\x27\x6e\x66\x7d\x1d\x0e\x6e"
+			  "\x13\xe8\x56\x65\x3a\xb0\x81\x5c"
+			  "\xf0\xe8\xd8\x00\x6b\xcd\x8f\xad"
+			  "\xdd\x53\xf3\xa4\x6c\x43\xd6\x31"
+			  "\xaf\xd2\x76\x1e\x91\x12\xdb\x3c"
+			  "\x8c\xc2\x81\xf0\x49\xdb\xe2\x6b"
+			  "\x76\x62\x0a\x04\xe4\xaa\x8a\x7c"
+			  "\x08\x0b\x5d\xd0\xee\x1d\xfb\xc4"
+			  "\x02\x75\x42\xd6\xba\xa7\x22\xa8"
+			  "\x47\x29\xb7\x85\x6d\x93\x3a\xdb"
+			  "\x00\x53\x0b\xa2\xeb\xf8\xfe\x01"
+			  "\x6f\x8a\x31\xd6\x17\x05\x6f\x67"
+			  "\x88\x95\x32\xfe\x4f\xa6\x4b\xf8"
+			  "\x03\xe4\xcd\x9a\x18\xe8\x4e\x2d"
+			  "\xf7\x97\x9a\x0c\x7d\x9f\x7e\x44"
+			  "\x69\x51\xe0\x32\x6b\x62\x86\x8f"
+			  "\xa6\x8e\x0b\x21\x96\xe5\xaf\x77"
+			  "\xc0\x83\xdf\xa5\x0e\xd0\xa1\x04"
+			  "\xaf\xc1\x10\xcb\x5a\x40\xe4\xe3"
+			  "\x38\x7e\x07\xe8\x4d\xfa\xed\xc5"
+			  "\xf0\x37\xdf\xbb\x8a\xcf\x3d\xdc"
+			  "\x61\xd2\xc6\x2b\xff\x07\xc9\x2f"
+			  "\x0c\x2d\x5c\x07\xa8\x35\x6a\xfc"
+			  "\xae\x09\x03\x45\x74\x51\x4d\xc4"
+			  "\xb8\x23\x87\x4a\x99\x27\x20\x87"
+			  "\x62\x44\x0a\x4a\xce\x78\x47\x22",
+		.ksize	= 1088,
+		.plaintext	= "\x8e\xb0\x4c\xde\x9c\x4a\x04\x5a"
+			  "\xf6\xa9\x7f\x45\x25\xa5\x7b\x3a"
+			  "\xbc\x4d\x73\x39\x81\xb5\xbd\x3d"
+			  "\x21\x6f\xd7\x37\x50\x3c\x7b\x28"
+			  "\xd1\x03\x3a\x17\xed\x7b\x7c\x2a"
+			  "\x16\xbc\xdf\x19\x89\x52\x71\x31"
+			  "\xb6\xc0\xfd\xb5\xd3\xba\x96\x99"
+			  "\xb6\x34\x0b\xd0\x99\x93\xfc\x1a"
+			  "\x01\x3c\x85\xc6\x9b\x78\x5c\x8b"
+			  "\xfe\xae\xd2\xbf\xb2\x6f\xf9\xed"
+			  "\xc8\x25\x17\xfe\x10\x3b\x7d\xda"
+			  "\xf4\x8d\x35\x4b\x7c\x7b\x82\xe7"
+			  "\xc2\xb3\xee\x60\x4a\x03\x86\xc9"
+			  "\x4e\xb5\xc4\xbe\xd2\xbd\x66\xf1"
+			  "\x13\xf1\x09\xab\x5d\xca\x63\x1f"
+			  "\xfc\xfb\x57\x2a\xfc\xca\x66\xd8"
+			  "\x77\x84\x38\x23\x1d\xac\xd3\xb3"
+			  "\x7a\xad\x4c\x70\xfa\x9c\xc9\x61"
+			  "\xa6\x1b\xba\x33\x4b\x4e\x33\xec"
+			  "\xa0\xa1\x64\x39\x40\x05\x1c\xc2"
+			  "\x3f\x49\x9d\xae\xf2\xc5\xf2\xc5"
+			  "\xfe\xe8\xf4\xc2\xf9\x96\x2d\x28"
+			  "\x92\x30\x44\xbc\xd2\x7f\xe1\x6e"
+			  "\x62\x02\x8f\x3d\x1c\x80\xda\x0e"
+			  "\x6a\x90\x7e\x75\xff\xec\x3e\xc4"
+			  "\xcd\x16\x34\x3b\x05\x6d\x4d\x20"
+			  "\x1c\x7b\xf5\x57\x4f\xfa\x3d\xac"
+			  "\xd0\x13\x55\xe8\xb3\xe1\x1b\x78"
+			  "\x30\xe6\x9f\x84\xd4\x69\xd1\x08"
+			  "\x12\x77\xa7\x4a\xbd\xc0\xf2\xd2"
+			  "\x78\xdd\xa3\x81\x12\xcb\x6c\x14"
+			  "\x90\x61\xe2\x84\xc6\x2b\x16\xcc"
+			  "\x40\x99\x50\x88\x01\x09\x64\x4f"
+			  "\x0a\x80\xbe\x61\xae\x46\xc9\x0a"
+			  "\x5d\xe0\xfb\x72\x7a\x1a\xdd\x61"
+			  "\x63\x20\x05\xa0\x4a\xf0\x60\x69"
+			  "\x7f\x92\xbc\xbf\x4e\x39\x4d\xdd"
+			  "\x74\xd1\xb7\xc0\x5a\x34\xb7\xae"
+			  "\x76\x65\x2e\xbc\x36\xb9\x04\x95"
+			  "\x42\xe9\x6f\xca\x78\xb3\x72\x07"
+			  "\xa3\xba\x02\x94\x67\x4c\xb1\xd7"
+			  "\xe9\x30\x0d\xf0\x3b\xb8\x10\x6d"
+			  "\xea\x2b\x21\xbf\x74\x59\x82\x97"
+			  "\x85\xaa\xf1\xd7\x54\x39\xeb\x05"
+			  "\xbd\xf3\x40\xa0\x97\xe6\x74\xfe"
+			  "\xb4\x82\x5b\xb1\x36\xcb\xe8\x0d"
+			  "\xce\x14\xd9\xdf\xf1\x94\x22\xcd"
+			  "\xd6\x00\xba\x04\x4c\x05\x0c\xc0"
+			  "\xd1\x5a\xeb\x52\xd5\xa8\x8e\xc8"
+			  "\x97\xa1\xaa\xc1\xea\xc1\xbe\x7c"
+			  "\x36\xb3\x36\xa0\xc6\x76\x66\xc5"
+			  "\xe2\xaf\xd6\x5c\xe2\xdb\x2c\xb3"
+			  "\x6c\xb9\x99\x7f\xff\x9f\x03\x24"
+			  "\xe1\x51\x44\x66\xd8\x0c\x5d\x7f"
+			  "\x5c\x85\x22\x2a\xcf\x6d\x79\x28"
+			  "\xab\x98\x01\x72\xfe\x80\x87\x5f"
+			  "\x46\xba\xef\x81\x24\xee\xbf\xb0"
+			  "\x24\x74\xa3\x65\x97\x12\xc4\xaf"
+			  "\x8b\xa0\x39\xda\x8a\x7e\x74\x6e"
+			  "\x1b\x42\xb4\x44\x37\xfc\x59\xfd"
+			  "\x86\xed\xfb\x8c\x66\x33\xda\x63"
+			  "\x75\xeb\xe1\xa4\x85\x4f\x50\x8f"
+			  "\x83\x66\x0d\xd3\x37\xfa\xe6\x9c"
+			  "\x4f\x30\x87\x35\x18\xe3\x0b\xb7"
+			  "\x6e\x64\x54\xcd\x70\xb3\xde\x54"
+			  "\xb7\x1d\xe6\x4c\x4d\x55\x12\x12"
+			  "\xaf\x5f\x7f\x5e\xee\x9d\xe8\x8e"
+			  "\x32\x9d\x4e\x75\xeb\xc6\xdd\xaa"
+			  "\x48\x82\xa4\x3f\x3c\xd7\xd3\xa8"
+			  "\x63\x9e\x64\xfe\xe3\x97\x00\x62"
+			  "\xe5\x40\x5d\xc3\xad\x72\xe1\x28"
+			  "\x18\x50\xb7\x75\xef\xcd\x23\xbf"
+			  "\x3f\xc0\x51\x36\xf8\x41\xc3\x08"
+			  "\xcb\xf1\x8d\x38\x34\xbd\x48\x45"
+			  "\x75\xed\xbc\x65\x7b\xb5\x0c\x9b"
+			  "\xd7\x67\x7d\x27\xb4\xc4\x80\xd7"
+			  "\xa9\xb9\xc7\x4a\x97\xaa\xda\xc8"
+			  "\x3c\x74\xcf\x36\x8f\xe4\x41\xe3"
+			  "\xd4\xd3\x26\xa7\xf3\x23\x9d\x8f"
+			  "\x6c\x20\x05\x32\x3e\xe0\xc3\xc8"
+			  "\x56\x3f\xa7\x09\xb7\xfb\xc7\xf7"
+			  "\xbe\x2a\xdd\x0f\x06\x7b\x0d\xdd"
+			  "\xb0\xb4\x86\x17\xfd\xb9\x04\xe5"
+			  "\xc0\x64\x5d\xad\x2a\x36\x38\xdb"
+			  "\x24\xaf\x5b\xff\xca\xf9\x41\xe8"
+			  "\xf9\x2f\x1e\x5e\xf9\xf5\xd5\xf2"
+			  "\xb2\x88\xca\xc9\xa1\x31\xe2\xe8"
+			  "\x10\x95\x65\xbf\xf1\x11\x61\x7a"
+			  "\x30\x1a\x54\x90\xea\xd2\x30\xf6"
+			  "\xa5\xad\x60\xf9\x4d\x84\x21\x1b"
+			  "\xe4\x42\x22\xc8\x12\x4b\xb0\x58"
+			  "\x3e\x9c\x2d\x32\x95\x0a\x8e\xb0"
+			  "\x0a\x7e\x77\x2f\xe8\x97\x31\x6a"
+			  "\xf5\x59\xb4\x26\xe6\x37\x12\xc9"
+			  "\xcb\xa0\x58\x33\x6f\xd5\x55\x55"
+			  "\x3c\xa1\x33\xb1\x0b\x7e\x2e\xb4"
+			  "\x43\x2a\x84\x39\xf0\x9c\xf4\x69"
+			  "\x4f\x1e\x79\xa6\x15\x1b\x87\xbb"
+			  "\xdb\x9b\xe0\xf1\x0b\xba\xe3\x6e"
+			  "\xcc\x2f\x49\x19\x22\x29\xfc\x71"
+			  "\xbb\x77\x38\x18\x61\xaf\x85\x76"
+			  "\xeb\xd1\x09\xcc\x86\x04\x20\x9a"
+			  "\x66\x53\x2f\x44\x8b\xc6\xa3\xd2"
+			  "\x5f\xc7\x79\x82\x66\xa8\x6e\x75"
+			  "\x7d\x94\xd1\x86\x75\x0f\xa5\x4f"
+			  "\x3c\x7a\x33\xce\xd1\x6e\x9d\x7b"
+			  "\x1f\x91\x37\xb8\x37\x80\xfb\xe0"
+			  "\x52\x26\xd0\x9a\xd4\x48\x02\x41"
+			  "\x05\xe3\x5a\x94\xf1\x65\x61\x19"
+			  "\xb8\x88\x4e\x2b\xea\xba\x8b\x58"
+			  "\x8b\x42\x01\x00\xa8\xfe\x00\x5c"
+			  "\xfe\x1c\xee\x31\x15\x69\xfa\xb3"
+			  "\x9b\x5f\x22\x8e\x0d\x2c\xe3\xa5"
+			  "\x21\xb9\x99\x8a\x8e\x94\x5a\xef"
+			  "\x13\x3e\x99\x96\x79\x6e\xd5\x42"
+			  "\x36\x03\xa9\xe2\xca\x65\x4e\x8a"
+			  "\x8a\x30\xd2\x7d\x74\xe7\xf0\xaa"
+			  "\x23\x26\xdd\xcb\x82\x39\xfc\x9d"
+			  "\x51\x76\x21\x80\xa2\xbe\x93\x03"
+			  "\x47\xb0\xc1\xb6\xdc\x63\xfd\x9f"
+			  "\xca\x9d\xa5\xca\x27\x85\xe2\xd8"
+			  "\x15\x5b\x7e\x14\x7a\xc4\x89\xcc"
+			  "\x74\x14\x4b\x46\xd2\xce\xac\x39"
+			  "\x6b\x6a\x5a\xa4\x0e\xe3\x7b\x15"
+			  "\x94\x4b\x0f\x74\xcb\x0c\x7f\xa9"
+			  "\xbe\x09\x39\xa3\xdd\x56\x5c\xc7"
+			  "\x99\x56\x65\x39\xf4\x0b\x7d\x87"
+			  "\xec\xaa\xe3\x4d\x22\x65\x39\x4e",
+		.psize	= 1024,
+		.digest	= "\x64\x3a\xbc\xc3\x3f\x74\x40\x51"
+			  "\x6e\x56\x01\x1a\x51\xec\x36\xde",
+		.np	= 8,
+		.tap	= { 64, 203, 267, 28, 263, 62, 54, 83 },
+	}, {
+		.key	= "\x1b\x82\x2e\x1b\x17\x23\xb9\x6d"
+			  "\xdc\x9c\xda\x99\x07\xe3\x5f\xd8"
+			  "\xd2\xf8\x43\x80\x8d\x86\x7d\x80"
+			  "\x1a\xd0\xcc\x13\xb9\x11\x05\x3f"
+			  "\x7e\xcf\x7e\x80\x0e\xd8\x25\x48"
+			  "\x8b\xaa\x63\x83\x92\xd0\x72\xf5"
+			  "\x4f\x67\x7e\x50\x18\x25\xa4\xd1"
+			  "\xe0\x7e\x1e\xba\xd8\xa7\x6e\xdb"
+			  "\x1a\xcc\x0d\xfe\x9f\x6d\x22\x35"
+			  "\xe1\xe6\xe0\xa8\x7b\x9c\xb1\x66"
+			  "\xa3\xf8\xff\x4d\x90\x84\x28\xbc"
+			  "\xdc\x19\xc7\x91\x49\xfc\xf6\x33"
+			  "\xc9\x6e\x65\x7f\x28\x6f\x68\x2e"
+			  "\xdf\x1a\x75\xe9\xc2\x0c\x96\xb9"
+			  "\x31\x22\xc4\x07\xc6\x0a\x2f\xfd"
+			  "\x36\x06\x5f\x5c\xc5\xb1\x3a\xf4"
+			  "\x5e\x48\xa4\x45\x2b\x88\xa7\xee"
+			  "\xa9\x8b\x52\xcc\x99\xd9\x2f\xb8"
+			  "\xa4\x58\x0a\x13\xeb\x71\x5a\xfa"
+			  "\xe5\x5e\xbe\xf2\x64\xad\x75\xbc"
+			  "\x0b\x5b\x34\x13\x3b\x23\x13\x9a"
+			  "\x69\x30\x1e\x9a\xb8\x03\xb8\x8b"
+			  "\x3e\x46\x18\x6d\x38\xd9\xb3\xd8"
+			  "\xbf\xf1\xd0\x28\xe6\x51\x57\x80"
+			  "\x5e\x99\xfb\xd0\xce\x1e\x83\xf7"
+			  "\xe9\x07\x5a\x63\xa9\xef\xce\xa5"
+			  "\xfb\x3f\x37\x17\xfc\x0b\x37\x0e"
+			  "\xbb\x4b\x21\x62\xb7\x83\x0e\xa9"
+			  "\x9e\xb0\xc4\xad\x47\xbe\x35\xe7"
+			  "\x51\xb2\xf2\xac\x2b\x65\x7b\x48"
+			  "\xe3\x3f\x5f\xb6\x09\x04\x0c\x58"
+			  "\xce\x99\xa9\x15\x2f\x4e\xc1\xf2"
+			  "\x24\x48\xc0\xd8\x6c\xd3\x76\x17"
+			  "\x83\x5d\xe6\xe3\xfd\x01\x8e\xf7"
+			  "\x42\xa5\x04\x29\x30\xdf\xf9\x00"
+			  "\x4a\xdc\x71\x22\x1a\x33\x15\xb6"
+			  "\xd7\x72\xfb\x9a\xb8\xeb\x2b\x38"
+			  "\xea\xa8\x61\xa8\x90\x11\x9d\x73"
+			  "\x2e\x6c\xce\x81\x54\x5a\x9f\xcd"
+			  "\xcf\xd5\xbd\x26\x5d\x66\xdb\xfb"
+			  "\xdc\x1e\x7c\x10\xfe\x58\x82\x10"
+			  "\x16\x24\x01\xce\x67\x55\x51\xd1"
+			  "\xdd\x6b\x44\xa3\x20\x8e\xa9\xa6"
+			  "\x06\xa8\x29\x77\x6e\x00\x38\x5b"
+			  "\xde\x4d\x58\xd8\x1f\x34\xdf\xf9"
+			  "\x2c\xac\x3e\xad\xfb\x92\x0d\x72"
+			  "\x39\xa4\xac\x44\x10\xc0\x43\xc4"
+			  "\xa4\x77\x3b\xfc\xc4\x0d\x37\xd3"
+			  "\x05\x84\xda\x53\x71\xf8\x80\xd3"
+			  "\x34\x44\xdb\x09\xb4\x2b\x8e\xe3"
+			  "\x00\x75\x50\x9e\x43\x22\x00\x0b"
+			  "\x7c\x70\xab\xd4\x41\xf1\x93\xcd"
+			  "\x25\x2d\x84\x74\xb5\xf2\x92\xcd"
+			  "\x0a\x28\xea\x9a\x49\x02\x96\xcb"
+			  "\x85\x9e\x2f\x33\x03\x86\x1d\xdc"
+			  "\x1d\x31\xd5\xfc\x9d\xaa\xc5\xe9"
+			  "\x9a\xc4\x57\xf5\x35\xed\xf4\x4b"
+			  "\x3d\x34\xc2\x29\x13\x86\x36\x42"
+			  "\x5d\xbf\x90\x86\x13\x77\xe5\xc3"
+			  "\x62\xb4\xfe\x0b\x70\x39\x35\x65"
+			  "\x02\xea\xf6\xce\x57\x0c\xbb\x74"
+			  "\x29\xe3\xfd\x60\x90\xfd\x10\x38"
+			  "\xd5\x4e\x86\xbd\x37\x70\xf0\x97"
+			  "\xa6\xab\x3b\x83\x64\x52\xca\x66"
+			  "\x2f\xf9\xa4\xca\x3a\x55\x6b\xb0"
+			  "\xe8\x3a\x34\xdb\x9e\x48\x50\x2f"
+			  "\x3b\xef\xfd\x08\x2d\x5f\xc1\x37"
+			  "\x5d\xbe\x73\xe4\xd8\xe9\xac\xca"
+			  "\x8a\xaa\x48\x7c\x5c\xf4\xa6\x96"
+			  "\x5f\xfa\x70\xa6\xb7\x8b\x50\xcb"
+			  "\xa6\xf5\xa9\xbd\x7b\x75\x4c\x22"
+			  "\x0b\x19\x40\x2e\xc9\x39\x39\x32"
+			  "\x83\x03\xa8\xa4\x98\xe6\x8e\x16"
+			  "\xb9\xde\x08\xc5\xfc\xbf\xad\x39"
+			  "\xa8\xc7\x93\x6c\x6f\x23\xaf\xc1"
+			  "\xab\xe1\xdf\xbb\x39\xae\x93\x29"
+			  "\x0e\x7d\x80\x8d\x3e\x65\xf3\xfd"
+			  "\x96\x06\x65\x90\xa1\x28\x64\x4b"
+			  "\x69\xf9\xa8\x84\x27\x50\xfc\x87"
+			  "\xf7\xbf\x55\x8e\x56\x13\x58\x7b"
+			  "\x85\xb4\x6a\x72\x0f\x40\xf1\x4f"
+			  "\x83\x81\x1f\x76\xde\x15\x64\x7a"
+			  "\x7a\x80\xe4\xc7\x5e\x63\x01\x91"
+			  "\xd7\x6b\xea\x0b\x9b\xa2\x99\x3b"
+			  "\x6c\x88\xd8\xfd\x59\x3c\x8d\x22"
+			  "\x86\x56\xbe\xab\xa1\x37\x08\x01"
+			  "\x50\x85\x69\x29\xee\x9f\xdf\x21"
+			  "\x3e\x20\x20\xf5\xb0\xbb\x6b\xd0"
+			  "\x9c\x41\x38\xec\x54\x6f\x2d\xbd"
+			  "\x0f\xe1\xbd\xf1\x2b\x6e\x60\x56"
+			  "\x29\xe5\x7a\x70\x1c\xe2\xfc\x97"
+			  "\x82\x68\x67\xd9\x3d\x1f\xfb\xd8"
+			  "\x07\x9f\xbf\x96\x74\xba\x6a\x0e"
+			  "\x10\x48\x20\xd8\x13\x1e\xb5\x44"
+			  "\xf2\xcc\xb1\x8b\xfb\xbb\xec\xd7"
+			  "\x37\x70\x1f\x7c\x55\xd2\x4b\xb9"
+			  "\xfd\x70\x5e\xa3\x91\x73\x63\x52"
+			  "\x13\x47\x5a\x06\xfb\x01\x67\xa5"
+			  "\xc0\xd0\x49\x19\x56\x66\x9a\x77"
+			  "\x64\xaf\x8c\x25\x91\x52\x87\x0e"
+			  "\x18\xf3\x5f\x97\xfd\x71\x13\xf8"
+			  "\x05\xa5\x39\xcc\x65\xd3\xcc\x63"
+			  "\x5b\xdb\x5f\x7e\x5f\x6e\xad\xc4"
+			  "\xf4\xa0\xc5\xc2\x2b\x4d\x97\x38"
+			  "\x4f\xbc\xfa\x33\x17\xb4\x47\xb9"
+			  "\x43\x24\x15\x8d\xd2\xed\x80\x68"
+			  "\x84\xdb\x04\x80\xca\x5e\x6a\x35"
+			  "\x2c\x2c\xe7\xc5\x03\x5f\x54\xb0"
+			  "\x5e\x4f\x1d\x40\x54\x3d\x78\x9a"
+			  "\xac\xda\x80\x27\x4d\x15\x4c\x1a"
+			  "\x6e\x80\xc9\xc4\x3b\x84\x0e\xd9"
+			  "\x2e\x93\x01\x8c\xc3\xc8\x91\x4b"
+			  "\xb3\xaa\x07\x04\x68\x5b\x93\xa5"
+			  "\xe7\xc4\x9d\xe7\x07\xee\xf5\x3b"
+			  "\x40\x89\xcc\x60\x34\x9d\xb4\x06"
+			  "\x1b\xef\x92\xe6\xc1\x2a\x7d\x0f"
+			  "\x81\xaa\x56\xe3\xd7\xed\xa7\xd4"
+			  "\xa7\x3a\x49\xc4\xad\x81\x5c\x83"
+			  "\x55\x8e\x91\x54\xb7\x7d\x65\xa5"
+			  "\x06\x16\xd5\x9a\x16\xc1\xb0\xa2"
+			  "\x06\xd8\x98\x47\x73\x7e\x73\xa0"
+			  "\xb8\x23\xb1\x52\xbf\x68\x74\x5d"
+			  "\x0b\xcb\xfa\x8c\x46\xe3\x24\xe6"
+			  "\xab\xd4\x69\x8d\x8c\xf2\x8a\x59"
+			  "\xbe\x48\x46\x50\x8c\x9a\xe8\xe3"
+			  "\x31\x55\x0a\x06\xed\x4f\xf8\xb7"
+			  "\x4f\xe3\x85\x17\x30\xbd\xd5\x20"
+			  "\xe7\x5b\xb2\x32\xcf\x6b\x16\x44"
+			  "\xd2\xf5\x7e\xd7\xd1\x2f\xee\x64"
+			  "\x3e\x9d\x10\xef\x27\x35\x43\x64"
+			  "\x67\xfb\x7a\x7b\xe0\x62\x31\x9a"
+			  "\x4d\xdf\xa5\xab\xc0\x20\xbb\x01"
+			  "\xe9\x7b\x54\xf1\xde\xb2\x79\x50"
+			  "\x6c\x4b\x91\xdb\x7f\xbb\x50\xc1"
+			  "\x55\x44\x38\x9a\xe0\x9f\xe8\x29"
+			  "\x6f\x15\xf8\x4e\xa6\xec\xa0\x60",
+		.ksize	= 1088,
+		.plaintext	= "\x15\x68\x9e\x2f\xad\x15\x52\xdf"
+			  "\xf0\x42\x62\x24\x2a\x2d\xea\xbf"
+			  "\xc7\xf3\xb4\x1a\xf5\xed\xb2\x08"
+			  "\x15\x60\x1c\x00\x77\xbf\x0b\x0e"
+			  "\xb7\x2c\xcf\x32\x3a\xc7\x01\x77"
+			  "\xef\xa6\x75\xd0\x29\xc7\x68\x20"
+			  "\xb2\x92\x25\xbf\x12\x34\xe9\xa4"
+			  "\xfd\x32\x7b\x3f\x7c\xbd\xa5\x02"
+			  "\x38\x41\xde\xc9\xc1\x09\xd9\xfc"
+			  "\x6e\x78\x22\x83\x18\xf7\x50\x8d"
+			  "\x8f\x9c\x2d\x02\xa5\x30\xac\xff"
+			  "\xea\x63\x2e\x80\x37\x83\xb0\x58"
+			  "\xda\x2f\xef\x21\x55\xba\x7b\xb1"
+			  "\xb6\xed\xf5\xd2\x4d\xaa\x8c\xa9"
+			  "\xdd\xdb\x0f\xb4\xce\xc1\x9a\xb1"
+			  "\xc1\xdc\xbd\xab\x86\xc2\xdf\x0b"
+			  "\xe1\x2c\xf9\xbe\xf6\xd8\xda\x62"
+			  "\x72\xdd\x98\x09\x52\xc0\xc4\xb6"
+			  "\x7b\x17\x5c\xf5\xd8\x4b\x88\xd6"
+			  "\x6b\xbf\x84\x4a\x3f\xf5\x4d\xd2"
+			  "\x94\xe2\x9c\xff\xc7\x3c\xd9\xc8"
+			  "\x37\x38\xbc\x8c\xf3\xe7\xb7\xd0"
+			  "\x1d\x78\xc4\x39\x07\xc8\x5e\x79"
+			  "\xb6\x5a\x90\x5b\x6e\x97\xc9\xd4"
+			  "\x82\x9c\xf3\x83\x7a\xe7\x97\xfc"
+			  "\x1d\xbb\xef\xdb\xce\xe0\x82\xad"
+			  "\xca\x07\x6c\x54\x62\x6f\x81\xe6"
+			  "\x7a\x5a\x96\x6e\x80\x3a\xa2\x37"
+			  "\x6f\xc6\xa4\x29\xc3\x9e\x19\x94"
+			  "\x9f\xb0\x3e\x38\xfb\x3c\x2b\x7d"
+			  "\xaa\xb8\x74\xda\x54\x23\x51\x12"
+			  "\x4b\x96\x36\x8f\x91\x4f\x19\x37"
+			  "\x83\xc9\xdd\xc7\x1a\x32\x2d\xab"
+			  "\xc7\x89\xe2\x07\x47\x6c\xe8\xa6"
+			  "\x70\x6b\x8e\x0c\xda\x5c\x6a\x59"
+			  "\x27\x33\x0e\xe1\xe1\x20\xe8\xc8"
+			  "\xae\xdc\xd0\xe3\x6d\xa8\xa6\x06"
+			  "\x41\xb4\xd4\xd4\xcf\x91\x3e\x06"
+			  "\xb0\x9a\xf7\xf1\xaa\xa6\x23\x92"
+			  "\x10\x86\xf0\x94\xd1\x7c\x2e\x07"
+			  "\x30\xfb\xc5\xd8\xf3\x12\xa9\xe8"
+			  "\x22\x1c\x97\x1a\xad\x96\xb0\xa1"
+			  "\x72\x6a\x6b\xb4\xfd\xf7\xe8\xfa"
+			  "\xe2\x74\xd8\x65\x8d\x35\x17\x4b"
+			  "\x00\x23\x5c\x8c\x70\xad\x71\xa2"
+			  "\xca\xc5\x6c\x59\xbf\xb4\xc0\x6d"
+			  "\x86\x98\x3e\x19\x5a\x90\x92\xb1"
+			  "\x66\x57\x6a\x91\x68\x7c\xbc\xf3"
+			  "\xf1\xdb\x94\xf8\x48\xf1\x36\xd8"
+			  "\x78\xac\x1c\xa9\xcc\xd6\x27\xba"
+			  "\x91\x54\x22\xf5\xe6\x05\x3f\xcc"
+			  "\xc2\x8f\x2c\x3b\x2b\xc3\x2b\x2b"
+			  "\x3b\xb8\xb6\x29\xb7\x2f\x94\xb6"
+			  "\x7b\xfc\x94\x3e\xd0\x7a\x41\x59"
+			  "\x7b\x1f\x9a\x09\xa6\xed\x4a\x82"
+			  "\x9d\x34\x1c\xbd\x4e\x1c\x3a\x66"
+			  "\x80\x74\x0e\x9a\x4f\x55\x54\x47"
+			  "\x16\xba\x2a\x0a\x03\x35\x99\xa3"
+			  "\x5c\x63\x8d\xa2\x72\x8b\x17\x15"
+			  "\x68\x39\x73\xeb\xec\xf2\xe8\xf5"
+			  "\x95\x32\x27\xd6\xc4\xfe\xb0\x51"
+			  "\xd5\x0c\x50\xc5\xcd\x6d\x16\xb3"
+			  "\xa3\x1e\x95\x69\xad\x78\x95\x06"
+			  "\xb9\x46\xf2\x6d\x24\x5a\x99\x76"
+			  "\x73\x6a\x91\xa6\xac\x12\xe1\x28"
+			  "\x79\xbc\x08\x4e\x97\x00\x98\x63"
+			  "\x07\x1c\x4e\xd1\x68\xf3\xb3\x81"
+			  "\xa8\xa6\x5f\xf1\x01\xc9\xc1\xaf"
+			  "\x3a\x96\xf9\x9d\xb5\x5a\x5f\x8f"
+			  "\x7e\xc1\x7e\x77\x0a\x40\xc8\x8e"
+			  "\xfc\x0e\xed\xe1\x0d\xb0\xe5\x5e"
+			  "\x5e\x6f\xf5\x7f\xab\x33\x7d\xcd"
+			  "\xf0\x09\x4b\xb2\x11\x37\xdc\x65"
+			  "\x97\x32\x62\x71\x3a\x29\x54\xb9"
+			  "\xc7\xa4\xbf\x75\x0f\xf9\x40\xa9"
+			  "\x8d\xd7\x8b\xa7\xe0\x9a\xbe\x15"
+			  "\xc6\xda\xd8\x00\x14\x69\x1a\xaf"
+			  "\x5f\x79\xc3\xf5\xbb\x6c\x2a\x9d"
+			  "\xdd\x3c\x5f\x97\x21\xe1\x3a\x03"
+			  "\x84\x6a\xe9\x76\x11\x1f\xd3\xd5"
+			  "\xf0\x54\x20\x4d\xc2\x91\xc3\xa4"
+			  "\x36\x25\xbe\x1b\x2a\x06\xb7\xf3"
+			  "\xd1\xd0\x55\x29\x81\x4c\x83\xa3"
+			  "\xa6\x84\x1e\x5c\xd1\xd0\x6c\x90"
+			  "\xa4\x11\xf0\xd7\x63\x6a\x48\x05"
+			  "\xbc\x48\x18\x53\xcd\xb0\x8d\xdb"
+			  "\xdc\xfe\x55\x11\x5c\x51\xb3\xab"
+			  "\xab\x63\x3e\x31\x5a\x8b\x93\x63"
+			  "\x34\xa9\xba\x2b\x69\x1a\xc0\xe3"
+			  "\xcb\x41\xbc\xd7\xf5\x7f\x82\x3e"
+			  "\x01\xa3\x3c\x72\xf4\xfe\xdf\xbe"
+			  "\xb1\x67\x17\x2b\x37\x60\x0d\xca"
+			  "\x6f\xc3\x94\x2c\xd2\x92\x6d\x9d"
+			  "\x75\x18\x77\xaa\x29\x38\x96\xed"
+			  "\x0e\x20\x70\x92\xd5\xd0\xb4\x00"
+			  "\xc0\x31\xf2\xc9\x43\x0e\x75\x1d"
+			  "\x4b\x64\xf2\x1f\xf2\x29\x6c\x7b"
+			  "\x7f\xec\x59\x7d\x8c\x0d\xd4\xd3"
+			  "\xac\x53\x4c\xa3\xde\x42\x92\x95"
+			  "\x6d\xa3\x4f\xd0\xe6\x3d\xe7\xec"
+			  "\x7a\x4d\x68\xf1\xfe\x67\x66\x09"
+			  "\x83\x22\xb1\x98\x43\x8c\xab\xb8"
+			  "\x45\xe6\x6d\xdf\x5e\x50\x71\xce"
+			  "\xf5\x4e\x40\x93\x2b\xfa\x86\x0e"
+			  "\xe8\x30\xbd\x82\xcc\x1c\x9c\x5f"
+			  "\xad\xfd\x08\x31\xbe\x52\xe7\xe6"
+			  "\xf2\x06\x01\x62\x25\x15\x99\x74"
+			  "\x33\x51\x52\x57\x3f\x57\x87\x61"
+			  "\xb9\x7f\x29\x3d\xcd\x92\x5e\xa6"
+			  "\x5c\x3b\xf1\xed\x5f\xeb\x82\xed"
+			  "\x56\x7b\x61\xe7\xfd\x02\x47\x0e"
+			  "\x2a\x15\xa4\xce\x43\x86\x9b\xe1"
+			  "\x2b\x4c\x2a\xd9\x42\x97\xf7\x9a"
+			  "\xe5\x47\x46\x48\xd3\x55\x6f\x4d"
+			  "\xd9\xeb\x4b\xdd\x7b\x21\x2f\xb3"
+			  "\xa8\x36\x28\xdf\xca\xf1\xf6\xd9"
+			  "\x10\xf6\x1c\xfd\x2e\x0c\x27\xe0"
+			  "\x01\xb3\xff\x6d\x47\x08\x4d\xd4"
+			  "\x00\x25\xee\x55\x4a\xe9\xe8\x5b"
+			  "\xd8\xf7\x56\x12\xd4\x50\xb2\xe5"
+			  "\x51\x6f\x34\x63\x69\xd2\x4e\x96"
+			  "\x4e\xbc\x79\xbf\x18\xae\xc6\x13"
+			  "\x80\x92\x77\xb0\xb4\x0f\x29\x94"
+			  "\x6f\x4c\xbb\x53\x11\x36\xc3\x9f"
+			  "\x42\x8e\x96\x8a\x91\xc8\xe9\xfc"
+			  "\xfe\xbf\x7c\x2d\x6f\xf9\xb8\x44"
+			  "\x89\x1b\x09\x53\x0a\x2a\x92\xc3"
+			  "\x54\x7a\x3a\xf9\xe2\xe4\x75\x87"
+			  "\xa0\x5e\x4b\x03\x7a\x0d\x8a\xf4"
+			  "\x55\x59\x94\x2b\x63\x96\x0e\xf5",
+		.psize	= 1040,
+		.digest	= "\xb5\xb9\x08\xb3\x24\x3e\x03\xf0"
+			  "\xd6\x0b\x57\xbc\x0a\x6d\x89\x59",
+	}, {
+		.key	= "\xf6\x34\x42\x71\x35\x52\x8b\x58"
+			  "\x02\x3a\x8e\x4a\x8d\x41\x13\xe9"
+			  "\x7f\xba\xb9\x55\x9d\x73\x4d\xf8"
+			  "\x3f\x5d\x73\x15\xff\xd3\x9e\x7f"
+			  "\x20\x2a\x6a\xa8\xd1\xf0\x8f\x12"
+			  "\x6b\x02\xd8\x6c\xde\xba\x80\x22"
+			  "\x19\x37\xc8\xd0\x4e\x89\x17\x7c"
+			  "\x7c\xdd\x88\xfd\x41\xc0\x04\xb7"
+			  "\x1d\xac\x19\xe3\x20\xc7\x16\xcf"
+			  "\x58\xee\x1d\x7a\x61\x69\xa9\x12"
+			  "\x4b\xef\x4f\xb6\x38\xdd\x78\xf8"
+			  "\x28\xee\x70\x08\xc7\x7c\xcc\xc8"
+			  "\x1e\x41\xf5\x80\x86\x70\xd0\xf0"
+			  "\xa3\x87\x6b\x0a\x00\xd2\x41\x28"
+			  "\x74\x26\xf1\x24\xf3\xd0\x28\x77"
+			  "\xd7\xcd\xf6\x2d\x61\xf4\xa2\x13"
+			  "\x77\xb4\x6f\xa0\xf4\xfb\xd6\xb5"
+			  "\x38\x9d\x5a\x0c\x51\xaf\xad\x63"
+			  "\x27\x67\x8c\x01\xea\x42\x1a\x66"
+			  "\xda\x16\x7c\x3c\x30\x0c\x66\x53"
+			  "\x1c\x88\xa4\x5c\xb2\xe3\x78\x0a"
+			  "\x13\x05\x6d\xe2\xaf\xb3\xe4\x75"
+			  "\x00\x99\x58\xee\x76\x09\x64\xaa"
+			  "\xbb\x2e\xb1\x81\xec\xd8\x0e\xd3"
+			  "\x0c\x33\x5d\xb7\x98\xef\x36\xb6"
+			  "\xd2\x65\x69\x41\x70\x12\xdc\x25"
+			  "\x41\x03\x99\x81\x41\x19\x62\x13"
+			  "\xd1\x0a\x29\xc5\x8c\xe0\x4c\xf3"
+			  "\xd6\xef\x4c\xf4\x1d\x83\x2e\x6d"
+			  "\x8e\x14\x87\xed\x80\xe0\xaa\xd3"
+			  "\x08\x04\x73\x1a\x84\x40\xf5\x64"
+			  "\xbd\x61\x32\x65\x40\x42\xfb\xb0"
+			  "\x40\xf6\x40\x8d\xc7\x7f\x14\xd0"
+			  "\x83\x99\xaa\x36\x7e\x60\xc6\xbf"
+			  "\x13\x8a\xf9\x21\xe4\x7e\x68\x87"
+			  "\xf3\x33\x86\xb4\xe0\x23\x7e\x0a"
+			  "\x21\xb1\xf5\xad\x67\x3c\x9c\x9d"
+			  "\x09\xab\xaf\x5f\xba\xe0\xd0\x82"
+			  "\x48\x22\x70\xb5\x6d\x53\xd6\x0e"
+			  "\xde\x64\x92\x41\xb0\xd3\xfb\xda"
+			  "\x21\xfe\xab\xea\x20\xc4\x03\x58"
+			  "\x18\x2e\x7d\x2f\x03\xa9\x47\x66"
+			  "\xdf\x7b\xa4\x6b\x34\x6b\x55\x9c"
+			  "\x4f\xd7\x9c\x47\xfb\xa9\x42\xec"
+			  "\x5a\x12\xfd\xfe\x76\xa0\x92\x9d"
+			  "\xfe\x1e\x16\xdd\x24\x2a\xe4\x27"
+			  "\xd5\xa9\xf2\x05\x4f\x83\xa2\xaf"
+			  "\xfe\xee\x83\x7a\xad\xde\xdf\x9a"
+			  "\x80\xd5\x81\x14\x93\x16\x7e\x46"
+			  "\x47\xc2\x14\xef\x49\x6e\xb9\xdb"
+			  "\x40\xe8\x06\x6f\x9c\x2a\xfd\x62"
+			  "\x06\x46\xfd\x15\x1d\x36\x61\x6f"
+			  "\x77\x77\x5e\x64\xce\x78\x1b\x85"
+			  "\xbf\x50\x9a\xfd\x67\xa6\x1a\x65"
+			  "\xad\x5b\x33\x30\xf1\x71\xaa\xd9"
+			  "\x23\x0d\x92\x24\x5f\xae\x57\xb0"
+			  "\x24\x37\x0a\x94\x12\xfb\xb5\xb1"
+			  "\xd3\xb8\x1d\x12\x29\xb0\x80\x24"
+			  "\x2d\x47\x9f\x96\x1f\x95\xf1\xb1"
+			  "\xda\x35\xf6\x29\xe0\xe1\x23\x96"
+			  "\xc7\xe8\x22\x9b\x7c\xac\xf9\x41"
+			  "\x39\x01\xe5\x73\x15\x5e\x99\xec"
+			  "\xb4\xc1\xf4\xe7\xa7\x97\x6a\xd5"
+			  "\x90\x9a\xa0\x1d\xf3\x5a\x8b\x5f"
+			  "\xdf\x01\x52\xa4\x93\x31\x97\xb0"
+			  "\x93\x24\xb5\xbc\xb2\x14\x24\x98"
+			  "\x4a\x8f\x19\x85\xc3\x2d\x0f\x74"
+			  "\x9d\x16\x13\x80\x5e\x59\x62\x62"
+			  "\x25\xe0\xd1\x2f\x64\xef\xba\xac"
+			  "\xcd\x09\x07\x15\x8a\xcf\x73\xb5"
+			  "\x8b\xc9\xd8\x24\xb0\x53\xd5\x6f"
+			  "\xe1\x2b\x77\xb1\xc5\xe4\xa7\x0e"
+			  "\x18\x45\xab\x36\x03\x59\xa8\xbd"
+			  "\x43\xf0\xd8\x2c\x1a\x69\x96\xbb"
+			  "\x13\xdf\x6c\x33\x77\xdf\x25\x34"
+			  "\x5b\xa5\x5b\x8c\xf9\x51\x05\xd4"
+			  "\x8b\x8b\x44\x87\x49\xfc\xa0\x8f"
+			  "\x45\x15\x5b\x40\x42\xc4\x09\x92"
+			  "\x98\x0c\x4d\xf4\x26\x37\x1b\x13"
+			  "\x76\x01\x93\x8d\x4f\xe6\xed\x18"
+			  "\xd0\x79\x7b\x3f\x44\x50\xcb\xee"
+			  "\xf7\x4a\xc9\x9e\xe0\x96\x74\xa7"
+			  "\xe6\x93\xb2\x53\xca\x55\xa8\xdc"
+			  "\x1e\x68\x07\x87\xb7\x2e\xc1\x08"
+			  "\xb2\xa4\x5b\xaf\xc6\xdb\x5c\x66"
+			  "\x41\x1c\x51\xd9\xb0\x07\x00\x0d"
+			  "\xf0\x4c\xdc\x93\xde\xa9\x1e\x8e"
+			  "\xd3\x22\x62\xd8\x8b\x88\x2c\xea"
+			  "\x5e\xf1\x6e\x14\x40\xc7\xbe\xaa"
+			  "\x42\x28\xd0\x26\x30\x78\x01\x9b"
+			  "\x83\x07\xbc\x94\xc7\x57\xa2\x9f"
+			  "\x03\x07\xff\x16\xff\x3c\x6e\x48"
+			  "\x0a\xd0\xdd\x4c\xf6\x64\x9a\xf1"
+			  "\xcd\x30\x12\x82\x2c\x38\xd3\x26"
+			  "\x83\xdb\xab\x3e\xc6\xf8\xe6\xfa"
+			  "\x77\x0a\x78\x82\x75\xf8\x63\x51"
+			  "\x59\xd0\x8d\x24\x9f\x25\xe6\xa3"
+			  "\x4c\xbc\x34\xfc\xe3\x10\xc7\x62"
+			  "\xd4\x23\xc8\x3d\xa7\xc6\xa6\x0a"
+			  "\x4f\x7e\x29\x9d\x6d\xbe\xb5\xf1"
+			  "\xdf\xa4\x53\xfa\xc0\x23\x0f\x37"
+			  "\x84\x68\xd0\xb5\xc8\xc6\xae\xf8"
+			  "\xb7\x8d\xb3\x16\xfe\x8f\x87\xad"
+			  "\xd0\xc1\x08\xee\x12\x1c\x9b\x1d"
+			  "\x90\xf8\xd1\x63\xa4\x92\x3c\xf0"
+			  "\xc7\x34\xd8\xf1\x14\xed\xa3\xbc"
+			  "\x17\x7e\xd4\x62\x42\x54\x57\x2c"
+			  "\x3e\x7a\x35\x35\x17\x0f\x0b\x7f"
+			  "\x81\xa1\x3f\xd0\xcd\xc8\x3b\x96"
+			  "\xe9\xe0\x4a\x04\xe1\xb6\x3c\xa1"
+			  "\xd6\xca\xc4\xbd\xb6\xb5\x95\x34"
+			  "\x12\x9d\xc5\x96\xf2\xdf\xba\x54"
+			  "\x76\xd1\xb2\x6b\x3b\x39\xe0\xb9"
+			  "\x18\x62\xfb\xf7\xfc\x12\xf1\x5f"
+			  "\x7e\xc7\xe3\x59\x4c\xa6\xc2\x3d"
+			  "\x40\x15\xf9\xa3\x95\x64\x4c\x74"
+			  "\x8b\x73\x77\x33\x07\xa7\x04\x1d"
+			  "\x33\x5a\x7e\x8f\xbd\x86\x01\x4f"
+			  "\x3e\xb9\x27\x6f\xe2\x41\xf7\x09"
+			  "\x67\xfd\x29\x28\xc5\xe4\xf6\x18"
+			  "\x4c\x1b\x49\xb2\x9c\x5b\xf6\x81"
+			  "\x4f\xbb\x5c\xcc\x0b\xdf\x84\x23"
+			  "\x58\xd6\x28\x34\x93\x3a\x25\x97"
+			  "\xdf\xb2\xc3\x9e\x97\x38\x0b\x7d"
+			  "\x10\xb3\x54\x35\x23\x8c\x64\xee"
+			  "\xf0\xd8\x66\xff\x8b\x22\xd2\x5b"
+			  "\x05\x16\x3c\x89\xf7\xb1\x75\xaf"
+			  "\xc0\xae\x6a\x4f\x3f\xaf\x9a\xf4"
+			  "\xf4\x9a\x24\xd9\x80\x82\xc0\x12"
+			  "\xde\x96\xd1\xbe\x15\x0b\x8d\x6a"
+			  "\xd7\x12\xe4\x85\x9f\x83\xc9\xc3"
+			  "\xff\x0b\xb5\xaf\x3b\xd8\x6d\x67"
+			  "\x81\x45\xe6\xac\xec\xc1\x7b\x16"
+			  "\x18\x0a\xce\x4b\xc0\x2e\x76\xbc"
+			  "\x1b\xfa\xb4\x34\xb8\xfc\x3e\xc8"
+			  "\x5d\x90\x71\x6d\x7a\x79\xef\x06",
+		.ksize	= 1088,
+		.plaintext	= "\xaa\x5d\x54\xcb\xea\x1e\x46\x0f"
+			  "\x45\x87\x70\x51\x8a\x66\x7a\x33"
+			  "\xb4\x18\xff\xa9\x82\xf9\x45\x4b"
+			  "\x93\xae\x2e\x7f\xab\x98\xfe\xbf"
+			  "\x01\xee\xe5\xa0\x37\x8f\x57\xa6"
+			  "\xb0\x76\x0d\xa4\xd6\x28\x2b\x5d"
+			  "\xe1\x03\xd6\x1c\x6f\x34\x0d\xe7"
+			  "\x61\x2d\x2e\xe5\xae\x5d\x47\xc7"
+			  "\x80\x4b\x18\x8f\xa8\x99\xbc\x28"
+			  "\xed\x1d\x9d\x86\x7d\xd7\x41\xd1"
+			  "\xe0\x2b\xe1\x8c\x93\x2a\xa7\x80"
+			  "\xe1\x07\xa0\xa9\x9f\x8c\x8d\x1a"
+			  "\x55\xfc\x6b\x24\x7a\xbd\x3e\x51"
+			  "\x68\x4b\x26\x59\xc8\xa7\x16\xd9"
+			  "\xb9\x61\x13\xde\x8b\x63\x1c\xf6"
+			  "\x60\x01\xfb\x08\xb3\x5b\x0a\xbf"
+			  "\x34\x73\xda\x87\x87\x3d\x6f\x97"
+			  "\x4a\x0c\xa3\x58\x20\xa2\xc0\x81"
+			  "\x5b\x8c\xef\xa9\xc2\x01\x1e\x64"
+			  "\x83\x8c\xbc\x03\xb6\xd0\x29\x9f"
+			  "\x54\xe2\xce\x8b\xc2\x07\x85\x78"
+			  "\x25\x38\x96\x4c\xb4\xbe\x17\x4a"
+			  "\x65\xa6\xfa\x52\x9d\x66\x9d\x65"
+			  "\x4a\xd1\x01\x01\xf0\xcb\x13\xcc"
+			  "\xa5\x82\xf3\xf2\x66\xcd\x3f\x9d"
+			  "\xd1\xaa\xe4\x67\xea\xf2\xad\x88"
+			  "\x56\x76\xa7\x9b\x59\x3c\xb1\x5d"
+			  "\x78\xfd\x69\x79\x74\x78\x43\x26"
+			  "\x7b\xde\x3f\xf1\xf5\x4e\x14\xd9"
+			  "\x15\xf5\x75\xb5\x2e\x19\xf3\x0c"
+			  "\x48\x72\xd6\x71\x6d\x03\x6e\xaa"
+			  "\xa7\x08\xf9\xaa\x70\xa3\x0f\x4d"
+			  "\x12\x8a\xdd\xe3\x39\x73\x7e\xa7"
+			  "\xea\x1f\x6d\x06\x26\x2a\xf2\xc5"
+			  "\x52\xb4\xbf\xfd\x52\x0c\x06\x60"
+			  "\x90\xd1\xb2\x7b\x56\xae\xac\x58"
+			  "\x5a\x6b\x50\x2a\xf5\xe0\x30\x3c"
+			  "\x2a\x98\x0f\x1b\x5b\x0a\x84\x6c"
+			  "\x31\xae\x92\xe2\xd4\xbb\x7f\x59"
+			  "\x26\x10\xb9\x89\x37\x68\x26\xbf"
+			  "\x41\xc8\x49\xc4\x70\x35\x7d\xff"
+			  "\x2d\x7f\xf6\x8a\x93\x68\x8c\x78"
+			  "\x0d\x53\xce\x7d\xff\x7d\xfb\xae"
+			  "\x13\x1b\x75\xc4\x78\xd7\x71\xd8"
+			  "\xea\xd3\xf4\x9d\x95\x64\x8e\xb4"
+			  "\xde\xb8\xe4\xa6\x68\xc8\xae\x73"
+			  "\x58\xaf\xa8\xb0\x5a\x20\xde\x87"
+			  "\x43\xb9\x0f\xe3\xad\x41\x4b\xd5"
+			  "\xb7\xad\x16\x00\xa6\xff\xf6\x74"
+			  "\xbf\x8c\x9f\xb3\x58\x1b\xb6\x55"
+			  "\xa9\x90\x56\x28\xf0\xb5\x13\x4e"
+			  "\x9e\xf7\x25\x86\xe0\x07\x7b\x98"
+			  "\xd8\x60\x5d\x38\x95\x3c\xe4\x22"
+			  "\x16\x2f\xb2\xa2\xaf\xe8\x90\x17"
+			  "\xec\x11\x83\x1a\xf4\xa9\x26\xda"
+			  "\x39\x72\xf5\x94\x61\x05\x51\xec"
+			  "\xa8\x30\x8b\x2c\x13\xd0\x72\xac"
+			  "\xb9\xd2\xa0\x4c\x4b\x78\xe8\x6e"
+			  "\x04\x85\xe9\x04\x49\x82\x91\xff"
+			  "\x89\xe5\xab\x4c\xaa\x37\x03\x12"
+			  "\xca\x8b\x74\x10\xfd\x9e\xd9\x7b"
+			  "\xcb\xdb\x82\x6e\xce\x2e\x33\x39"
+			  "\xce\xd2\x84\x6e\x34\x71\x51\x6e"
+			  "\x0d\xd6\x01\x87\xc7\xfa\x0a\xd3"
+			  "\xad\x36\xf3\x4c\x9f\x96\x5e\x62"
+			  "\x62\x54\xc3\x03\x78\xd6\xab\xdd"
+			  "\x89\x73\x55\x25\x30\xf8\xa7\xe6"
+			  "\x4f\x11\x0c\x7c\x0a\xa1\x2b\x7b"
+			  "\x3d\x0d\xde\x81\xd4\x9d\x0b\xae"
+			  "\xdf\x00\xf9\x4c\xb6\x90\x8e\x16"
+			  "\xcb\x11\xc8\xd1\x2e\x73\x13\x75"
+			  "\x75\x3e\xaa\xf5\xee\x02\xb3\x18"
+			  "\xa6\x2d\xf5\x3b\x51\xd1\x1f\x47"
+			  "\x6b\x2c\xdb\xc4\x10\xe0\xc8\xba"
+			  "\x9d\xac\xb1\x9d\x75\xd5\x41\x0e"
+			  "\x7e\xbe\x18\x5b\xa4\x1f\xf8\x22"
+			  "\x4c\xc1\x68\xda\x6d\x51\x34\x6c"
+			  "\x19\x59\xec\xb5\xb1\xec\xa7\x03"
+			  "\xca\x54\x99\x63\x05\x6c\xb1\xac"
+			  "\x9c\x31\xd6\xdb\xba\x7b\x14\x12"
+			  "\x7a\xc3\x2f\xbf\x8d\xdc\x37\x46"
+			  "\xdb\xd2\xbc\xd4\x2f\xab\x30\xd5"
+			  "\xed\x34\x99\x8e\x83\x3e\xbe\x4c"
+			  "\x86\x79\x58\xe0\x33\x8d\x9a\xb8"
+			  "\xa9\xa6\x90\x46\xa2\x02\xb8\xdd"
+			  "\xf5\xf9\x1a\x5c\x8c\x01\xaa\x6e"
+			  "\xb4\x22\x12\xf5\x0c\x1b\x9b\x7a"
+			  "\xc3\x80\xf3\x06\x00\x5f\x30\xd5"
+			  "\x06\xdb\x7d\x82\xc2\xd4\x0b\x4c"
+			  "\x5f\xe9\xc5\xf5\xdf\x97\x12\xbf"
+			  "\x56\xaf\x9b\x69\xcd\xee\x30\xb4"
+			  "\xa8\x71\xff\x3e\x7d\x73\x7a\xb4"
+			  "\x0d\xa5\x46\x7a\xf3\xf4\x15\x87"
+			  "\x5d\x93\x2b\x8c\x37\x64\xb5\xdd"
+			  "\x48\xd1\xe5\x8c\xae\xd4\xf1\x76"
+			  "\xda\xf4\xba\x9e\x25\x0e\xad\xa3"
+			  "\x0d\x08\x7c\xa8\x82\x16\x8d\x90"
+			  "\x56\x40\x16\x84\xe7\x22\x53\x3a"
+			  "\x58\xbc\xb9\x8f\x33\xc8\xc2\x84"
+			  "\x22\xe6\x0d\xe7\xb3\xdc\x5d\xdf"
+			  "\xd7\x2a\x36\xe4\x16\x06\x07\xd2"
+			  "\x97\x60\xb2\xf5\x5e\x14\xc9\xfd"
+			  "\x8b\x05\xd1\xce\xee\x9a\x65\x99"
+			  "\xb7\xae\x19\xb7\xc8\xbc\xd5\xa2"
+			  "\x7b\x95\xe1\xcc\xba\x0d\xdc\x8a"
+			  "\x1d\x59\x52\x50\xaa\x16\x02\x82"
+			  "\xdf\x61\x33\x2e\x44\xce\x49\xc7"
+			  "\xe5\xc6\x2e\x76\xcf\x80\x52\xf0"
+			  "\x3d\x17\x34\x47\x3f\xd3\x80\x48"
+			  "\xa2\xba\xd5\xc7\x7b\x02\x28\xdb"
+			  "\xac\x44\xc7\x6e\x05\x5c\xc2\x79"
+			  "\xb3\x7d\x6a\x47\x77\x66\xf1\x38"
+			  "\xf0\xf5\x4f\x27\x1a\x31\xca\x6c"
+			  "\x72\x95\x92\x8e\x3f\xb0\xec\x1d"
+			  "\xc7\x2a\xff\x73\xee\xdf\x55\x80"
+			  "\x93\xd2\xbd\x34\xd3\x9f\x00\x51"
+			  "\xfb\x2e\x41\xba\x6c\x5a\x7c\x17"
+			  "\x7f\xe6\x70\xac\x8d\x39\x3f\x77"
+			  "\xe2\x23\xac\x8f\x72\x4e\xe4\x53"
+			  "\xcc\xf1\x1b\xf1\x35\xfe\x52\xa4"
+			  "\xd6\xb8\x40\x6b\xc1\xfd\xa0\xa1"
+			  "\xf5\x46\x65\xc2\x50\xbb\x43\xe2"
+			  "\xd1\x43\x28\x34\x74\xf5\x87\xa0"
+			  "\xf2\x5e\x27\x3b\x59\x2b\x3e\x49"
+			  "\xdf\x46\xee\xaf\x71\xd7\x32\x36"
+			  "\xc7\x14\x0b\x58\x6e\x3e\x2d\x41"
+			  "\xfa\x75\x66\x3a\x54\xe0\xb2\xb9"
+			  "\xaf\xdd\x04\x80\x15\x19\x3f\x6f"
+			  "\xce\x12\xb4\xd8\xe8\x89\x3c\x05"
+			  "\x30\xeb\xf3\x3d\xcd\x27\xec\xdc"
+			  "\x56\x70\x12\xcf\x78\x2b\x77\xbf"
+			  "\x22\xf0\x1b\x17\x9c\xcc\xd6\x1b"
+			  "\x2d\x3d\xa0\x3b\xd8\xc9\x70\xa4"
+			  "\x7a\x3e\x07\xb9\x06\xc3\xfa\xb0"
+			  "\x33\xee\xc1\xd8\xf6\xe0\xf0\xb2"
+			  "\x61\x12\x69\xb0\x5f\x28\x99\xda"
+			  "\xc3\x61\x48\xfa\x07\x16\x03\xc4"
+			  "\xa8\xe1\x3c\xe8\x0e\x64\x15\x30"
+			  "\xc1\x9d\x84\x2f\x73\x98\x0e\x3a"
+			  "\xf2\x86\x21\xa4\x9e\x1d\xb5\x86"
+			  "\x16\xdb\x2b\x9a\x06\x64\x8e\x79"
+			  "\x8d\x76\x3e\xc3\xc2\x64\x44\xe3"
+			  "\xda\xbc\x1a\x52\xd7\x61\x03\x65"
+			  "\x54\x32\x77\x01\xed\x9d\x8a\x43"
+			  "\x25\x24\xe3\xc1\xbe\xb8\x2f\xcb"
+			  "\x89\x14\x64\xab\xf6\xa0\x6e\x02"
+			  "\x57\xe4\x7d\xa9\x4e\x9a\x03\x36"
+			  "\xad\xf1\xb1\xfc\x0b\xe6\x79\x51"
+			  "\x9f\x81\x77\xc4\x14\x78\x9d\xbf"
+			  "\xb6\xd6\xa3\x8c\xba\x0b\x26\xe7"
+			  "\xc8\xb9\x5c\xcc\xe1\x5f\xd5\xc6"
+			  "\xc4\xca\xc2\xa3\x45\xba\x94\x13"
+			  "\xb2\x8f\xc3\x54\x01\x09\xe7\x8b"
+			  "\xda\x2a\x0a\x11\x02\x43\xcb\x57"
+			  "\xc9\xcc\xb5\x5c\xab\xc4\xec\x54"
+			  "\x00\x06\x34\xe1\x6e\x03\x89\x7c"
+			  "\xc6\xfb\x6a\xc7\x60\x43\xd6\xc5"
+			  "\xb5\x68\x72\x89\x8f\x42\xc3\x74"
+			  "\xbd\x25\xaa\x9f\x67\xb5\xdf\x26"
+			  "\x20\xe8\xb7\x01\x3c\xe4\x77\xce"
+			  "\xc4\x65\xa7\x23\x79\xea\x33\xc7"
+			  "\x82\x14\x5c\x82\xf2\x4e\x3d\xf6"
+			  "\xc6\x4a\x0e\x29\xbb\xec\x44\xcd"
+			  "\x2f\xd1\x4f\x21\x71\xa9\xce\x0f"
+			  "\x5c\xf2\x72\x5c\x08\x2e\x21\xd2"
+			  "\xc3\x29\x13\xd8\xac\xc3\xda\x13"
+			  "\x1a\x9d\xa7\x71\x1d\x27\x1d\x27"
+			  "\x1d\xea\xab\x44\x79\xad\xe5\xeb"
+			  "\xef\x1f\x22\x0a\x44\x4f\xcb\x87"
+			  "\xa7\x58\x71\x0e\x66\xf8\x60\xbf"
+			  "\x60\x74\x4a\xb4\xec\x2e\xfe\xd3"
+			  "\xf5\xb8\xfe\x46\x08\x50\x99\x6c"
+			  "\x66\xa5\xa8\x34\x44\xb5\xe5\xf0"
+			  "\xdd\x2c\x67\x4e\x35\x96\x8e\x67"
+			  "\x48\x3f\x5f\x37\x44\x60\x51\x2e"
+			  "\x14\x91\x5e\x57\xc3\x0e\x79\x77"
+			  "\x2f\x03\xf4\xe2\x1c\x72\xbf\x85"
+			  "\x5d\xd3\x17\xdf\x6c\xc5\x70\x24"
+			  "\x42\xdf\x51\x4e\x2a\xb2\xd2\x5b"
+			  "\x9e\x69\x83\x41\x11\xfe\x73\x22"
+			  "\xde\x8a\x9e\xd8\x8a\xfb\x20\x38"
+			  "\xd8\x47\x6f\xd5\xed\x8f\x41\xfd"
+			  "\x13\x7a\x18\x03\x7d\x0f\xcd\x7d"
+			  "\xa6\x7d\x31\x9e\xf1\x8f\x30\xa3"
+			  "\x8b\x4c\x24\xb7\xf5\x48\xd7\xd9"
+			  "\x12\xe7\x84\x97\x5c\x31\x6d\xfb"
+			  "\xdf\xf3\xd3\xd1\xd5\x0c\x30\x06"
+			  "\x01\x6a\xbc\x6c\x78\x7b\xa6\x50"
+			  "\xfa\x0f\x3c\x42\x2d\xa5\xa3\x3b"
+			  "\xcf\x62\x50\xff\x71\x6d\xe7\xda"
+			  "\x27\xab\xc6\x67\x16\x65\x68\x64"
+			  "\xc7\xd5\x5f\x81\xa9\xf6\x65\xb3"
+			  "\x5e\x43\x91\x16\xcd\x3d\x55\x37"
+			  "\x55\xb3\xf0\x28\xc5\x54\x19\xc0"
+			  "\xe0\xd6\x2a\x61\xd4\xc8\x72\x51"
+			  "\xe9\xa1\x7b\x48\x21\xad\x44\x09"
+			  "\xe4\x01\x61\x3c\x8a\x5b\xf9\xa1"
+			  "\x6e\x1b\xdf\xc0\x04\xa8\x8b\xf2"
+			  "\x21\xbe\x34\x7b\xfc\xa1\xcd\xc9"
+			  "\xa9\x96\xf4\xa4\x4c\xf7\x4e\x8f"
+			  "\x84\xcc\xd3\xa8\x92\x77\x8f\x36"
+			  "\xe2\x2e\x8c\x33\xe8\x84\xa6\x0c"
+			  "\x6c\x8a\xda\x14\x32\xc2\x96\xff"
+			  "\xc6\x4a\xc2\x9b\x30\x7f\xd1\x29"
+			  "\xc0\xd5\x78\x41\x00\x80\x80\x03"
+			  "\x2a\xb1\xde\x26\x03\x48\x49\xee"
+			  "\x57\x14\x76\x51\x3c\x36\x5d\x0a"
+			  "\x5c\x9f\xe8\xd8\x53\xdb\x4f\xd4"
+			  "\x38\xbf\x66\xc9\x75\x12\x18\x75"
+			  "\x34\x2d\x93\x22\x96\x51\x24\x6e"
+			  "\x4e\xd9\x30\xea\x67\xff\x92\x1c"
+			  "\x16\x26\xe9\xb5\x33\xab\x8c\x22"
+			  "\x47\xdb\xa0\x2c\x08\xf0\x12\x69"
+			  "\x7e\x93\x52\xda\xa5\xe5\xca\xc1"
+			  "\x0f\x55\x2a\xbd\x09\x30\x88\x1b"
+			  "\x9c\xc6\x9f\xe6\xdb\xa6\x92\xeb"
+			  "\xf4\xbd\x5c\xc4\xdb\xc6\x71\x09"
+			  "\xab\x5e\x48\x0c\xed\x6f\xda\x8e"
+			  "\x8d\x0c\x98\x71\x7d\x10\xd0\x9c"
+			  "\x20\x9b\x79\x53\x26\x5d\xb9\x85"
+			  "\x8a\x31\xb8\xc5\x1c\x97\xde\x88"
+			  "\x61\x55\x7f\x7c\x21\x06\xea\xc4"
+			  "\x5f\xaf\xf2\xf0\xd5\x5e\x7d\xb4"
+			  "\x6e\xcf\xe9\xae\x1b\x0e\x11\x80"
+			  "\xc1\x9a\x74\x7e\x52\x6f\xa0\xb7"
+			  "\x24\xcd\x8d\x0a\x11\x40\x63\x72"
+			  "\xfa\xe2\xc5\xb3\x94\xef\x29\xa2"
+			  "\x1a\x23\x43\x04\x37\x55\x0d\xe9"
+			  "\x83\xb2\x29\x51\x49\x64\xa0\xbd"
+			  "\xde\x73\xfd\xa5\x7c\x95\x70\x62"
+			  "\x58\xdc\xe2\xd0\xbf\x98\xf5\x8a"
+			  "\x6a\xfd\xce\xa8\x0e\x42\x2a\xeb"
+			  "\xd2\xff\x83\x27\x53\x5c\xa0\x6e"
+			  "\x93\xef\xe2\xb9\x5d\x35\xd6\x98"
+			  "\xf6\x71\x19\x7a\x54\xa1\xa7\xe8"
+			  "\x09\xfe\xf6\x9e\xc7\xbd\x3e\x29"
+			  "\xbd\x6b\x17\xf4\xe7\x3e\x10\x5c"
+			  "\xc1\xd2\x59\x4f\x4b\x12\x1a\x5b"
+			  "\x50\x80\x59\xb9\xec\x13\x66\xa8"
+			  "\xd2\x31\x7b\x6a\x61\x22\xdd\x7d"
+			  "\x61\xee\x87\x16\x46\x9f\xf9\xc7"
+			  "\x41\xee\x74\xf8\xd0\x96\x2c\x76"
+			  "\x2a\xac\x7d\x6e\x9f\x0e\x7f\x95"
+			  "\xfe\x50\x16\xb2\x23\xca\x62\xd5"
+			  "\x68\xcf\x07\x3f\x3f\x97\x85\x2a"
+			  "\x0c\x25\x45\xba\xdb\x32\xcb\x83"
+			  "\x8c\x4f\xe0\x6d\x9a\x99\xf9\xc9"
+			  "\xda\xd4\x19\x31\xc1\x7c\x6d\xd9"
+			  "\x9c\x56\xd3\xec\xc1\x81\x4c\xed"
+			  "\x28\x9d\x87\xeb\x19\xd7\x1a\x4f"
+			  "\x04\x6a\xcb\x1f\xcf\x1f\xa2\x16"
+			  "\xfc\x2a\x0d\xa1\x14\x2d\xfa\xc5"
+			  "\x5a\xd2\xc5\xf9\x19\x7c\x20\x1f"
+			  "\x2d\x10\xc0\x66\x7c\xd9\x2d\xe5"
+			  "\x88\x70\x59\xa7\x85\xd5\x2e\x7c"
+			  "\x5c\xe3\xb7\x12\xd6\x97\x3f\x29",
+		.psize	= 2048,
+		.digest	= "\x37\x90\x92\xc2\xeb\x01\x87\xd9"
+			  "\x95\xc7\x91\xc3\x17\x8b\x38\x52",
+	}
+};
+
+
 /*
  * DES test vectors.
  */
diff --git a/include/crypto/nhpoly1305.h b/include/crypto/nhpoly1305.h
new file mode 100644
index 000000000000..53c04423c582
--- /dev/null
+++ b/include/crypto/nhpoly1305.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values and helper functions for the NHPoly1305 hash function.
+ */
+
+#ifndef _NHPOLY1305_H
+#define _NHPOLY1305_H
+
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+
+/* NH parameterization: */
+
+/* Endianness: little */
+/* Word size: 32 bits (works well on NEON, SSE2, AVX2) */
+
+/* Stride: 2 words (optimal on ARM32 NEON; works okay on other CPUs too) */
+#define NH_PAIR_STRIDE		2
+#define NH_MESSAGE_UNIT		(NH_PAIR_STRIDE * 2 * sizeof(u32))
+
+/* Num passes (Toeplitz iteration count): 4, to give ε = 2^{-128} */
+#define NH_NUM_PASSES		4
+#define NH_HASH_BYTES		(NH_NUM_PASSES * sizeof(u64))
+
+/* Max message size: 1024 bytes (32x compression factor) */
+#define NH_NUM_STRIDES		64
+#define NH_MESSAGE_WORDS	(NH_PAIR_STRIDE * 2 * NH_NUM_STRIDES)
+#define NH_MESSAGE_BYTES	(NH_MESSAGE_WORDS * sizeof(u32))
+#define NH_KEY_WORDS		(NH_MESSAGE_WORDS + \
+				 NH_PAIR_STRIDE * 2 * (NH_NUM_PASSES - 1))
+#define NH_KEY_BYTES		(NH_KEY_WORDS * sizeof(u32))
+
+#define NHPOLY1305_KEY_SIZE	(POLY1305_BLOCK_SIZE + NH_KEY_BYTES)
+
+struct nhpoly1305_key {
+	struct poly1305_key poly_key;
+	u32 nh_key[NH_KEY_WORDS];
+};
+
+struct nhpoly1305_state {
+
+	/* Running total of polynomial evaluation */
+	struct poly1305_state poly_state;
+
+	/* Partial block buffer */
+	u8 buffer[NH_MESSAGE_UNIT];
+	unsigned int buflen;
+
+	/*
+	 * Number of bytes remaining until the current NH message reaches
+	 * NH_MESSAGE_BYTES.  When nonzero, 'nh_hash' holds the partial NH hash.
+	 */
+	unsigned int nh_remaining;
+
+	__le64 nh_hash[NH_NUM_PASSES];
+};
+
+typedef void (*nh_t)(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES]);
+
+int crypto_nhpoly1305_setkey(struct crypto_shash *tfm,
+			     const u8 *key, unsigned int keylen);
+
+int crypto_nhpoly1305_init(struct shash_desc *desc);
+int crypto_nhpoly1305_update(struct shash_desc *desc,
+			     const u8 *src, unsigned int srclen);
+int crypto_nhpoly1305_update_helper(struct shash_desc *desc,
+				    const u8 *src, unsigned int srclen,
+				    nh_t nh_fn);
+int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst);
+int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst,
+				   nh_t nh_fn);
+
+#endif /* _NHPOLY1305_H */
-- 
cgit v1.2.3-59-g8ed1b


From 16aae3595a9d41c97d983889b341c455779c2ecf Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:30 -0800
Subject: crypto: arm/nhpoly1305 - add NEON-accelerated NHPoly1305
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an ARM NEON implementation of NHPoly1305, an ε-almost-∆-universal
hash function used in the Adiantum encryption mode.  For now, only the
NH portion is actually NEON-accelerated; the Poly1305 part is less
performance-critical so is just implemented in C.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm/crypto/Kconfig                |   5 ++
 arch/arm/crypto/Makefile               |   2 +
 arch/arm/crypto/nh-neon-core.S         | 116 +++++++++++++++++++++++++++++++++
 arch/arm/crypto/nhpoly1305-neon-glue.c |  77 ++++++++++++++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 arch/arm/crypto/nh-neon-core.S
 create mode 100644 arch/arm/crypto/nhpoly1305-neon-glue.c

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 59c674cf08ef..a95322b59799 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -131,4 +131,9 @@ config CRYPTO_CHACHA20_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 
+config CRYPTO_NHPOLY1305_NEON
+	tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_NHPOLY1305
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 005482ff9504..b65d6bfab8e6 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -53,6 +54,7 @@ ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S
new file mode 100644
index 000000000000..434d80ab531c
--- /dev/null
+++ b/arch/arm/crypto/nh-neon-core.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+
+	KEY		.req	r0
+	MESSAGE		.req	r1
+	MESSAGE_LEN	.req	r2
+	HASH		.req	r3
+
+	PASS0_SUMS	.req	q0
+	PASS0_SUM_A	.req	d0
+	PASS0_SUM_B	.req	d1
+	PASS1_SUMS	.req	q1
+	PASS1_SUM_A	.req	d2
+	PASS1_SUM_B	.req	d3
+	PASS2_SUMS	.req	q2
+	PASS2_SUM_A	.req	d4
+	PASS2_SUM_B	.req	d5
+	PASS3_SUMS	.req	q3
+	PASS3_SUM_A	.req	d6
+	PASS3_SUM_B	.req	d7
+	K0		.req	q4
+	K1		.req	q5
+	K2		.req	q6
+	K3		.req	q7
+	T0		.req	q8
+	T0_L		.req	d16
+	T0_H		.req	d17
+	T1		.req	q9
+	T1_L		.req	d18
+	T1_H		.req	d19
+	T2		.req	q10
+	T2_L		.req	d20
+	T2_H		.req	d21
+	T3		.req	q11
+	T3_L		.req	d22
+	T3_H		.req	d23
+
+.macro _nh_stride	k0, k1, k2, k3
+
+	// Load next message stride
+	vld1.8		{T3}, [MESSAGE]!
+
+	// Load next key stride
+	vld1.32		{\k3}, [KEY]!
+
+	// Add message words to key words
+	vadd.u32	T0, T3, \k0
+	vadd.u32	T1, T3, \k1
+	vadd.u32	T2, T3, \k2
+	vadd.u32	T3, T3, \k3
+
+	// Multiply 32x32 => 64 and accumulate
+	vmlal.u32	PASS0_SUMS, T0_L, T0_H
+	vmlal.u32	PASS1_SUMS, T1_L, T1_H
+	vmlal.u32	PASS2_SUMS, T2_L, T2_H
+	vmlal.u32	PASS3_SUMS, T3_L, T3_H
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_neon)
+
+	vld1.32		{K0,K1}, [KEY]!
+	  vmov.u64	PASS0_SUMS, #0
+	  vmov.u64	PASS1_SUMS, #0
+	vld1.32		{K2}, [KEY]!
+	  vmov.u64	PASS2_SUMS, #0
+	  vmov.u64	PASS3_SUMS, #0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	blt		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3
+	_nh_stride	K1, K2, K3, K0
+	_nh_stride	K2, K3, K0, K1
+	_nh_stride	K3, K0, K1, K2
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	bge		.Lloop4
+
+.Lloop4_done:
+	ands		MESSAGE_LEN, MESSAGE_LEN, #63
+	beq		.Ldone
+	_nh_stride	K0, K1, K2, K3
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K1, K2, K3, K0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K2, K3, K0, K1
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
+	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
+	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
+	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
+	vst1.8		{T0-T1}, [HASH]
+	bx		lr
+ENDPROC(nh_neon)
diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 000000000000..49aae87cb2bc
--- /dev/null
+++ b/arch/arm/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_neon(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !may_use_simd())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_neon_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+		kernel_neon_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-neon",
+	.base.cra_priority	= 200,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_neon_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
-- 
cgit v1.2.3-59-g8ed1b


From 059c2a4d8e164dccc3078e49e7f286023b019a98 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 16 Nov 2018 17:26:31 -0800
Subject: crypto: adiantum - add Adiantum support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the Adiantum encryption mode.  Adiantum was designed by
Paul Crowley and is specified by our paper:

    Adiantum: length-preserving encryption for entry-level processors
    (https://eprint.iacr.org/2018/720.pdf)

See our paper for full details; this patch only provides an overview.

Adiantum is a tweakable, length-preserving encryption mode designed for
fast and secure disk encryption, especially on CPUs without dedicated
crypto instructions.  Adiantum encrypts each sector using the XChaCha12
stream cipher, two passes of an ε-almost-∆-universal (εA∆U) hash
function, and an invocation of the AES-256 block cipher on a single
16-byte block.  On CPUs without AES instructions, Adiantum is much
faster than AES-XTS; for example, on ARM Cortex-A7, on 4096-byte sectors
Adiantum encryption is about 4 times faster than AES-256-XTS encryption,
and decryption about 5 times faster.

Adiantum is a specialization of the more general HBSH construction.  Our
earlier proposal, HPolyC, was also a HBSH specialization, but it used a
different εA∆U hash function, one based on Poly1305 only.  Adiantum's
εA∆U hash function, which is based primarily on the "NH" hash function
like that used in UMAC (RFC4418), is about twice as fast as HPolyC's;
consequently, Adiantum is about 20% faster than HPolyC.

This speed comes with no loss of security: Adiantum is provably just as
secure as HPolyC, in fact slightly *more* secure.  Like HPolyC,
Adiantum's security is reducible to that of XChaCha12 and AES-256,
subject to a security bound.  XChaCha12 itself has a security reduction
to ChaCha12.  Therefore, one need not "trust" Adiantum; one need only
trust ChaCha12 and AES-256.  Note that the εA∆U hash function is only
used for its proven combinatorical properties so cannot be "broken".

Adiantum is also a true wide-block encryption mode, so flipping any
plaintext bit in the sector scrambles the entire ciphertext, and vice
versa.  No other such mode is available in the kernel currently; doing
the same with XTS scrambles only 16 bytes.  Adiantum also supports
arbitrary-length tweaks and naturally supports any length input >= 16
bytes without needing "ciphertext stealing".

For the stream cipher, Adiantum uses XChaCha12 rather than XChaCha20 in
order to make encryption feasible on the widest range of devices.
Although the 20-round variant is quite popular, the best known attacks
on ChaCha are on only 7 rounds, so ChaCha12 still has a substantial
security margin; in fact, larger than AES-256's.  12-round Salsa20 is
also the eSTREAM recommendation.  For the block cipher, Adiantum uses
AES-256, despite it having a lower security margin than XChaCha12 and
needing table lookups, due to AES's extensive adoption and analysis
making it the obvious first choice.  Nevertheless, for flexibility this
patch also permits the "adiantum" template to be instantiated with
XChaCha20 and/or with an alternate block cipher.

We need Adiantum support in the kernel for use in dm-crypt and fscrypt,
where currently the only other suitable options are block cipher modes
such as AES-XTS.  A big problem with this is that many low-end mobile
devices (e.g. Android Go phones sold primarily in developing countries,
as well as some smartwatches) still have CPUs that lack AES
instructions, e.g. ARM Cortex-A7.  Sadly, AES-XTS encryption is much too
slow to be viable on these devices.  We did find that some "lightweight"
block ciphers are fast enough, but these suffer from problems such as
not having much cryptanalysis or being too controversial.

The ChaCha stream cipher has excellent performance but is insecure to
use directly for disk encryption, since each sector's IV is reused each
time it is overwritten.  Even restricting the threat model to offline
attacks only isn't enough, since modern flash storage devices don't
guarantee that "overwrites" are really overwrites, due to wear-leveling.
Adiantum avoids this problem by constructing a
"tweakable super-pseudorandom permutation"; this is the strongest
possible security model for length-preserving encryption.

Of course, storing random nonces along with the ciphertext would be the
ideal solution.  But doing that with existing hardware and filesystems
runs into major practical problems; in most cases it would require data
journaling (like dm-integrity) which severely degrades performance.
Thus, for now length-preserving encryption is still needed.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig    |  23 ++
 crypto/Makefile   |   1 +
 crypto/adiantum.c | 658 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 crypto/tcrypt.c   |  12 +
 crypto/testmgr.c  |  12 +
 crypto/testmgr.h  | 461 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 1167 insertions(+)
 create mode 100644 crypto/adiantum.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index eaeb8a986b7d..b6376d5d973e 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -501,6 +501,29 @@ config CRYPTO_NHPOLY1305
 	select CRYPTO_HASH
 	select CRYPTO_POLY1305
 
+config CRYPTO_ADIANTUM
+	tristate "Adiantum support"
+	select CRYPTO_CHACHA20
+	select CRYPTO_POLY1305
+	select CRYPTO_NHPOLY1305
+	help
+	  Adiantum is a tweakable, length-preserving encryption mode
+	  designed for fast and secure disk encryption, especially on
+	  CPUs without dedicated crypto instructions.  It encrypts
+	  each sector using the XChaCha12 stream cipher, two passes of
+	  an ε-almost-∆-universal hash function, and an invocation of
+	  the AES-256 block cipher on a single 16-byte block.  On CPUs
+	  without AES instructions, Adiantum is much faster than
+	  AES-XTS.
+
+	  Adiantum's security is provably reducible to that of its
+	  underlying stream and block ciphers, subject to a security
+	  bound.  Unlike XTS, Adiantum is a true wide-block encryption
+	  mode, so it actually provides an even stronger notion of
+	  security than XTS, subject to the security bound.
+
+	  If unsure, say N.
+
 comment "Hash modes"
 
 config CRYPTO_CMAC
diff --git a/crypto/Makefile b/crypto/Makefile
index c3310c85f09f..5e789dc2d4fd 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_CRYPTO_LRW) += lrw.o
 obj-$(CONFIG_CRYPTO_XTS) += xts.o
 obj-$(CONFIG_CRYPTO_CTR) += ctr.o
 obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o
+obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o
 obj-$(CONFIG_CRYPTO_GCM) += gcm.o
 obj-$(CONFIG_CRYPTO_CCM) += ccm.o
diff --git a/crypto/adiantum.c b/crypto/adiantum.c
new file mode 100644
index 000000000000..2dfcf12fd452
--- /dev/null
+++ b/crypto/adiantum.c
@@ -0,0 +1,658 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Adiantum length-preserving encryption mode
+ *
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Adiantum is a tweakable, length-preserving encryption mode designed for fast
+ * and secure disk encryption, especially on CPUs without dedicated crypto
+ * instructions.  Adiantum encrypts each sector using the XChaCha12 stream
+ * cipher, two passes of an ε-almost-∆-universal (εA∆U) hash function based on
+ * NH and Poly1305, and an invocation of the AES-256 block cipher on a single
+ * 16-byte block.  See the paper for details:
+ *
+ *	Adiantum: length-preserving encryption for entry-level processors
+ *      (https://eprint.iacr.org/2018/720.pdf)
+ *
+ * For flexibility, this implementation also allows other ciphers:
+ *
+ *	- Stream cipher: XChaCha12 or XChaCha20
+ *	- Block cipher: any with a 128-bit block size and 256-bit key
+ *
+ * This implementation doesn't currently allow other εA∆U hash functions, i.e.
+ * HPolyC is not supported.  This is because Adiantum is ~20% faster than HPolyC
+ * but still provably as secure, and also the εA∆U hash function of HBSH is
+ * formally defined to take two inputs (tweak, message) which makes it difficult
+ * to wrap with the crypto_shash API.  Rather, some details need to be handled
+ * here.  Nevertheless, if needed in the future, support for other εA∆U hash
+ * functions could be added here.
+ */
+
+#include <crypto/b128ops.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/nhpoly1305.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+
+#include "internal.h"
+
+/*
+ * Size of right-hand block of input data, in bytes; also the size of the block
+ * cipher's block size and the hash function's output.
+ */
+#define BLOCKCIPHER_BLOCK_SIZE		16
+
+/* Size of the block cipher key (K_E) in bytes */
+#define BLOCKCIPHER_KEY_SIZE		32
+
+/* Size of the hash key (K_H) in bytes */
+#define HASH_KEY_SIZE		(POLY1305_BLOCK_SIZE + NHPOLY1305_KEY_SIZE)
+
+/*
+ * The specification allows variable-length tweaks, but Linux's crypto API
+ * currently only allows algorithms to support a single length.  The "natural"
+ * tweak length for Adiantum is 16, since that fits into one Poly1305 block for
+ * the best performance.  But longer tweaks are useful for fscrypt, to avoid
+ * needing to derive per-file keys.  So instead we use two blocks, or 32 bytes.
+ */
+#define TWEAK_SIZE		32
+
+struct adiantum_instance_ctx {
+	struct crypto_skcipher_spawn streamcipher_spawn;
+	struct crypto_spawn blockcipher_spawn;
+	struct crypto_shash_spawn hash_spawn;
+};
+
+struct adiantum_tfm_ctx {
+	struct crypto_skcipher *streamcipher;
+	struct crypto_cipher *blockcipher;
+	struct crypto_shash *hash;
+	struct poly1305_key header_hash_key;
+};
+
+struct adiantum_request_ctx {
+
+	/*
+	 * Buffer for right-hand block of data, i.e.
+	 *
+	 *    P_L => P_M => C_M => C_R when encrypting, or
+	 *    C_R => C_M => P_M => P_L when decrypting.
+	 *
+	 * Also used to build the IV for the stream cipher.
+	 */
+	union {
+		u8 bytes[XCHACHA_IV_SIZE];
+		__le32 words[XCHACHA_IV_SIZE / sizeof(__le32)];
+		le128 bignum;	/* interpret as element of Z/(2^{128}Z) */
+	} rbuf;
+
+	bool enc; /* true if encrypting, false if decrypting */
+
+	/*
+	 * The result of the Poly1305 εA∆U hash function applied to
+	 * (message length, tweak).
+	 */
+	le128 header_hash;
+
+	/* Sub-requests, must be last */
+	union {
+		struct shash_desc hash_desc;
+		struct skcipher_request streamcipher_req;
+	} u;
+};
+
+/*
+ * Given the XChaCha stream key K_S, derive the block cipher key K_E and the
+ * hash key K_H as follows:
+ *
+ *     K_E || K_H || ... = XChaCha(key=K_S, nonce=1||0^191)
+ *
+ * Note that this denotes using bits from the XChaCha keystream, which here we
+ * get indirectly by encrypting a buffer containing all 0's.
+ */
+static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keylen)
+{
+	struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct {
+		u8 iv[XCHACHA_IV_SIZE];
+		u8 derived_keys[BLOCKCIPHER_KEY_SIZE + HASH_KEY_SIZE];
+		struct scatterlist sg;
+		struct crypto_wait wait;
+		struct skcipher_request req; /* must be last */
+	} *data;
+	u8 *keyp;
+	int err;
+
+	/* Set the stream cipher key (K_S) */
+	crypto_skcipher_clear_flags(tctx->streamcipher, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(tctx->streamcipher,
+				  crypto_skcipher_get_flags(tfm) &
+				  CRYPTO_TFM_REQ_MASK);
+	err = crypto_skcipher_setkey(tctx->streamcipher, key, keylen);
+	crypto_skcipher_set_flags(tfm,
+				crypto_skcipher_get_flags(tctx->streamcipher) &
+				CRYPTO_TFM_RES_MASK);
+	if (err)
+		return err;
+
+	/* Derive the subkeys */
+	data = kzalloc(sizeof(*data) +
+		       crypto_skcipher_reqsize(tctx->streamcipher), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->iv[0] = 1;
+	sg_init_one(&data->sg, data->derived_keys, sizeof(data->derived_keys));
+	crypto_init_wait(&data->wait);
+	skcipher_request_set_tfm(&data->req, tctx->streamcipher);
+	skcipher_request_set_callback(&data->req, CRYPTO_TFM_REQ_MAY_SLEEP |
+						  CRYPTO_TFM_REQ_MAY_BACKLOG,
+				      crypto_req_done, &data->wait);
+	skcipher_request_set_crypt(&data->req, &data->sg, &data->sg,
+				   sizeof(data->derived_keys), data->iv);
+	err = crypto_wait_req(crypto_skcipher_encrypt(&data->req), &data->wait);
+	if (err)
+		goto out;
+	keyp = data->derived_keys;
+
+	/* Set the block cipher key (K_E) */
+	crypto_cipher_clear_flags(tctx->blockcipher, CRYPTO_TFM_REQ_MASK);
+	crypto_cipher_set_flags(tctx->blockcipher,
+				crypto_skcipher_get_flags(tfm) &
+				CRYPTO_TFM_REQ_MASK);
+	err = crypto_cipher_setkey(tctx->blockcipher, keyp,
+				   BLOCKCIPHER_KEY_SIZE);
+	crypto_skcipher_set_flags(tfm,
+				  crypto_cipher_get_flags(tctx->blockcipher) &
+				  CRYPTO_TFM_RES_MASK);
+	if (err)
+		goto out;
+	keyp += BLOCKCIPHER_KEY_SIZE;
+
+	/* Set the hash key (K_H) */
+	poly1305_core_setkey(&tctx->header_hash_key, keyp);
+	keyp += POLY1305_BLOCK_SIZE;
+
+	crypto_shash_clear_flags(tctx->hash, CRYPTO_TFM_REQ_MASK);
+	crypto_shash_set_flags(tctx->hash, crypto_skcipher_get_flags(tfm) &
+					   CRYPTO_TFM_REQ_MASK);
+	err = crypto_shash_setkey(tctx->hash, keyp, NHPOLY1305_KEY_SIZE);
+	crypto_skcipher_set_flags(tfm, crypto_shash_get_flags(tctx->hash) &
+				       CRYPTO_TFM_RES_MASK);
+	keyp += NHPOLY1305_KEY_SIZE;
+	WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]);
+out:
+	kzfree(data);
+	return err;
+}
+
+/* Addition in Z/(2^{128}Z) */
+static inline void le128_add(le128 *r, const le128 *v1, const le128 *v2)
+{
+	u64 x = le64_to_cpu(v1->b);
+	u64 y = le64_to_cpu(v2->b);
+
+	r->b = cpu_to_le64(x + y);
+	r->a = cpu_to_le64(le64_to_cpu(v1->a) + le64_to_cpu(v2->a) +
+			   (x + y < x));
+}
+
+/* Subtraction in Z/(2^{128}Z) */
+static inline void le128_sub(le128 *r, const le128 *v1, const le128 *v2)
+{
+	u64 x = le64_to_cpu(v1->b);
+	u64 y = le64_to_cpu(v2->b);
+
+	r->b = cpu_to_le64(x - y);
+	r->a = cpu_to_le64(le64_to_cpu(v1->a) - le64_to_cpu(v2->a) -
+			   (x - y > x));
+}
+
+/*
+ * Apply the Poly1305 εA∆U hash function to (message length, tweak) and save the
+ * result to rctx->header_hash.
+ *
+ * This value is reused in both the first and second hash steps.  Specifically,
+ * it's added to the result of an independently keyed εA∆U hash function (for
+ * equal length inputs only) taken over the message.  This gives the overall
+ * Adiantum hash of the (tweak, message) pair.
+ */
+static void adiantum_hash_header(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	struct {
+		__le64 message_bits;
+		__le64 padding;
+	} header = {
+		.message_bits = cpu_to_le64((u64)bulk_len * 8)
+	};
+	struct poly1305_state state;
+
+	poly1305_core_init(&state);
+
+	BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
+	poly1305_core_blocks(&state, &tctx->header_hash_key,
+			     &header, sizeof(header) / POLY1305_BLOCK_SIZE);
+
+	BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
+	poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
+			     TWEAK_SIZE / POLY1305_BLOCK_SIZE);
+
+	poly1305_core_emit(&state, &rctx->header_hash);
+}
+
+/* Hash the left-hand block (the "bulk") of the message using NHPoly1305 */
+static int adiantum_hash_message(struct skcipher_request *req,
+				 struct scatterlist *sgl, le128 *digest)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	struct shash_desc *hash_desc = &rctx->u.hash_desc;
+	struct sg_mapping_iter miter;
+	unsigned int i, n;
+	int err;
+
+	hash_desc->tfm = tctx->hash;
+	hash_desc->flags = 0;
+
+	err = crypto_shash_init(hash_desc);
+	if (err)
+		return err;
+
+	sg_miter_start(&miter, sgl, sg_nents(sgl),
+		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);
+	for (i = 0; i < bulk_len; i += n) {
+		sg_miter_next(&miter);
+		n = min_t(unsigned int, miter.length, bulk_len - i);
+		err = crypto_shash_update(hash_desc, miter.addr, n);
+		if (err)
+			break;
+	}
+	sg_miter_stop(&miter);
+	if (err)
+		return err;
+
+	return crypto_shash_final(hash_desc, (u8 *)digest);
+}
+
+/* Continue Adiantum encryption/decryption after the stream cipher step */
+static int adiantum_finish(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	le128 digest;
+	int err;
+
+	/* If decrypting, decrypt C_M with the block cipher to get P_M */
+	if (!rctx->enc)
+		crypto_cipher_decrypt_one(tctx->blockcipher, rctx->rbuf.bytes,
+					  rctx->rbuf.bytes);
+
+	/*
+	 * Second hash step
+	 *	enc: C_R = C_M - H_{K_H}(T, C_L)
+	 *	dec: P_R = P_M - H_{K_H}(T, P_L)
+	 */
+	err = adiantum_hash_message(req, req->dst, &digest);
+	if (err)
+		return err;
+	le128_add(&digest, &digest, &rctx->header_hash);
+	le128_sub(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest);
+	scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->dst,
+				 bulk_len, BLOCKCIPHER_BLOCK_SIZE, 1);
+	return 0;
+}
+
+static void adiantum_streamcipher_done(struct crypto_async_request *areq,
+				       int err)
+{
+	struct skcipher_request *req = areq->data;
+
+	if (!err)
+		err = adiantum_finish(req);
+
+	skcipher_request_complete(req, err);
+}
+
+static int adiantum_crypt(struct skcipher_request *req, bool enc)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	unsigned int stream_len;
+	le128 digest;
+	int err;
+
+	if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE)
+		return -EINVAL;
+
+	rctx->enc = enc;
+
+	/*
+	 * First hash step
+	 *	enc: P_M = P_R + H_{K_H}(T, P_L)
+	 *	dec: C_M = C_R + H_{K_H}(T, C_L)
+	 */
+	adiantum_hash_header(req);
+	err = adiantum_hash_message(req, req->src, &digest);
+	if (err)
+		return err;
+	le128_add(&digest, &digest, &rctx->header_hash);
+	scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->src,
+				 bulk_len, BLOCKCIPHER_BLOCK_SIZE, 0);
+	le128_add(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest);
+
+	/* If encrypting, encrypt P_M with the block cipher to get C_M */
+	if (enc)
+		crypto_cipher_encrypt_one(tctx->blockcipher, rctx->rbuf.bytes,
+					  rctx->rbuf.bytes);
+
+	/* Initialize the rest of the XChaCha IV (first part is C_M) */
+	BUILD_BUG_ON(BLOCKCIPHER_BLOCK_SIZE != 16);
+	BUILD_BUG_ON(XCHACHA_IV_SIZE != 32);	/* nonce || stream position */
+	rctx->rbuf.words[4] = cpu_to_le32(1);
+	rctx->rbuf.words[5] = 0;
+	rctx->rbuf.words[6] = 0;
+	rctx->rbuf.words[7] = 0;
+
+	/*
+	 * XChaCha needs to be done on all the data except the last 16 bytes;
+	 * for disk encryption that usually means 4080 or 496 bytes.  But ChaCha
+	 * implementations tend to be most efficient when passed a whole number
+	 * of 64-byte ChaCha blocks, or sometimes even a multiple of 256 bytes.
+	 * And here it doesn't matter whether the last 16 bytes are written to,
+	 * as the second hash step will overwrite them.  Thus, round the XChaCha
+	 * length up to the next 64-byte boundary if possible.
+	 */
+	stream_len = bulk_len;
+	if (round_up(stream_len, CHACHA_BLOCK_SIZE) <= req->cryptlen)
+		stream_len = round_up(stream_len, CHACHA_BLOCK_SIZE);
+
+	skcipher_request_set_tfm(&rctx->u.streamcipher_req, tctx->streamcipher);
+	skcipher_request_set_crypt(&rctx->u.streamcipher_req, req->src,
+				   req->dst, stream_len, &rctx->rbuf);
+	skcipher_request_set_callback(&rctx->u.streamcipher_req,
+				      req->base.flags,
+				      adiantum_streamcipher_done, req);
+	return crypto_skcipher_encrypt(&rctx->u.streamcipher_req) ?:
+		adiantum_finish(req);
+}
+
+static int adiantum_encrypt(struct skcipher_request *req)
+{
+	return adiantum_crypt(req, true);
+}
+
+static int adiantum_decrypt(struct skcipher_request *req)
+{
+	return adiantum_crypt(req, false);
+}
+
+static int adiantum_init_tfm(struct crypto_skcipher *tfm)
+{
+	struct skcipher_instance *inst = skcipher_alg_instance(tfm);
+	struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst);
+	struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct crypto_skcipher *streamcipher;
+	struct crypto_cipher *blockcipher;
+	struct crypto_shash *hash;
+	unsigned int subreq_size;
+	int err;
+
+	streamcipher = crypto_spawn_skcipher(&ictx->streamcipher_spawn);
+	if (IS_ERR(streamcipher))
+		return PTR_ERR(streamcipher);
+
+	blockcipher = crypto_spawn_cipher(&ictx->blockcipher_spawn);
+	if (IS_ERR(blockcipher)) {
+		err = PTR_ERR(blockcipher);
+		goto err_free_streamcipher;
+	}
+
+	hash = crypto_spawn_shash(&ictx->hash_spawn);
+	if (IS_ERR(hash)) {
+		err = PTR_ERR(hash);
+		goto err_free_blockcipher;
+	}
+
+	tctx->streamcipher = streamcipher;
+	tctx->blockcipher = blockcipher;
+	tctx->hash = hash;
+
+	BUILD_BUG_ON(offsetofend(struct adiantum_request_ctx, u) !=
+		     sizeof(struct adiantum_request_ctx));
+	subreq_size = max(FIELD_SIZEOF(struct adiantum_request_ctx,
+				       u.hash_desc) +
+			  crypto_shash_descsize(hash),
+			  FIELD_SIZEOF(struct adiantum_request_ctx,
+				       u.streamcipher_req) +
+			  crypto_skcipher_reqsize(streamcipher));
+
+	crypto_skcipher_set_reqsize(tfm,
+				    offsetof(struct adiantum_request_ctx, u) +
+				    subreq_size);
+	return 0;
+
+err_free_blockcipher:
+	crypto_free_cipher(blockcipher);
+err_free_streamcipher:
+	crypto_free_skcipher(streamcipher);
+	return err;
+}
+
+static void adiantum_exit_tfm(struct crypto_skcipher *tfm)
+{
+	struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+
+	crypto_free_skcipher(tctx->streamcipher);
+	crypto_free_cipher(tctx->blockcipher);
+	crypto_free_shash(tctx->hash);
+}
+
+static void adiantum_free_instance(struct skcipher_instance *inst)
+{
+	struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst);
+
+	crypto_drop_skcipher(&ictx->streamcipher_spawn);
+	crypto_drop_spawn(&ictx->blockcipher_spawn);
+	crypto_drop_shash(&ictx->hash_spawn);
+	kfree(inst);
+}
+
+/*
+ * Check for a supported set of inner algorithms.
+ * See the comment at the beginning of this file.
+ */
+static bool adiantum_supported_algorithms(struct skcipher_alg *streamcipher_alg,
+					  struct crypto_alg *blockcipher_alg,
+					  struct shash_alg *hash_alg)
+{
+	if (strcmp(streamcipher_alg->base.cra_name, "xchacha12") != 0 &&
+	    strcmp(streamcipher_alg->base.cra_name, "xchacha20") != 0)
+		return false;
+
+	if (blockcipher_alg->cra_cipher.cia_min_keysize > BLOCKCIPHER_KEY_SIZE ||
+	    blockcipher_alg->cra_cipher.cia_max_keysize < BLOCKCIPHER_KEY_SIZE)
+		return false;
+	if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE)
+		return false;
+
+	if (strcmp(hash_alg->base.cra_name, "nhpoly1305") != 0)
+		return false;
+
+	return true;
+}
+
+static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct crypto_attr_type *algt;
+	const char *streamcipher_name;
+	const char *blockcipher_name;
+	const char *nhpoly1305_name;
+	struct skcipher_instance *inst;
+	struct adiantum_instance_ctx *ictx;
+	struct skcipher_alg *streamcipher_alg;
+	struct crypto_alg *blockcipher_alg;
+	struct crypto_alg *_hash_alg;
+	struct shash_alg *hash_alg;
+	int err;
+
+	algt = crypto_get_attr_type(tb);
+	if (IS_ERR(algt))
+		return PTR_ERR(algt);
+
+	if ((algt->type ^ CRYPTO_ALG_TYPE_SKCIPHER) & algt->mask)
+		return -EINVAL;
+
+	streamcipher_name = crypto_attr_alg_name(tb[1]);
+	if (IS_ERR(streamcipher_name))
+		return PTR_ERR(streamcipher_name);
+
+	blockcipher_name = crypto_attr_alg_name(tb[2]);
+	if (IS_ERR(blockcipher_name))
+		return PTR_ERR(blockcipher_name);
+
+	nhpoly1305_name = crypto_attr_alg_name(tb[3]);
+	if (nhpoly1305_name == ERR_PTR(-ENOENT))
+		nhpoly1305_name = "nhpoly1305";
+	if (IS_ERR(nhpoly1305_name))
+		return PTR_ERR(nhpoly1305_name);
+
+	inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
+	if (!inst)
+		return -ENOMEM;
+	ictx = skcipher_instance_ctx(inst);
+
+	/* Stream cipher, e.g. "xchacha12" */
+	err = crypto_grab_skcipher(&ictx->streamcipher_spawn, streamcipher_name,
+				   0, crypto_requires_sync(algt->type,
+							   algt->mask));
+	if (err)
+		goto out_free_inst;
+	streamcipher_alg = crypto_spawn_skcipher_alg(&ictx->streamcipher_spawn);
+
+	/* Block cipher, e.g. "aes" */
+	err = crypto_grab_spawn(&ictx->blockcipher_spawn, blockcipher_name,
+				CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_drop_streamcipher;
+	blockcipher_alg = ictx->blockcipher_spawn.alg;
+
+	/* NHPoly1305 εA∆U hash function */
+	_hash_alg = crypto_alg_mod_lookup(nhpoly1305_name,
+					  CRYPTO_ALG_TYPE_SHASH,
+					  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(_hash_alg)) {
+		err = PTR_ERR(_hash_alg);
+		goto out_drop_blockcipher;
+	}
+	hash_alg = __crypto_shash_alg(_hash_alg);
+	err = crypto_init_shash_spawn(&ictx->hash_spawn, hash_alg,
+				      skcipher_crypto_instance(inst));
+	if (err) {
+		crypto_mod_put(_hash_alg);
+		goto out_drop_blockcipher;
+	}
+
+	/* Check the set of algorithms */
+	if (!adiantum_supported_algorithms(streamcipher_alg, blockcipher_alg,
+					   hash_alg)) {
+		pr_warn("Unsupported Adiantum instantiation: (%s,%s,%s)\n",
+			streamcipher_alg->base.cra_name,
+			blockcipher_alg->cra_name, hash_alg->base.cra_name);
+		err = -EINVAL;
+		goto out_drop_hash;
+	}
+
+	/* Instance fields */
+
+	err = -ENAMETOOLONG;
+	if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
+		     "adiantum(%s,%s)", streamcipher_alg->base.cra_name,
+		     blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME)
+		goto out_drop_hash;
+	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
+		     "adiantum(%s,%s,%s)",
+		     streamcipher_alg->base.cra_driver_name,
+		     blockcipher_alg->cra_driver_name,
+		     hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+		goto out_drop_hash;
+
+	inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE;
+	inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx);
+	inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask |
+				       hash_alg->base.cra_alignmask;
+	/*
+	 * The block cipher is only invoked once per message, so for long
+	 * messages (e.g. sectors for disk encryption) its performance doesn't
+	 * matter as much as that of the stream cipher and hash function.  Thus,
+	 * weigh the block cipher's ->cra_priority less.
+	 */
+	inst->alg.base.cra_priority = (4 * streamcipher_alg->base.cra_priority +
+				       2 * hash_alg->base.cra_priority +
+				       blockcipher_alg->cra_priority) / 7;
+
+	inst->alg.setkey = adiantum_setkey;
+	inst->alg.encrypt = adiantum_encrypt;
+	inst->alg.decrypt = adiantum_decrypt;
+	inst->alg.init = adiantum_init_tfm;
+	inst->alg.exit = adiantum_exit_tfm;
+	inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(streamcipher_alg);
+	inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(streamcipher_alg);
+	inst->alg.ivsize = TWEAK_SIZE;
+
+	inst->free = adiantum_free_instance;
+
+	err = skcipher_register_instance(tmpl, inst);
+	if (err)
+		goto out_drop_hash;
+
+	return 0;
+
+out_drop_hash:
+	crypto_drop_shash(&ictx->hash_spawn);
+out_drop_blockcipher:
+	crypto_drop_spawn(&ictx->blockcipher_spawn);
+out_drop_streamcipher:
+	crypto_drop_skcipher(&ictx->streamcipher_spawn);
+out_free_inst:
+	kfree(inst);
+	return err;
+}
+
+/* adiantum(streamcipher_name, blockcipher_name [, nhpoly1305_name]) */
+static struct crypto_template adiantum_tmpl = {
+	.name = "adiantum",
+	.create = adiantum_create,
+	.module = THIS_MODULE,
+};
+
+static int __init adiantum_module_init(void)
+{
+	return crypto_register_template(&adiantum_tmpl);
+}
+
+static void __exit adiantum_module_exit(void)
+{
+	crypto_unregister_template(&adiantum_tmpl);
+}
+
+module_init(adiantum_module_init);
+module_exit(adiantum_module_exit);
+
+MODULE_DESCRIPTION("Adiantum length-preserving encryption mode");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("adiantum");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 5fb120474902..0590a9204562 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -2320,6 +2320,18 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0,
 				speed_template_16);
 		break;
+
+	case 219:
+		test_cipher_speed("adiantum(xchacha12,aes)", ENCRYPT, sec, NULL,
+				  0, speed_template_32);
+		test_cipher_speed("adiantum(xchacha12,aes)", DECRYPT, sec, NULL,
+				  0, speed_template_32);
+		test_cipher_speed("adiantum(xchacha20,aes)", ENCRYPT, sec, NULL,
+				  0, speed_template_32);
+		test_cipher_speed("adiantum(xchacha20,aes)", DECRYPT, sec, NULL,
+				  0, speed_template_32);
+		break;
+
 	case 300:
 		if (alg) {
 			test_hash_speed(alg, sec, generic_hash_speed_template);
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 665911c24786..0f684a414acb 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2404,6 +2404,18 @@ static int alg_test_null(const struct alg_test_desc *desc,
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
+		.alg = "adiantum(xchacha12,aes)",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(adiantum_xchacha12_aes_tv_template)
+		},
+	}, {
+		.alg = "adiantum(xchacha20,aes)",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(adiantum_xchacha20_aes_tv_template)
+		},
+	}, {
 		.alg = "aegis128",
 		.test = alg_test_aead,
 		.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 50ea1f2705c7..e7e56a8febbc 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -33381,6 +33381,467 @@ static const struct cipher_testvec xchacha12_tv_template[] = {
 	},
 };
 
+/* Adiantum test vectors from https://github.com/google/adiantum */
+static const struct cipher_testvec adiantum_xchacha12_aes_tv_template[] = {
+	{
+		.key	= "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+			  "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+			  "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+			  "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+		.klen	= 32,
+		.iv	= "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+			  "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+			  "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+			  "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+		.ptext	= "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+			  "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+		.ctext	= "\x6d\x32\x86\x18\x67\x86\x0f\x3f"
+			  "\x96\x7c\x9d\x28\x0d\x53\xec\x9f",
+		.len	= 16,
+		.also_non_np = 1,
+		.np	= 2,
+		.tap	= { 14, 2 },
+	}, {
+		.key	= "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+			  "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+			  "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+			  "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+		.klen	= 32,
+		.iv	= "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+			  "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+			  "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+			  "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+		.ptext	= "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+			  "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+			  "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+			  "\x43\x5a\x46\x06\x94\x2d\xf2",
+		.ctext	= "\xc7\xc6\xf1\x73\x8f\xc4\xff\x4a"
+			  "\x39\xbe\x78\xbe\x8d\x28\xc8\x89"
+			  "\x46\x63\xe7\x0c\x7d\x87\xe8\x4e"
+			  "\xc9\x18\x7b\xbe\x18\x60\x50",
+		.len	= 31,
+	}, {
+		.key	= "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+			  "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+			  "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+			  "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+		.klen	= 32,
+		.iv	= "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+			  "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+			  "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+			  "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+		.ptext	= "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+			  "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+			  "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+			  "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+			  "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+			  "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+			  "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+			  "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+			  "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+			  "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+			  "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+			  "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+			  "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+			  "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+			  "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+			  "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+		.ctext	= "\x9e\x16\xab\xed\x4b\xa7\x42\x5a"
+			  "\xc6\xfb\x4e\x76\xff\xbe\x03\xa0"
+			  "\x0f\xe3\xad\xba\xe4\x98\x2b\x0e"
+			  "\x21\x48\xa0\xb8\x65\x48\x27\x48"
+			  "\x84\x54\x54\xb2\x9a\x94\x7b\xe6"
+			  "\x4b\x29\xe9\xcf\x05\x91\x80\x1a"
+			  "\x3a\xf3\x41\x96\x85\x1d\x9f\x74"
+			  "\x51\x56\x63\xfa\x7c\x28\x85\x49"
+			  "\xf7\x2f\xf9\xf2\x18\x46\xf5\x33"
+			  "\x80\xa3\x3c\xce\xb2\x57\x93\xf5"
+			  "\xae\xbd\xa9\xf5\x7b\x30\xc4\x93"
+			  "\x66\xe0\x30\x77\x16\xe4\xa0\x31"
+			  "\xba\x70\xbc\x68\x13\xf5\xb0\x9a"
+			  "\xc1\xfc\x7e\xfe\x55\x80\x5c\x48"
+			  "\x74\xa6\xaa\xa3\xac\xdc\xc2\xf5"
+			  "\x8d\xde\x34\x86\x78\x60\x75\x8d",
+		.len	= 128,
+		.also_non_np = 1,
+		.np	= 4,
+		.tap	= { 104, 16, 4, 4 },
+	}, {
+		.key	= "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+			  "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+			  "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+			  "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+		.klen	= 32,
+		.iv	= "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+			  "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+			  "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+			  "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+		.ptext	= "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+			  "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+			  "\x05\xa3\x69\x60\x91\x36\x98\x57"
+			  "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+			  "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+			  "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+			  "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+			  "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+			  "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+			  "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+			  "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+			  "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+			  "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+			  "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+			  "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+			  "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+			  "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+			  "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+			  "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+			  "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+			  "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+			  "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+			  "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+			  "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+			  "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+			  "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+			  "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+			  "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+			  "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+			  "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+			  "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+			  "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+			  "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+			  "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+			  "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+			  "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+			  "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+			  "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+			  "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+			  "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+			  "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+			  "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+			  "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+			  "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+			  "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+			  "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+			  "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+			  "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+			  "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+			  "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+			  "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+			  "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+			  "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+			  "\x17\x7c\x25\x48\x52\x67\x11\x27"
+			  "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+			  "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+			  "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+			  "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+			  "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+			  "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+			  "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+			  "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+			  "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+			  "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+		.ctext	= "\x15\x97\xd0\x86\x18\x03\x9c\x51"
+			  "\xc5\x11\x36\x62\x13\x92\xe6\x73"
+			  "\x29\x79\xde\xa1\x00\x3e\x08\x64"
+			  "\x17\x1a\xbc\xd5\xfe\x33\x0e\x0c"
+			  "\x7c\x94\xa7\xc6\x3c\xbe\xac\xa2"
+			  "\x89\xe6\xbc\xdf\x0c\x33\x27\x42"
+			  "\x46\x73\x2f\xba\x4e\xa6\x46\x8f"
+			  "\xe4\xee\x39\x63\x42\x65\xa3\x88"
+			  "\x7a\xad\x33\x23\xa9\xa7\x20\x7f"
+			  "\x0b\xe6\x6a\xc3\x60\xda\x9e\xb4"
+			  "\xd6\x07\x8a\x77\x26\xd1\xab\x44"
+			  "\x99\x55\x03\x5e\xed\x8d\x7b\xbd"
+			  "\xc8\x21\xb7\x21\x30\x3f\xc0\xb5"
+			  "\xc8\xec\x6c\x23\xa6\xa3\x6d\xf1"
+			  "\x30\x0a\xd0\xa6\xa9\x28\x69\xae"
+			  "\x2a\xe6\x54\xac\x82\x9d\x6a\x95"
+			  "\x6f\x06\x44\xc5\x5a\x77\x6e\xec"
+			  "\xf8\xf8\x63\xb2\xe6\xaa\xbd\x8e"
+			  "\x0e\x8a\x62\x00\x03\xc8\x84\xdd"
+			  "\x47\x4a\xc3\x55\xba\xb7\xe7\xdf"
+			  "\x08\xbf\x62\xf5\xe8\xbc\xb6\x11"
+			  "\xe4\xcb\xd0\x66\x74\x32\xcf\xd4"
+			  "\xf8\x51\x80\x39\x14\x05\x12\xdb"
+			  "\x87\x93\xe2\x26\x30\x9c\x3a\x21"
+			  "\xe5\xd0\x38\x57\x80\x15\xe4\x08"
+			  "\x58\x05\x49\x7d\xe6\x92\x77\x70"
+			  "\xfb\x1e\x2d\x6a\x84\x00\xc8\x68"
+			  "\xf7\x1a\xdd\xf0\x7b\x38\x1e\xd8"
+			  "\x2c\x78\x78\x61\xcf\xe3\xde\x69"
+			  "\x1f\xd5\x03\xd5\x1a\xb4\xcf\x03"
+			  "\xc8\x7a\x70\x68\x35\xb4\xf6\xbe"
+			  "\x90\x62\xb2\x28\x99\x86\xf5\x44"
+			  "\x99\xeb\x31\xcf\xca\xdf\xd0\x21"
+			  "\xd6\x60\xf7\x0f\x40\xb4\x80\xb7"
+			  "\xab\xe1\x9b\x45\xba\x66\xda\xee"
+			  "\xdd\x04\x12\x40\x98\xe1\x69\xe5"
+			  "\x2b\x9c\x59\x80\xe7\x7b\xcc\x63"
+			  "\xa6\xc0\x3a\xa9\xfe\x8a\xf9\x62"
+			  "\x11\x34\x61\x94\x35\xfe\xf2\x99"
+			  "\xfd\xee\x19\xea\x95\xb6\x12\xbf"
+			  "\x1b\xdf\x02\x1a\xcc\x3e\x7e\x65"
+			  "\x78\x74\x10\x50\x29\x63\x28\xea"
+			  "\x6b\xab\xd4\x06\x4d\x15\x24\x31"
+			  "\xc7\x0a\xc9\x16\xb6\x48\xf0\xbf"
+			  "\x49\xdb\x68\x71\x31\x8f\x87\xe2"
+			  "\x13\x05\x64\xd6\x22\x0c\xf8\x36"
+			  "\x84\x24\x3e\x69\x5e\xb8\x9e\x16"
+			  "\x73\x6c\x83\x1e\xe0\x9f\x9e\xba"
+			  "\xe5\x59\x21\x33\x1b\xa9\x26\xc2"
+			  "\xc7\xd9\x30\x73\xb6\xa6\x73\x82"
+			  "\x19\xfa\x44\x4d\x40\x8b\x69\x04"
+			  "\x94\x74\xea\x6e\xb3\x09\x47\x01"
+			  "\x2a\xb9\x78\x34\x43\x11\xed\xd6"
+			  "\x8c\x95\x65\x1b\x85\x67\xa5\x40"
+			  "\xac\x9c\x05\x4b\x57\x4a\xa9\x96"
+			  "\x0f\xdd\x4f\xa1\xe0\xcf\x6e\xc7"
+			  "\x1b\xed\xa2\xb4\x56\x8c\x09\x6e"
+			  "\xa6\x65\xd7\x55\x81\xb7\xed\x11"
+			  "\x9b\x40\x75\xa8\x6b\x56\xaf\x16"
+			  "\x8b\x3d\xf4\xcb\xfe\xd5\x1d\x3d"
+			  "\x85\xc2\xc0\xde\x43\x39\x4a\x96"
+			  "\xba\x88\x97\xc0\xd6\x00\x0e\x27"
+			  "\x21\xb0\x21\x52\xba\xa7\x37\xaa"
+			  "\xcc\xbf\x95\xa8\xf4\xd0\x91\xf6",
+		.len	= 512,
+		.also_non_np = 1,
+		.np	= 2,
+		.tap	= { 144, 368 },
+	}
+};
+
+/* Adiantum with XChaCha20 instead of XChaCha12 */
+/* Test vectors from https://github.com/google/adiantum */
+static const struct cipher_testvec adiantum_xchacha20_aes_tv_template[] = {
+	{
+		.key	= "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+			  "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+			  "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+			  "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+		.klen	= 32,
+		.iv	= "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+			  "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+			  "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+			  "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+		.ptext	= "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+			  "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+		.ctext	= "\xf6\x78\x97\xd6\xaa\x94\x01\x27"
+			  "\x2e\x4d\x83\xe0\x6e\x64\x9a\xdf",
+		.len	= 16,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 5, 2, 9 },
+	}, {
+		.key	= "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+			  "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+			  "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+			  "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+		.klen	= 32,
+		.iv	= "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+			  "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+			  "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+			  "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+		.ptext	= "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+			  "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+			  "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+			  "\x43\x5a\x46\x06\x94\x2d\xf2",
+		.ctext	= "\x4b\xb8\x90\x10\xdf\x7f\x64\x08"
+			  "\x0e\x14\x42\x5f\x00\x74\x09\x36"
+			  "\x57\x72\xb5\xfd\xb5\x5d\xb8\x28"
+			  "\x0c\x04\x91\x14\x91\xe9\x37",
+		.len	= 31,
+		.also_non_np = 1,
+		.np	= 2,
+		.tap	= { 16, 15 },
+	}, {
+		.key	= "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+			  "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+			  "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+			  "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+		.klen	= 32,
+		.iv	= "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+			  "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+			  "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+			  "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+		.ptext	= "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+			  "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+			  "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+			  "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+			  "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+			  "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+			  "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+			  "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+			  "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+			  "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+			  "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+			  "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+			  "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+			  "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+			  "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+			  "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+		.ctext	= "\xb1\x8b\xa0\x05\x77\xa8\x4d\x59"
+			  "\x1b\x8e\x21\xfc\x3a\x49\xfa\xd4"
+			  "\xeb\x36\xf3\xc4\xdf\xdc\xae\x67"
+			  "\x07\x3f\x70\x0e\xe9\x66\xf5\x0c"
+			  "\x30\x4d\x66\xc9\xa4\x2f\x73\x9c"
+			  "\x13\xc8\x49\x44\xcc\x0a\x90\x9d"
+			  "\x7c\xdd\x19\x3f\xea\x72\x8d\x58"
+			  "\xab\xe7\x09\x2c\xec\xb5\x44\xd2"
+			  "\xca\xa6\x2d\x7a\x5c\x9c\x2b\x15"
+			  "\xec\x2a\xa6\x69\x91\xf9\xf3\x13"
+			  "\xf7\x72\xc1\xc1\x40\xd5\xe1\x94"
+			  "\xf4\x29\xa1\x3e\x25\x02\xa8\x3e"
+			  "\x94\xc1\x91\x14\xa1\x14\xcb\xbe"
+			  "\x67\x4c\xb9\x38\xfe\xa7\xaa\x32"
+			  "\x29\x62\x0d\xb2\xf6\x3c\x58\x57"
+			  "\xc1\xd5\x5a\xbb\xd6\xa6\x2a\xe5",
+		.len	= 128,
+		.also_non_np = 1,
+		.np	= 4,
+		.tap	= { 112, 7, 8, 1 },
+	}, {
+		.key	= "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+			  "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+			  "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+			  "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+		.klen	= 32,
+		.iv	= "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+			  "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+			  "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+			  "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+		.ptext	= "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+			  "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+			  "\x05\xa3\x69\x60\x91\x36\x98\x57"
+			  "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+			  "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+			  "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+			  "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+			  "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+			  "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+			  "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+			  "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+			  "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+			  "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+			  "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+			  "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+			  "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+			  "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+			  "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+			  "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+			  "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+			  "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+			  "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+			  "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+			  "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+			  "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+			  "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+			  "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+			  "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+			  "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+			  "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+			  "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+			  "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+			  "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+			  "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+			  "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+			  "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+			  "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+			  "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+			  "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+			  "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+			  "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+			  "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+			  "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+			  "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+			  "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+			  "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+			  "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+			  "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+			  "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+			  "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+			  "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+			  "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+			  "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+			  "\x17\x7c\x25\x48\x52\x67\x11\x27"
+			  "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+			  "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+			  "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+			  "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+			  "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+			  "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+			  "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+			  "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+			  "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+			  "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+		.ctext	= "\xe0\x33\xf6\xe0\xb4\xa5\xdd\x2b"
+			  "\xdd\xce\xfc\x12\x1e\xfc\x2d\xf2"
+			  "\x8b\xc7\xeb\xc1\xc4\x2a\xe8\x44"
+			  "\x0f\x3d\x97\x19\x2e\x6d\xa2\x38"
+			  "\x9d\xa6\xaa\xe1\x96\xb9\x08\xe8"
+			  "\x0b\x70\x48\x5c\xed\xb5\x9b\xcb"
+			  "\x8b\x40\x88\x7e\x69\x73\xf7\x16"
+			  "\x71\xbb\x5b\xfc\xa3\x47\x5d\xa6"
+			  "\xae\x3a\x64\xc4\xe7\xb8\xa8\xe7"
+			  "\xb1\x32\x19\xdb\xe3\x01\xb8\xf0"
+			  "\xa4\x86\xb4\x4c\xc2\xde\x5c\xd2"
+			  "\x6c\x77\xd2\xe8\x18\xb7\x0a\xc9"
+			  "\x3d\x53\xb5\xc4\x5c\xf0\x8c\x06"
+			  "\xdc\x90\xe0\x74\x47\x1b\x0b\xf6"
+			  "\xd2\x71\x6b\xc4\xf1\x97\x00\x2d"
+			  "\x63\x57\x44\x1f\x8c\xf4\xe6\x9b"
+			  "\xe0\x7a\xdd\xec\x32\x73\x42\x32"
+			  "\x7f\x35\x67\x60\x0d\xcf\x10\x52"
+			  "\x61\x22\x53\x8d\x8e\xbb\x33\x76"
+			  "\x59\xd9\x10\xce\xdf\xef\xc0\x41"
+			  "\xd5\x33\x29\x6a\xda\x46\xa4\x51"
+			  "\xf0\x99\x3d\x96\x31\xdd\xb5\xcb"
+			  "\x3e\x2a\x1f\xc7\x5c\x79\xd3\xc5"
+			  "\x20\xa1\xb1\x39\x1b\xc6\x0a\x70"
+			  "\x26\x39\x95\x07\xad\x7a\xc9\x69"
+			  "\xfe\x81\xc7\x88\x08\x38\xaf\xad"
+			  "\x9e\x8d\xfb\xe8\x24\x0d\x22\xb8"
+			  "\x0e\xed\xbe\x37\x53\x7c\xa6\xc6"
+			  "\x78\x62\xec\xa3\x59\xd9\xc6\x9d"
+			  "\xb8\x0e\x69\x77\x84\x2d\x6a\x4c"
+			  "\xc5\xd9\xb2\xa0\x2b\xa8\x80\xcc"
+			  "\xe9\x1e\x9c\x5a\xc4\xa1\xb2\x37"
+			  "\x06\x9b\x30\x32\x67\xf7\xe7\xd2"
+			  "\x42\xc7\xdf\x4e\xd4\xcb\xa0\x12"
+			  "\x94\xa1\x34\x85\x93\x50\x4b\x0a"
+			  "\x3c\x7d\x49\x25\x01\x41\x6b\x96"
+			  "\xa9\x12\xbb\x0b\xc0\xd7\xd0\x93"
+			  "\x1f\x70\x38\xb8\x21\xee\xf6\xa7"
+			  "\xee\xeb\xe7\x81\xa4\x13\xb4\x87"
+			  "\xfa\xc1\xb0\xb5\x37\x8b\x74\xa2"
+			  "\x4e\xc7\xc2\xad\x3d\x62\x3f\xf8"
+			  "\x34\x42\xe5\xae\x45\x13\x63\xfe"
+			  "\xfc\x2a\x17\x46\x61\xa9\xd3\x1c"
+			  "\x4c\xaf\xf0\x09\x62\x26\x66\x1e"
+			  "\x74\xcf\xd6\x68\x3d\x7d\xd8\xb7"
+			  "\xe7\xe6\xf8\xf0\x08\x20\xf7\x47"
+			  "\x1c\x52\xaa\x0f\x3e\x21\xa3\xf2"
+			  "\xbf\x2f\x95\x16\xa8\xc8\xc8\x8c"
+			  "\x99\x0f\x5d\xfb\xfa\x2b\x58\x8a"
+			  "\x7e\xd6\x74\x02\x60\xf0\xd0\x5b"
+			  "\x65\xa8\xac\xea\x8d\x68\x46\x34"
+			  "\x26\x9d\x4f\xb1\x9a\x8e\xc0\x1a"
+			  "\xf1\xed\xc6\x7a\x83\xfd\x8a\x57"
+			  "\xf2\xe6\xe4\xba\xfc\xc6\x3c\xad"
+			  "\x5b\x19\x50\x2f\x3a\xcc\x06\x46"
+			  "\x04\x51\x3f\x91\x97\xf0\xd2\x07"
+			  "\xe7\x93\x89\x7e\xb5\x32\x0f\x03"
+			  "\xe5\x58\x9e\x74\x72\xeb\xc2\x38"
+			  "\x00\x0c\x91\x72\x69\xed\x7d\x6d"
+			  "\xc8\x71\xf0\xec\xff\x80\xd9\x1c"
+			  "\x9e\xd2\xfa\x15\xfc\x6c\x4e\xbc"
+			  "\xb1\xa6\xbd\xbd\x70\x40\xca\x20"
+			  "\xb8\x78\xd2\xa3\xc6\xf3\x79\x9c"
+			  "\xc7\x27\xe1\x6a\x29\xad\xa4\x03",
+		.len	= 512,
+	}
+};
+
 /*
  * CTS (Cipher Text Stealing) mode tests
  */
-- 
cgit v1.2.3-59-g8ed1b


From cee7a36ecb5bafef8c87fb2c10641e6125044154 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Tue, 20 Nov 2018 17:30:48 +0100
Subject: crypto: x86/chacha20 - Add a 8-block AVX-512VL variant

This variant is similar to the AVX2 version, but benefits from the AVX-512
rotate instructions and the additional registers, so it can operate without
any data on the stack. It uses ymm registers only to avoid the massive core
throttling on Skylake-X platforms. Nontheless does it bring a ~30% speed
improvement compared to the AVX2 variant for random encryption lengths.

The AVX2 version uses "rep movsb" for partial block XORing via the stack.
With AVX-512, the new "vmovdqu8" can do this much more efficiently. The
associated "kmov" instructions to work with dynamic masks is not part of
the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given
that the major AVX-512VL architectures provide AVX-512BW and this extension
does not affect core clocking, this seems to be no problem at least for
now.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                   |   5 +
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 396 +++++++++++++++++++++++++++++
 arch/x86/crypto/chacha20_glue.c            |  26 ++
 3 files changed, 427 insertions(+)
 create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a4b0007a54e1..ce4e43642984 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
 				$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
 sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
 sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
 
@@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes)
 	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
 endif
 
+ifeq ($(avx512_supported),yes)
+	chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+endif
+
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
new file mode 100644
index 000000000000..e1877afcaa73
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -0,0 +1,396 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+
+	# This function encrypts eight consecutive ChaCha20 blocks by loading
+	# the state matrix in AVX registers eight times. Compared to AVX2, this
+	# mostly benefits from the new rotate instructions in VL and the
+	# additional registers.
+
+	vzeroupper
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+
+	# x12 += counter values 0-3
+	vpaddd		CTR8BL(%rip),%ymm12,%ymm12
+
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+	vmovdqa64	%ymm4,%ymm20
+	vmovdqa64	%ymm5,%ymm21
+	vmovdqa64	%ymm6,%ymm22
+	vmovdqa64	%ymm7,%ymm23
+	vmovdqa64	%ymm8,%ymm24
+	vmovdqa64	%ymm9,%ymm25
+	vmovdqa64	%ymm10,%ymm26
+	vmovdqa64	%ymm11,%ymm27
+	vmovdqa64	%ymm12,%ymm28
+	vmovdqa64	%ymm13,%ymm29
+	vmovdqa64	%ymm14,%ymm30
+	vmovdqa64	%ymm15,%ymm31
+
+	mov		$10,%eax
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+
+	dec		%eax
+	jnz		.Ldoubleround8
+
+	# x0..15[0-3] += s[0..15]
+	vpaddd		%ymm16,%ymm0,%ymm0
+	vpaddd		%ymm17,%ymm1,%ymm1
+	vpaddd		%ymm18,%ymm2,%ymm2
+	vpaddd		%ymm19,%ymm3,%ymm3
+	vpaddd		%ymm20,%ymm4,%ymm4
+	vpaddd		%ymm21,%ymm5,%ymm5
+	vpaddd		%ymm22,%ymm6,%ymm6
+	vpaddd		%ymm23,%ymm7,%ymm7
+	vpaddd		%ymm24,%ymm8,%ymm8
+	vpaddd		%ymm25,%ymm9,%ymm9
+	vpaddd		%ymm26,%ymm10,%ymm10
+	vpaddd		%ymm27,%ymm11,%ymm11
+	vpaddd		%ymm28,%ymm12,%ymm12
+	vpaddd		%ymm29,%ymm13,%ymm13
+	vpaddd		%ymm30,%ymm14,%ymm14
+	vpaddd		%ymm31,%ymm15,%ymm15
+
+	# interleave 32-bit words in state n, n+1
+	vpunpckldq	%ymm1,%ymm0,%ymm16
+	vpunpckhdq	%ymm1,%ymm0,%ymm17
+	vpunpckldq	%ymm3,%ymm2,%ymm18
+	vpunpckhdq	%ymm3,%ymm2,%ymm19
+	vpunpckldq	%ymm5,%ymm4,%ymm20
+	vpunpckhdq	%ymm5,%ymm4,%ymm21
+	vpunpckldq	%ymm7,%ymm6,%ymm22
+	vpunpckhdq	%ymm7,%ymm6,%ymm23
+	vpunpckldq	%ymm9,%ymm8,%ymm24
+	vpunpckhdq	%ymm9,%ymm8,%ymm25
+	vpunpckldq	%ymm11,%ymm10,%ymm26
+	vpunpckhdq	%ymm11,%ymm10,%ymm27
+	vpunpckldq	%ymm13,%ymm12,%ymm28
+	vpunpckhdq	%ymm13,%ymm12,%ymm29
+	vpunpckldq	%ymm15,%ymm14,%ymm30
+	vpunpckhdq	%ymm15,%ymm14,%ymm31
+
+	# interleave 64-bit words in state n, n+2
+	vpunpcklqdq	%ymm18,%ymm16,%ymm0
+	vpunpcklqdq	%ymm19,%ymm17,%ymm1
+	vpunpckhqdq	%ymm18,%ymm16,%ymm2
+	vpunpckhqdq	%ymm19,%ymm17,%ymm3
+	vpunpcklqdq	%ymm22,%ymm20,%ymm4
+	vpunpcklqdq	%ymm23,%ymm21,%ymm5
+	vpunpckhqdq	%ymm22,%ymm20,%ymm6
+	vpunpckhqdq	%ymm23,%ymm21,%ymm7
+	vpunpcklqdq	%ymm26,%ymm24,%ymm8
+	vpunpcklqdq	%ymm27,%ymm25,%ymm9
+	vpunpckhqdq	%ymm26,%ymm24,%ymm10
+	vpunpckhqdq	%ymm27,%ymm25,%ymm11
+	vpunpcklqdq	%ymm30,%ymm28,%ymm12
+	vpunpcklqdq	%ymm31,%ymm29,%ymm13
+	vpunpckhqdq	%ymm30,%ymm28,%ymm14
+	vpunpckhqdq	%ymm31,%ymm29,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa64	%ymm0,%ymm16
+	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
+	cmp		$0x0020,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0000(%rsi)
+	vmovdqa64	%ymm16,%ymm0
+	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
+	cmp		$0x0060,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
+	cmp		$0x00e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa64	%ymm4,%ymm0
+	cmp		$0x0120,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0100(%rsi)
+
+	vmovdqa64	%ymm12,%ymm0
+	cmp		$0x0140,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0120(%rsi)
+
+	vmovdqa64	%ymm6,%ymm0
+	cmp		$0x0160,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0140(%rsi)
+
+	vmovdqa64	%ymm14,%ymm0
+	cmp		$0x0180,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0160(%rsi)
+
+	vmovdqa64	%ymm5,%ymm0
+	cmp		$0x01a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0180(%rsi)
+
+	vmovdqa64	%ymm13,%ymm0
+	cmp		$0x01c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01a0(%rsi)
+
+	vmovdqa64	%ymm7,%ymm0
+	cmp		$0x01e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01c0(%rsi)
+
+	vmovdqa64	%ymm15,%ymm0
+	cmp		$0x0200,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	ret
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0x1f,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0x1f,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
+	vpxord		%ymm0,%ymm1,%ymm1
+	vmovdqu8	%ymm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone8
+
+ENDPROC(chacha20_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 1e9e66509226..6a67e70bc82a 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 static bool chacha20_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					     unsigned int len);
+static bool chacha20_use_avx512vl;
+#endif
 #endif
 
 static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
@@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
 #ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+	if (chacha20_use_avx512vl) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha20_8block_xor_avx512vl(state, dst, src, bytes);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha20_8block_xor_avx512vl(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 8);
+			return;
+		}
+	}
+#endif
 	if (chacha20_use_avx2) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha20_8block_xor_avx2(state, dst, src, bytes);
@@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void)
 	chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
 			    boot_cpu_has(X86_FEATURE_AVX2) &&
 			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+	chacha20_use_avx512vl = chacha20_use_avx2 &&
+				boot_cpu_has(X86_FEATURE_AVX512VL) &&
+				boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
 #endif
 	return crypto_register_skcipher(&alg);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 29a47b54e030efe308aa90e6c26a9ce7f5f84ed8 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Tue, 20 Nov 2018 17:30:49 +0100
Subject: crypto: x86/chacha20 - Add a 2-block AVX-512VL variant

This version uses the same principle as the AVX2 version. It benefits
from the AVX-512VL rotate instructions and the more efficient partial
block handling using "vmovdqu8", resulting in a speedup of ~20%.

Unlike the AVX2 version, it is faster than the single block SSSE3 version
to process a single block. Hence we engage that function for (partial)
single block lengths as well.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 171 +++++++++++++++++++++++++++++
 arch/x86/crypto/chacha20_glue.c            |   7 ++
 2 files changed, 178 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
index e1877afcaa73..261097578715 100644
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -7,6 +7,11 @@
 
 #include <linux/linkage.h>
 
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
 .section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
 .align 32
 CTR8BL:	.octa 0x00000003000000020000000100000000
@@ -14,6 +19,172 @@ CTR8BL:	.octa 0x00000003000000020000000100000000
 
 .text
 
+ENTRY(chacha20_2block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+
+	# This function encrypts two ChaCha20 blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-2] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+	mov		$10,%rax
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	dec		%rax
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rcx
+	jl		.Lxorpart2
+	vpxord		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rcx
+	jl		.Lxorpart2
+	vpxord		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rcx
+	jl		.Lxorpart2
+	vpxord		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rcx
+	jl		.Lxorpart2
+	vpxord		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rcx
+	jl		.Lxorpart2
+	vpxord		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rcx
+	jl		.Lxorpart2
+	vpxord		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rcx
+	jl		.Lxorpart2
+	vpxord		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rcx
+	jl		.Lxorpart2
+	vpxord		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	ret
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm7,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone2
+
+ENDPROC(chacha20_2block_xor_avx512vl)
+
 ENTRY(chacha20_8block_xor_avx512vl)
 	# %rdi: Input state matrix, s
 	# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 6a67e70bc82a..d6a95a6a324e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -32,6 +32,8 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 static bool chacha20_use_avx2;
 #ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					     unsigned int len);
 asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					     unsigned int len);
 static bool chacha20_use_avx512vl;
@@ -62,6 +64,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			state[12] += chacha20_advance(bytes, 8);
 			return;
 		}
+		if (bytes) {
+			chacha20_2block_xor_avx512vl(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 2);
+			return;
+		}
 	}
 #endif
 	if (chacha20_use_avx2) {
-- 
cgit v1.2.3-59-g8ed1b


From 180def6c4ad139ae6f97953ae810092ace295d5b Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Tue, 20 Nov 2018 17:30:50 +0100
Subject: crypto: x86/chacha20 - Add a 4-block AVX-512VL variant

This version uses the same principle as the AVX2 version by scheduling the
operations for two block pairs in parallel. It benefits from the AVX-512VL
rotate instructions and the more efficient partial block handling using
"vmovdqu8", resulting in a speedup of the raw block function of ~20%.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-avx512vl-x86_64.S | 272 +++++++++++++++++++++++++++++
 arch/x86/crypto/chacha20_glue.c            |   7 +
 2 files changed, 279 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
index 261097578715..55d34de29e3e 100644
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -12,6 +12,11 @@
 CTR2BL:	.octa 0x00000000000000000000000000000000
 	.octa 0x00000000000000000000000000000001
 
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
 .section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
 .align 32
 CTR8BL:	.octa 0x00000003000000020000000100000000
@@ -185,6 +190,273 @@ ENTRY(chacha20_2block_xor_avx512vl)
 
 ENDPROC(chacha20_2block_xor_avx512vl)
 
+ENTRY(chacha20_4block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+
+	# This function encrypts four ChaCha20 block by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, sequentially
+	# to the operations on the four words of the other two matrices. The
+	# required word shuffling has a rather high latency, we can do the
+	# arithmetic on two matrix-pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-4] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+	mov		$10,%rax
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	dec		%rax
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rcx
+	jl		.Lxorpart4
+	vpxord		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rcx
+	jl		.Lxorpart4
+	vpxord		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rcx
+	jl		.Lxorpart4
+	vpxord		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rcx
+	jl		.Lxorpart4
+	vpxord		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rcx
+	jl		.Lxorpart4
+	vpxord		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rcx
+	jl		.Lxorpart4
+	vpxord		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rcx
+	jl		.Lxorpart4
+	vpxord		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rcx
+	jl		.Lxorpart4
+	vpxord		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rcx
+	jl		.Lxorpart4
+	vpxord		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rcx
+	jl		.Lxorpart4
+	vpxord		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rcx
+	jl		.Lxorpart4
+	vpxord		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm10,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone4
+
+ENDPROC(chacha20_4block_xor_avx512vl)
+
 ENTRY(chacha20_8block_xor_avx512vl)
 	# %rdi: Input state matrix, s
 	# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index d6a95a6a324e..773d075a1483 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -34,6 +34,8 @@ static bool chacha20_use_avx2;
 #ifdef CONFIG_AS_AVX512
 asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					     unsigned int len);
+asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					     unsigned int len);
 asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
 					     unsigned int len);
 static bool chacha20_use_avx512vl;
@@ -64,6 +66,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			state[12] += chacha20_advance(bytes, 8);
 			return;
 		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha20_4block_xor_avx512vl(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 4);
+			return;
+		}
 		if (bytes) {
 			chacha20_2block_xor_avx512vl(state, dst, src, bytes);
 			state[12] += chacha20_advance(bytes, 2);
-- 
cgit v1.2.3-59-g8ed1b


From 4bede34c1aa19266628b13f68db95ec4a16a6f38 Mon Sep 17 00:00:00 2001
From: "Nagadheeraj, Rottela" <Rottela.Nagadheeraj@cavium.com>
Date: Wed, 21 Nov 2018 07:36:58 +0000
Subject: crypto: cavium/nitrox - crypto request format changes

nitrox_skcipher_crypt() will do the necessary formatting/ordering of
input and output sglists based on the algorithm requirements.
It will also accommodate the mandatory output buffers required for
NITROX hardware like Output request headers (ORH) and Completion headers.

Signed-off-by: Nagadheeraj Rottela <rottela.nagadheeraj@cavium.com>
Reviewed-by: Srikanth Jampala <Jampala.Srikanth@cavium.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/nitrox_algs.c   | 111 ++++++++++-
 drivers/crypto/cavium/nitrox/nitrox_req.h    |  94 ++++++----
 drivers/crypto/cavium/nitrox/nitrox_reqmgr.c | 266 ++++++---------------------
 3 files changed, 227 insertions(+), 244 deletions(-)

diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c
index 5d54ebc20cb3..10075a97ff0d 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_algs.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c
@@ -155,13 +155,109 @@ static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key,
 	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
 }
 
+static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int nents = sg_nents(skreq->src) + 1;
+	struct se_crypto_request *creq = &nkreq->creq;
+	char *iv;
+	struct scatterlist *sg;
+
+	/* Allocate buffer to hold IV and input scatterlist array */
+	nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp);
+	if (!nkreq->src)
+		return -ENOMEM;
+
+	/* copy iv */
+	iv = nkreq->src;
+	memcpy(iv, skreq->iv, ivsize);
+
+	sg = (struct scatterlist *)(iv + ivsize);
+	creq->src = sg;
+	sg_init_table(sg, nents);
+
+	/* Input format:
+	 * +----+----------------+
+	 * | IV | SRC sg entries |
+	 * +----+----------------+
+	 */
+
+	/* IV */
+	sg = create_single_sg(sg, iv, ivsize);
+	/* SRC entries */
+	create_multi_sg(sg, skreq->src);
+
+	return 0;
+}
+
+static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int nents = sg_nents(skreq->dst) + 3;
+	int extralen = ORH_HLEN + COMP_HLEN;
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct scatterlist *sg;
+	char *iv = nkreq->src;
+
+	/* Allocate buffer to hold ORH, COMPLETION and output scatterlist
+	 * array
+	 */
+	nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp);
+	if (!nkreq->dst)
+		return -ENOMEM;
+
+	creq->orh = (u64 *)(nkreq->dst);
+	set_orh_value(creq->orh);
+
+	creq->comp = (u64 *)(nkreq->dst + ORH_HLEN);
+	set_comp_value(creq->comp);
+
+	sg = (struct scatterlist *)(nkreq->dst + ORH_HLEN + COMP_HLEN);
+	creq->dst = sg;
+	sg_init_table(sg, nents);
+
+	/* Output format:
+	 * +-----+----+----------------+-----------------+
+	 * | ORH | IV | DST sg entries | COMPLETION Bytes|
+	 * +-----+----+----------------+-----------------+
+	 */
+
+	/* ORH */
+	sg = create_single_sg(sg, creq->orh, ORH_HLEN);
+	/* IV */
+	sg = create_single_sg(sg, iv, ivsize);
+	/* DST entries */
+	sg = create_multi_sg(sg, skreq->dst);
+	/* COMPLETION Bytes */
+	create_single_sg(sg, creq->comp, COMP_HLEN);
+
+	return 0;
+}
+
+static void free_src_sglist(struct skcipher_request *skreq)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+
+	kfree(nkreq->src);
+}
+
+static void free_dst_sglist(struct skcipher_request *skreq)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+
+	kfree(nkreq->dst);
+}
+
 static void nitrox_skcipher_callback(struct skcipher_request *skreq,
 				     int err)
 {
+	free_src_sglist(skreq);
+	free_dst_sglist(skreq);
 	if (err) {
 		pr_err_ratelimited("request failed status 0x%0x\n", err);
 		err = -EINVAL;
 	}
+
 	skcipher_request_complete(skreq, err);
 }
 
@@ -172,6 +268,7 @@ static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc)
 	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
 	int ivsize = crypto_skcipher_ivsize(cipher);
 	struct se_crypto_request *creq;
+	int ret;
 
 	creq = &nkreq->creq;
 	creq->flags = skreq->base.flags;
@@ -192,11 +289,15 @@ static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc)
 	creq->ctx_handle = nctx->u.ctx_handle;
 	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
 
-	/* copy the iv */
-	memcpy(creq->iv, skreq->iv, ivsize);
-	creq->ivsize = ivsize;
-	creq->src = skreq->src;
-	creq->dst = skreq->dst;
+	ret = alloc_src_sglist(skreq, ivsize);
+	if (ret)
+		return ret;
+
+	ret = alloc_dst_sglist(skreq, ivsize);
+	if (ret) {
+		free_src_sglist(skreq);
+		return ret;
+	}
 
 	nkreq->nctx = nctx;
 	nkreq->skreq = skreq;
diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h
index 19f0a20e3bb3..d45ff91c19a9 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_req.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_req.h
@@ -7,6 +7,8 @@
 
 #include "nitrox_dev.h"
 
+#define PENDING_SIG	0xFFFFFFFFFFFFFFFFUL
+
 /**
  * struct gphdr - General purpose Header
  * @param0: first parameter.
@@ -46,13 +48,6 @@ union se_req_ctrl {
 	} s;
 };
 
-struct nitrox_sglist {
-	u16 len;
-	u16 raz0;
-	u32 raz1;
-	dma_addr_t dma;
-};
-
 #define MAX_IV_LEN 16
 
 /**
@@ -62,8 +57,10 @@ struct nitrox_sglist {
  * @ctx_handle: Crypto context handle.
  * @gph: GP Header
  * @ctrl: Request Information.
- * @in: Input sglist
- * @out: Output sglist
+ * @orh: ORH address
+ * @comp: completion address
+ * @src: Input sglist
+ * @dst: Output sglist
  */
 struct se_crypto_request {
 	u8 opcode;
@@ -73,9 +70,8 @@ struct se_crypto_request {
 
 	struct gphdr gph;
 	union se_req_ctrl ctrl;
-
-	u8 iv[MAX_IV_LEN];
-	u16 ivsize;
+	u64 *orh;
+	u64 *comp;
 
 	struct scatterlist *src;
 	struct scatterlist *dst;
@@ -200,6 +196,8 @@ struct nitrox_kcrypt_request {
 	struct se_crypto_request creq;
 	struct nitrox_crypto_ctx *nctx;
 	struct skcipher_request *skreq;
+	u8 *src;
+	u8 *dst;
 };
 
 /**
@@ -376,26 +374,19 @@ struct nitrox_sgcomp {
 
 /*
  * strutct nitrox_sgtable - SG list information
- * @map_cnt: Number of buffers mapped
- * @nr_comp: Number of sglist components
+ * @sgmap_cnt: Number of buffers mapped
  * @total_bytes: Total bytes in sglist.
- * @len: Total sglist components length.
- * @dma: DMA address of sglist component.
- * @dir: DMA direction.
- * @buf: crypto request buffer.
- * @sglist: SG list of input/output buffers.
+ * @sgcomp_len: Total sglist components length.
+ * @sgcomp_dma: DMA address of sglist component.
+ * @sg: crypto request buffer.
  * @sgcomp: sglist component for NITROX.
  */
 struct nitrox_sgtable {
-	u8 map_bufs_cnt;
-	u8 nr_sgcomp;
+	u8 sgmap_cnt;
 	u16 total_bytes;
-	u32 len;
-	dma_addr_t dma;
-	enum dma_data_direction dir;
-
-	struct scatterlist *buf;
-	struct nitrox_sglist *sglist;
+	u32 sgcomp_len;
+	dma_addr_t sgcomp_dma;
+	struct scatterlist *sg;
 	struct nitrox_sgcomp *sgcomp;
 };
 
@@ -405,10 +396,8 @@ struct nitrox_sgtable {
 #define COMP_HLEN	8
 
 struct resp_hdr {
-	u64 orh;
-	dma_addr_t orh_dma;
-	u64 completion;
-	dma_addr_t completion_dma;
+	u64 *orh;
+	u64 *completion;
 };
 
 typedef void (*completion_t)(struct skcipher_request *skreq, int err);
@@ -434,7 +423,6 @@ struct nitrox_softreq {
 	u32 flags;
 	gfp_t gfp;
 	atomic_t status;
-	bool inplace;
 
 	struct nitrox_device *ndev;
 	struct nitrox_cmdq *cmdq;
@@ -450,4 +438,46 @@ struct nitrox_softreq {
 	struct skcipher_request *skreq;
 };
 
+static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp)
+{
+	size_t size;
+
+	size = sizeof(struct scatterlist) * nents;
+	size += extralen;
+
+	return kzalloc(size, gfp);
+}
+
+static inline struct scatterlist *create_single_sg(struct scatterlist *sg,
+						   void *buf, int buflen)
+{
+	sg_set_buf(sg, buf, buflen);
+	sg++;
+	return sg;
+}
+
+static inline struct scatterlist *create_multi_sg(struct scatterlist *to_sg,
+						  struct scatterlist *from_sg)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(from_sg, sg, sg_nents(from_sg), i) {
+		sg_set_buf(to_sg, sg_virt(sg), sg->length);
+		to_sg++;
+	}
+
+	return to_sg;
+}
+
+static inline void set_orh_value(u64 *orh)
+{
+	WRITE_ONCE(*orh, PENDING_SIG);
+}
+
+static inline void set_comp_value(u64 *comp)
+{
+	WRITE_ONCE(*comp, PENDING_SIG);
+}
+
 #endif /* __NITROX_REQ_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
index 3987cd84c033..d566bb904ec2 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
@@ -13,7 +13,6 @@
 #define FDATA_SIZE 32
 /* Base destination port for the solicited requests */
 #define SOLICIT_BASE_DPORT 256
-#define PENDING_SIG	0xFFFFFFFFFFFFFFFFUL
 
 #define REQ_NOT_POSTED 1
 #define REQ_BACKLOG    2
@@ -52,58 +51,26 @@ static inline int incr_index(int index, int count, int max)
 	return index;
 }
 
-/**
- * dma_free_sglist - unmap and free the sg lists.
- * @ndev: N5 device
- * @sgtbl: SG table
- */
 static void softreq_unmap_sgbufs(struct nitrox_softreq *sr)
 {
 	struct nitrox_device *ndev = sr->ndev;
 	struct device *dev = DEV(ndev);
-	struct nitrox_sglist *sglist;
-
-	/* unmap in sgbuf */
-	sglist = sr->in.sglist;
-	if (!sglist)
-		goto out_unmap;
-
-	/* unmap iv */
-	dma_unmap_single(dev, sglist->dma, sglist->len, DMA_BIDIRECTIONAL);
-	/* unmpa src sglist */
-	dma_unmap_sg(dev, sr->in.buf, (sr->in.map_bufs_cnt - 1), sr->in.dir);
-	/* unamp gather component */
-	dma_unmap_single(dev, sr->in.dma, sr->in.len, DMA_TO_DEVICE);
-	kfree(sr->in.sglist);
+
+
+	dma_unmap_sg(dev, sr->in.sg, sr->in.sgmap_cnt, DMA_BIDIRECTIONAL);
+	dma_unmap_single(dev, sr->in.sgcomp_dma, sr->in.sgcomp_len,
+			 DMA_TO_DEVICE);
 	kfree(sr->in.sgcomp);
-	sr->in.sglist = NULL;
-	sr->in.buf = NULL;
-	sr->in.map_bufs_cnt = 0;
-
-out_unmap:
-	/* unmap out sgbuf */
-	sglist = sr->out.sglist;
-	if (!sglist)
-		return;
-
-	/* unmap orh */
-	dma_unmap_single(dev, sr->resp.orh_dma, ORH_HLEN, sr->out.dir);
-
-	/* unmap dst sglist */
-	if (!sr->inplace) {
-		dma_unmap_sg(dev, sr->out.buf, (sr->out.map_bufs_cnt - 3),
-			     sr->out.dir);
-	}
-	/* unmap completion */
-	dma_unmap_single(dev, sr->resp.completion_dma, COMP_HLEN, sr->out.dir);
+	sr->in.sg = NULL;
+	sr->in.sgmap_cnt = 0;
 
-	/* unmap scatter component */
-	dma_unmap_single(dev, sr->out.dma, sr->out.len, DMA_TO_DEVICE);
-	kfree(sr->out.sglist);
+	dma_unmap_sg(dev, sr->out.sg, sr->out.sgmap_cnt,
+		     DMA_BIDIRECTIONAL);
+	dma_unmap_single(dev, sr->out.sgcomp_dma, sr->out.sgcomp_len,
+			 DMA_TO_DEVICE);
 	kfree(sr->out.sgcomp);
-	sr->out.sglist = NULL;
-	sr->out.buf = NULL;
-	sr->out.map_bufs_cnt = 0;
+	sr->out.sg = NULL;
+	sr->out.sgmap_cnt = 0;
 }
 
 static void softreq_destroy(struct nitrox_softreq *sr)
@@ -116,7 +83,7 @@ static void softreq_destroy(struct nitrox_softreq *sr)
  * create_sg_component - create SG componets for N5 device.
  * @sr: Request structure
  * @sgtbl: SG table
- * @nr_comp: total number of components required
+ * @map_nents: number of dma mapped entries
  *
  * Component structure
  *
@@ -140,7 +107,7 @@ static int create_sg_component(struct nitrox_softreq *sr,
 {
 	struct nitrox_device *ndev = sr->ndev;
 	struct nitrox_sgcomp *sgcomp;
-	struct nitrox_sglist *sglist;
+	struct scatterlist *sg;
 	dma_addr_t dma;
 	size_t sz_comp;
 	int i, j, nr_sgcomp;
@@ -154,17 +121,15 @@ static int create_sg_component(struct nitrox_softreq *sr,
 		return -ENOMEM;
 
 	sgtbl->sgcomp = sgcomp;
-	sgtbl->nr_sgcomp = nr_sgcomp;
 
-	sglist = sgtbl->sglist;
+	sg = sgtbl->sg;
 	/* populate device sg component */
 	for (i = 0; i < nr_sgcomp; i++) {
-		for (j = 0; j < 4; j++) {
-			sgcomp->len[j] = cpu_to_be16(sglist->len);
-			sgcomp->dma[j] = cpu_to_be64(sglist->dma);
-			sglist++;
+		for (j = 0; j < 4 && sg; j++) {
+			sgcomp[i].len[j] = cpu_to_be16(sg_dma_len(sg));
+			sgcomp[i].dma[j] = cpu_to_be64(sg_dma_address(sg));
+			sg = sg_next(sg);
 		}
-		sgcomp++;
 	}
 	/* map the device sg component */
 	dma = dma_map_single(DEV(ndev), sgtbl->sgcomp, sz_comp, DMA_TO_DEVICE);
@@ -174,8 +139,8 @@ static int create_sg_component(struct nitrox_softreq *sr,
 		return -ENOMEM;
 	}
 
-	sgtbl->dma = dma;
-	sgtbl->len = sz_comp;
+	sgtbl->sgcomp_dma = dma;
+	sgtbl->sgcomp_len = sz_comp;
 
 	return 0;
 }
@@ -193,66 +158,27 @@ static int dma_map_inbufs(struct nitrox_softreq *sr,
 {
 	struct device *dev = DEV(sr->ndev);
 	struct scatterlist *sg = req->src;
-	struct nitrox_sglist *glist;
 	int i, nents, ret = 0;
-	dma_addr_t dma;
-	size_t sz;
 
-	nents = sg_nents(req->src);
+	nents = dma_map_sg(dev, req->src, sg_nents(req->src),
+			   DMA_BIDIRECTIONAL);
+	if (!nents)
+		return -EINVAL;
 
-	/* creater gather list IV and src entries */
-	sz = roundup((1 + nents), 4) * sizeof(*glist);
-	glist = kzalloc(sz, sr->gfp);
-	if (!glist)
-		return -ENOMEM;
-
-	sr->in.sglist = glist;
-	/* map IV */
-	dma = dma_map_single(dev, &req->iv, req->ivsize, DMA_BIDIRECTIONAL);
-	if (dma_mapping_error(dev, dma)) {
-		ret = -EINVAL;
-		goto iv_map_err;
-	}
-
-	sr->in.dir = (req->src == req->dst) ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
-	/* map src entries */
-	nents = dma_map_sg(dev, req->src, nents, sr->in.dir);
-	if (!nents) {
-		ret = -EINVAL;
-		goto src_map_err;
-	}
-	sr->in.buf = req->src;
-
-	/* store the mappings */
-	glist->len = req->ivsize;
-	glist->dma = dma;
-	glist++;
-	sr->in.total_bytes += req->ivsize;
-
-	for_each_sg(req->src, sg, nents, i) {
-		glist->len = sg_dma_len(sg);
-		glist->dma = sg_dma_address(sg);
-		sr->in.total_bytes += glist->len;
-		glist++;
-	}
-	/* roundup map count to align with entires in sg component */
-	sr->in.map_bufs_cnt = (1 + nents);
+	for_each_sg(req->src, sg, nents, i)
+		sr->in.total_bytes += sg_dma_len(sg);
 
-	/* create NITROX gather component */
-	ret = create_sg_component(sr, &sr->in, sr->in.map_bufs_cnt);
+	sr->in.sg = req->src;
+	sr->in.sgmap_cnt = nents;
+	ret = create_sg_component(sr, &sr->in, sr->in.sgmap_cnt);
 	if (ret)
 		goto incomp_err;
 
 	return 0;
 
 incomp_err:
-	dma_unmap_sg(dev, req->src, nents, sr->in.dir);
-	sr->in.map_bufs_cnt = 0;
-src_map_err:
-	dma_unmap_single(dev, dma, req->ivsize, DMA_BIDIRECTIONAL);
-iv_map_err:
-	kfree(sr->in.sglist);
-	sr->in.sglist = NULL;
+	dma_unmap_sg(dev, req->src, nents, DMA_BIDIRECTIONAL);
+	sr->in.sgmap_cnt = 0;
 	return ret;
 }
 
@@ -260,104 +186,25 @@ static int dma_map_outbufs(struct nitrox_softreq *sr,
 			   struct se_crypto_request *req)
 {
 	struct device *dev = DEV(sr->ndev);
-	struct nitrox_sglist *glist = sr->in.sglist;
-	struct nitrox_sglist *slist;
-	struct scatterlist *sg;
-	int i, nents, map_bufs_cnt, ret = 0;
-	size_t sz;
+	int nents, ret = 0;
 
-	nents = sg_nents(req->dst);
+	nents = dma_map_sg(dev, req->dst, sg_nents(req->dst),
+			   DMA_BIDIRECTIONAL);
+	if (!nents)
+		return -EINVAL;
 
-	/* create scatter list ORH, IV, dst entries and Completion header */
-	sz = roundup((3 + nents), 4) * sizeof(*slist);
-	slist = kzalloc(sz, sr->gfp);
-	if (!slist)
-		return -ENOMEM;
-
-	sr->out.sglist = slist;
-	sr->out.dir = DMA_BIDIRECTIONAL;
-	/* map ORH */
-	sr->resp.orh_dma = dma_map_single(dev, &sr->resp.orh, ORH_HLEN,
-					  sr->out.dir);
-	if (dma_mapping_error(dev, sr->resp.orh_dma)) {
-		ret = -EINVAL;
-		goto orh_map_err;
-	}
-
-	/* map completion */
-	sr->resp.completion_dma = dma_map_single(dev, &sr->resp.completion,
-						 COMP_HLEN, sr->out.dir);
-	if (dma_mapping_error(dev, sr->resp.completion_dma)) {
-		ret = -EINVAL;
-		goto compl_map_err;
-	}
-
-	sr->inplace = (req->src == req->dst) ? true : false;
-	/* out place */
-	if (!sr->inplace) {
-		nents = dma_map_sg(dev, req->dst, nents, sr->out.dir);
-		if (!nents) {
-			ret = -EINVAL;
-			goto dst_map_err;
-		}
-	}
-	sr->out.buf = req->dst;
-
-	/* store the mappings */
-	/* orh */
-	slist->len = ORH_HLEN;
-	slist->dma = sr->resp.orh_dma;
-	slist++;
-
-	/* copy the glist mappings */
-	if (sr->inplace) {
-		nents = sr->in.map_bufs_cnt - 1;
-		map_bufs_cnt = sr->in.map_bufs_cnt;
-		while (map_bufs_cnt--) {
-			slist->len = glist->len;
-			slist->dma = glist->dma;
-			slist++;
-			glist++;
-		}
-	} else {
-		/* copy iv mapping */
-		slist->len = glist->len;
-		slist->dma = glist->dma;
-		slist++;
-		/* copy remaining maps */
-		for_each_sg(req->dst, sg, nents, i) {
-			slist->len = sg_dma_len(sg);
-			slist->dma = sg_dma_address(sg);
-			slist++;
-		}
-	}
-
-	/* completion */
-	slist->len = COMP_HLEN;
-	slist->dma = sr->resp.completion_dma;
-
-	sr->out.map_bufs_cnt = (3 + nents);
-
-	ret = create_sg_component(sr, &sr->out, sr->out.map_bufs_cnt);
+	sr->out.sg = req->dst;
+	sr->out.sgmap_cnt = nents;
+	ret = create_sg_component(sr, &sr->out, sr->out.sgmap_cnt);
 	if (ret)
 		goto outcomp_map_err;
 
 	return 0;
 
 outcomp_map_err:
-	if (!sr->inplace)
-		dma_unmap_sg(dev, req->dst, nents, sr->out.dir);
-	sr->out.map_bufs_cnt = 0;
-	sr->out.buf = NULL;
-dst_map_err:
-	dma_unmap_single(dev, sr->resp.completion_dma, COMP_HLEN, sr->out.dir);
-	sr->resp.completion_dma = 0;
-compl_map_err:
-	dma_unmap_single(dev, sr->resp.orh_dma, ORH_HLEN, sr->out.dir);
-	sr->resp.orh_dma = 0;
-orh_map_err:
-	kfree(sr->out.sglist);
-	sr->out.sglist = NULL;
+	dma_unmap_sg(dev, req->dst, nents, DMA_BIDIRECTIONAL);
+	sr->out.sgmap_cnt = 0;
+	sr->out.sg = NULL;
 	return ret;
 }
 
@@ -556,8 +403,8 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 
 	atomic_set(&sr->status, REQ_NOT_POSTED);
 
-	WRITE_ONCE(sr->resp.orh, PENDING_SIG);
-	WRITE_ONCE(sr->resp.completion, PENDING_SIG);
+	sr->resp.orh = req->orh;
+	sr->resp.completion = req->comp;
 
 	ret = softreq_map_iobuf(sr, req);
 	if (ret) {
@@ -598,13 +445,13 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 
 	/* fill the packet instruction */
 	/* word 0 */
-	sr->instr.dptr0 = cpu_to_be64(sr->in.dma);
+	sr->instr.dptr0 = cpu_to_be64(sr->in.sgcomp_dma);
 
 	/* word 1 */
 	sr->instr.ih.value = 0;
 	sr->instr.ih.s.g = 1;
-	sr->instr.ih.s.gsz = sr->in.map_bufs_cnt;
-	sr->instr.ih.s.ssz = sr->out.map_bufs_cnt;
+	sr->instr.ih.s.gsz = sr->in.sgmap_cnt;
+	sr->instr.ih.s.ssz = sr->out.sgmap_cnt;
 	sr->instr.ih.s.fsz = FDATA_SIZE + sizeof(struct gphdr);
 	sr->instr.ih.s.tlen = sr->instr.ih.s.fsz + sr->in.total_bytes;
 	sr->instr.ih.value = cpu_to_be64(sr->instr.ih.value);
@@ -626,11 +473,11 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 
 	/* word 4 */
 	sr->instr.slc.value[0] = 0;
-	sr->instr.slc.s.ssz = sr->out.map_bufs_cnt;
+	sr->instr.slc.s.ssz = sr->out.sgmap_cnt;
 	sr->instr.slc.value[0] = cpu_to_be64(sr->instr.slc.value[0]);
 
 	/* word 5 */
-	sr->instr.slc.s.rptr = cpu_to_be64(sr->out.dma);
+	sr->instr.slc.s.rptr = cpu_to_be64(sr->out.sgcomp_dma);
 
 	/*
 	 * No conversion for front data,
@@ -664,6 +511,11 @@ void backlog_qflush_work(struct work_struct *work)
 	post_backlog_cmds(cmdq);
 }
 
+static bool sr_completed(struct nitrox_softreq *sr)
+{
+	return (READ_ONCE(*sr->resp.orh) != READ_ONCE(*sr->resp.completion));
+}
+
 /**
  * process_request_list - process completed requests
  * @ndev: N5 device
@@ -691,13 +543,13 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 			break;
 
 		/* check orh and completion bytes updates */
-		if (READ_ONCE(sr->resp.orh) == READ_ONCE(sr->resp.completion)) {
+		if (!sr_completed(sr)) {
 			/* request not completed, check for timeout */
 			if (!cmd_timeout(sr->tstamp, ndev->timeout))
 				break;
 			dev_err_ratelimited(DEV(ndev),
 					    "Request timeout, orh 0x%016llx\n",
-					    READ_ONCE(sr->resp.orh));
+					    READ_ONCE(*sr->resp.orh));
 		}
 		atomic_dec(&cmdq->pending_count);
 		atomic64_inc(&ndev->stats.completed);
@@ -710,7 +562,7 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 		skreq = sr->skreq;
 
 		/* ORH error code */
-		err = READ_ONCE(sr->resp.orh) & 0xff;
+		err = READ_ONCE(*sr->resp.orh) & 0xff;
 		softreq_destroy(sr);
 
 		if (callback)
-- 
cgit v1.2.3-59-g8ed1b


From 7a027b57f959660547779b6290a3b50ff1601980 Mon Sep 17 00:00:00 2001
From: "Srikanth, Jampala" <Jampala.Srikanth@cavium.com>
Date: Wed, 21 Nov 2018 09:52:24 +0000
Subject: crypto: cavium/nitrox - Enable interrups for PF in SR-IOV mode.

Enable the available interrupt vectors for PF in SR-IOV Mode.
Only single vector entry 192 is valid of PF. This is used to
notify any hardware errors and mailbox messages from VF(s).

Signed-off-by: Srikanth Jampala <Jampala.Srikanth@cavium.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/nitrox_dev.h   | 15 +++++-
 drivers/crypto/cavium/nitrox/nitrox_isr.c   | 84 ++++++++++++++++++++++++++++-
 drivers/crypto/cavium/nitrox/nitrox_isr.h   |  2 +
 drivers/crypto/cavium/nitrox/nitrox_sriov.c | 51 +++++++++++++++---
 4 files changed, 142 insertions(+), 10 deletions(-)

diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h b/drivers/crypto/cavium/nitrox/nitrox_dev.h
index 283e252385fb..247df32f687c 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_dev.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h
@@ -103,6 +103,16 @@ struct nitrox_q_vector {
 	};
 };
 
+/**
+ * struct nitrox_iov - SR-IOV information
+ * @num_vfs: number of VF(s) enabled
+ * @msix: MSI-X for PF in SR-IOV case
+ */
+struct nitrox_iov {
+	int num_vfs;
+	struct msix_entry msix;
+};
+
 /*
  * NITROX Device states
  */
@@ -150,6 +160,9 @@ enum vf_mode {
  * @ctx_pool: DMA pool for crypto context
  * @pkt_inq: Packet input rings
  * @qvec: MSI-X queue vectors information
+ * @iov: SR-IOV informatin
+ * @num_vecs: number of MSI-X vectors
+ * @stats: request statistics
  * @hw: hardware information
  * @debugfs_dir: debugfs directory
  */
@@ -168,13 +181,13 @@ struct nitrox_device {
 	int node;
 	u16 qlen;
 	u16 nr_queues;
-	int num_vfs;
 	enum vf_mode mode;
 
 	struct dma_pool *ctx_pool;
 	struct nitrox_cmdq *pkt_inq;
 
 	struct nitrox_q_vector *qvec;
+	struct nitrox_iov iov;
 	int num_vecs;
 
 	struct nitrox_stats stats;
diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.c b/drivers/crypto/cavium/nitrox/nitrox_isr.c
index 88a77b8fb3fb..c5b797808680 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_isr.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_isr.c
@@ -13,6 +13,7 @@
  *  - NPS packet ring, AQMQ ring and ZQMQ ring
  */
 #define NR_RING_VECTORS 3
+#define NR_NON_RING_VECTORS 1
 /* base entry for packet ring/port */
 #define PKT_RING_MSIX_BASE 0
 #define NON_RING_MSIX_BASE 192
@@ -275,6 +276,7 @@ void nitrox_unregister_interrupts(struct nitrox_device *ndev)
 		qvec->valid = false;
 	}
 	kfree(ndev->qvec);
+	ndev->qvec = NULL;
 	pci_free_irq_vectors(pdev);
 }
 
@@ -321,6 +323,7 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 		if (qvec->ring >= ndev->nr_queues)
 			break;
 
+		qvec->cmdq = &ndev->pkt_inq[qvec->ring];
 		snprintf(qvec->name, IRQ_NAMESZ, "nitrox-pkt%d", qvec->ring);
 		/* get the vector number */
 		vec = pci_irq_vector(pdev, i);
@@ -335,13 +338,13 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 
 		tasklet_init(&qvec->resp_tasklet, pkt_slc_resp_tasklet,
 			     (unsigned long)qvec);
-		qvec->cmdq = &ndev->pkt_inq[qvec->ring];
 		qvec->valid = true;
 	}
 
 	/* request irqs for non ring vectors */
 	i = NON_RING_MSIX_BASE;
 	qvec = &ndev->qvec[i];
+	qvec->ndev = ndev;
 
 	snprintf(qvec->name, IRQ_NAMESZ, "nitrox-core-int%d", i);
 	/* get the vector number */
@@ -356,7 +359,6 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 
 	tasklet_init(&qvec->resp_tasklet, nps_core_int_tasklet,
 		     (unsigned long)qvec);
-	qvec->ndev = ndev;
 	qvec->valid = true;
 
 	return 0;
@@ -365,3 +367,81 @@ irq_fail:
 	nitrox_unregister_interrupts(ndev);
 	return ret;
 }
+
+void nitrox_sriov_unregister_interrupts(struct nitrox_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int i;
+
+	for (i = 0; i < ndev->num_vecs; i++) {
+		struct nitrox_q_vector *qvec;
+		int vec;
+
+		qvec = ndev->qvec + i;
+		if (!qvec->valid)
+			continue;
+
+		vec = ndev->iov.msix.vector;
+		irq_set_affinity_hint(vec, NULL);
+		free_irq(vec, qvec);
+
+		tasklet_disable(&qvec->resp_tasklet);
+		tasklet_kill(&qvec->resp_tasklet);
+		qvec->valid = false;
+	}
+	kfree(ndev->qvec);
+	ndev->qvec = NULL;
+	pci_disable_msix(pdev);
+}
+
+int nitrox_sriov_register_interupts(struct nitrox_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct nitrox_q_vector *qvec;
+	int vec, cpu;
+	int ret;
+
+	/**
+	 * only non ring vectors i.e Entry 192 is available
+	 * for PF in SR-IOV mode.
+	 */
+	ndev->iov.msix.entry = NON_RING_MSIX_BASE;
+	ret = pci_enable_msix_exact(pdev, &ndev->iov.msix, NR_NON_RING_VECTORS);
+	if (ret) {
+		dev_err(DEV(ndev), "failed to allocate nps-core-int%d\n",
+			NON_RING_MSIX_BASE);
+		return ret;
+	}
+
+	qvec = kcalloc(NR_NON_RING_VECTORS, sizeof(*qvec), GFP_KERNEL);
+	if (!qvec) {
+		pci_disable_msix(pdev);
+		return -ENOMEM;
+	}
+	qvec->ndev = ndev;
+
+	ndev->qvec = qvec;
+	ndev->num_vecs = NR_NON_RING_VECTORS;
+	snprintf(qvec->name, IRQ_NAMESZ, "nitrox-core-int%d",
+		 NON_RING_MSIX_BASE);
+
+	vec = ndev->iov.msix.vector;
+	ret = request_irq(vec, nps_core_int_isr, 0, qvec->name, qvec);
+	if (ret) {
+		dev_err(DEV(ndev), "irq failed for nitrox-core-int%d\n",
+			NON_RING_MSIX_BASE);
+		goto iov_irq_fail;
+	}
+	cpu = num_online_cpus();
+	irq_set_affinity_hint(vec, get_cpu_mask(cpu));
+
+	tasklet_init(&qvec->resp_tasklet, nps_core_int_tasklet,
+		     (unsigned long)qvec);
+	qvec->valid = true;
+
+	return 0;
+
+iov_irq_fail:
+	nitrox_sriov_unregister_interrupts(ndev);
+	return ret;
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.h b/drivers/crypto/cavium/nitrox/nitrox_isr.h
index 63418a6cc52c..1062c9336c1f 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_isr.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_isr.h
@@ -6,5 +6,7 @@
 
 int nitrox_register_interrupts(struct nitrox_device *ndev);
 void nitrox_unregister_interrupts(struct nitrox_device *ndev);
+int nitrox_sriov_register_interupts(struct nitrox_device *ndev);
+void nitrox_sriov_unregister_interrupts(struct nitrox_device *ndev);
 
 #endif /* __NITROX_ISR_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_sriov.c b/drivers/crypto/cavium/nitrox/nitrox_sriov.c
index 30c0aa874583..7ba0cc5d6d02 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_sriov.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_sriov.c
@@ -7,6 +7,10 @@
 #include "nitrox_common.h"
 #include "nitrox_isr.h"
 
+/**
+ * num_vfs_valid - validate VF count
+ * @num_vfs: number of VF(s)
+ */
 static inline bool num_vfs_valid(int num_vfs)
 {
 	bool valid = false;
@@ -48,7 +52,7 @@ static inline enum vf_mode num_vfs_to_mode(int num_vfs)
 	return mode;
 }
 
-static void pf_sriov_cleanup(struct nitrox_device *ndev)
+static void nitrox_pf_cleanup(struct nitrox_device *ndev)
 {
 	 /* PF has no queues in SR-IOV mode */
 	atomic_set(&ndev->state, __NDEV_NOT_READY);
@@ -60,7 +64,11 @@ static void pf_sriov_cleanup(struct nitrox_device *ndev)
 	nitrox_common_sw_cleanup(ndev);
 }
 
-static int pf_sriov_init(struct nitrox_device *ndev)
+/**
+ * nitrox_pf_reinit - re-initialize PF resources once SR-IOV is disabled
+ * @ndev: NITROX device
+ */
+static int nitrox_pf_reinit(struct nitrox_device *ndev)
 {
 	int err;
 
@@ -86,6 +94,18 @@ static int pf_sriov_init(struct nitrox_device *ndev)
 	return nitrox_crypto_register();
 }
 
+static int nitrox_sriov_init(struct nitrox_device *ndev)
+{
+	/* register interrupts for PF in SR-IOV */
+	return nitrox_sriov_register_interupts(ndev);
+}
+
+static void nitrox_sriov_cleanup(struct nitrox_device *ndev)
+{
+	/* unregister interrupts for PF in SR-IOV */
+	nitrox_sriov_unregister_interrupts(ndev);
+}
+
 static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs)
 {
 	struct nitrox_device *ndev = pci_get_drvdata(pdev);
@@ -106,17 +126,31 @@ static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs)
 	}
 	dev_info(DEV(ndev), "Enabled VF(s) %d\n", num_vfs);
 
-	ndev->num_vfs = num_vfs;
+	ndev->iov.num_vfs = num_vfs;
 	ndev->mode = num_vfs_to_mode(num_vfs);
 	/* set bit in flags */
 	set_bit(__NDEV_SRIOV_BIT, &ndev->flags);
 
 	/* cleanup PF resources */
-	pf_sriov_cleanup(ndev);
+	nitrox_pf_cleanup(ndev);
 
-	config_nps_core_vfcfg_mode(ndev, ndev->mode);
+	/* PF SR-IOV mode initialization */
+	err = nitrox_sriov_init(ndev);
+	if (err)
+		goto iov_fail;
 
+	config_nps_core_vfcfg_mode(ndev, ndev->mode);
 	return num_vfs;
+
+iov_fail:
+	pci_disable_sriov(pdev);
+	/* clear bit in flags */
+	clear_bit(__NDEV_SRIOV_BIT, &ndev->flags);
+	ndev->iov.num_vfs = 0;
+	ndev->mode = __NDEV_MODE_PF;
+	/* reset back to working mode in PF */
+	nitrox_pf_reinit(ndev);
+	return err;
 }
 
 static int nitrox_sriov_disable(struct pci_dev *pdev)
@@ -134,12 +168,15 @@ static int nitrox_sriov_disable(struct pci_dev *pdev)
 	/* clear bit in flags */
 	clear_bit(__NDEV_SRIOV_BIT, &ndev->flags);
 
-	ndev->num_vfs = 0;
+	ndev->iov.num_vfs = 0;
 	ndev->mode = __NDEV_MODE_PF;
 
+	/* cleanup PF SR-IOV resources */
+	nitrox_sriov_cleanup(ndev);
+
 	config_nps_core_vfcfg_mode(ndev, ndev->mode);
 
-	return pf_sriov_init(ndev);
+	return nitrox_pf_reinit(ndev);
 }
 
 int nitrox_sriov_configure(struct pci_dev *pdev, int num_vfs)
-- 
cgit v1.2.3-59-g8ed1b


From 1c876a90e25398a7396ff4de9074ab530e7892b4 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Tue, 13 Nov 2018 09:40:35 +0000
Subject: crypto: ccree - add support for CryptoCell 703

Add support for Arm TrustZone CryptoCell 703.
The 703 is a variant of the CryptoCell 713 that supports only
algorithms certified by the Chinesse Office of the State Commercial
Cryptography Administration (OSCCA).

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/Kconfig           |  2 +-
 drivers/crypto/ccree/cc_aead.c   | 16 +++++++++++++++-
 drivers/crypto/ccree/cc_cipher.c | 38 +++++++++++++++++++++++++++++++++++++-
 drivers/crypto/ccree/cc_driver.c | 19 +++++++++++++++----
 drivers/crypto/ccree/cc_driver.h |  8 ++++++++
 drivers/crypto/ccree/cc_hash.c   | 16 ++++++++++++++--
 6 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 9f1baaec385f..d80751d48cf1 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -767,7 +767,7 @@ config CRYPTO_DEV_CCREE
 	help
 	  Say 'Y' to enable a driver for the REE interface of the Arm
 	  TrustZone CryptoCell family of processors. Currently the
-	  CryptoCell 713, 712, 710 and 630 are supported.
+	  CryptoCell 713, 703, 712, 710 and 630 are supported.
 	  Choose this if you wish to use hardware acceleration of
 	  cryptographic operations on the system REE.
 	  If unsure say Y.
diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c
index e0ac376ceb74..f2643cda45db 100644
--- a/drivers/crypto/ccree/cc_aead.c
+++ b/drivers/crypto/ccree/cc_aead.c
@@ -2367,6 +2367,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA1,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha1),cbc(des3_ede))",
@@ -2386,6 +2387,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_DES,
 		.auth_mode = DRV_HASH_SHA1,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha256),cbc(aes))",
@@ -2405,6 +2407,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA256,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha256),cbc(des3_ede))",
@@ -2424,6 +2427,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_DES,
 		.auth_mode = DRV_HASH_SHA256,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(xcbc(aes),cbc(aes))",
@@ -2443,6 +2447,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_XCBC_MAC,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha1),rfc3686(ctr(aes)))",
@@ -2462,6 +2467,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA1,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha256),rfc3686(ctr(aes)))",
@@ -2481,6 +2487,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA256,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(xcbc(aes),rfc3686(ctr(aes)))",
@@ -2500,6 +2507,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_XCBC_MAC,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ccm(aes)",
@@ -2519,6 +2527,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "rfc4309(ccm(aes))",
@@ -2538,6 +2547,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "gcm(aes)",
@@ -2557,6 +2567,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "rfc4106(gcm(aes))",
@@ -2576,6 +2587,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "rfc4543(gcm(aes))",
@@ -2595,6 +2607,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 };
 
@@ -2679,7 +2692,8 @@ int cc_aead_alloc(struct cc_drvdata *drvdata)
 
 	/* Linux crypto */
 	for (alg = 0; alg < ARRAY_SIZE(aead_algs); alg++) {
-		if (aead_algs[alg].min_hw_rev > drvdata->hw_rev)
+		if ((aead_algs[alg].min_hw_rev > drvdata->hw_rev) ||
+		    !(drvdata->std_bodies & aead_algs[alg].std_body))
 			continue;
 
 		t_alg = cc_create_aead_alg(&aead_algs[alg], dev);
diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c
index 989e70f43bd9..cc92b031fad1 100644
--- a/drivers/crypto/ccree/cc_cipher.c
+++ b/drivers/crypto/ccree/cc_cipher.c
@@ -833,6 +833,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_XTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts512(paes)",
@@ -850,6 +851,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts4096(paes)",
@@ -867,6 +869,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv(paes)",
@@ -883,6 +886,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ESSIV,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv512(paes)",
@@ -900,6 +904,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv4096(paes)",
@@ -917,6 +922,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker(paes)",
@@ -933,6 +939,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_BITLOCKER,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker512(paes)",
@@ -950,6 +957,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker4096(paes)",
@@ -967,6 +975,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(paes)",
@@ -983,6 +992,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(paes)",
@@ -999,6 +1009,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ofb(paes)",
@@ -1015,6 +1026,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_OFB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cts(cbc(paes))",
@@ -1031,6 +1043,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC_CTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ctr(paes)",
@@ -1047,6 +1060,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CTR,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts(aes)",
@@ -1063,6 +1077,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_XTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts512(aes)",
@@ -1080,6 +1095,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts4096(aes)",
@@ -1097,6 +1113,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv(aes)",
@@ -1113,6 +1130,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ESSIV,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv512(aes)",
@@ -1130,6 +1148,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv4096(aes)",
@@ -1147,6 +1166,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker(aes)",
@@ -1163,6 +1183,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_BITLOCKER,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker512(aes)",
@@ -1180,6 +1201,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker4096(aes)",
@@ -1197,6 +1219,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(aes)",
@@ -1213,6 +1236,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(aes)",
@@ -1229,6 +1253,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ofb(aes)",
@@ -1245,6 +1270,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_OFB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cts(cbc(aes))",
@@ -1261,6 +1287,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC_CTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ctr(aes)",
@@ -1277,6 +1304,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CTR,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(des3_ede)",
@@ -1293,6 +1321,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(des3_ede)",
@@ -1309,6 +1338,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(des)",
@@ -1325,6 +1355,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(des)",
@@ -1341,6 +1372,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(sm4)",
@@ -1357,6 +1389,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_SM4,
 		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
 	},
 	{
 		.name = "ecb(sm4)",
@@ -1373,6 +1406,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_SM4,
 		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
 	},
 	{
 		.name = "ctr(sm4)",
@@ -1389,6 +1423,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CTR,
 		.flow_mode = S_DIN_to_SM4,
 		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
 	},
 };
 
@@ -1464,7 +1499,8 @@ int cc_cipher_alloc(struct cc_drvdata *drvdata)
 	dev_dbg(dev, "Number of algorithms = %zu\n",
 		ARRAY_SIZE(skcipher_algs));
 	for (alg = 0; alg < ARRAY_SIZE(skcipher_algs); alg++) {
-		if (skcipher_algs[alg].min_hw_rev > drvdata->hw_rev)
+		if ((skcipher_algs[alg].min_hw_rev > drvdata->hw_rev) ||
+		    !(drvdata->std_bodies & skcipher_algs[alg].std_body))
 			continue;
 
 		dev_dbg(dev, "creating %s\n", skcipher_algs[alg].driver_name);
diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c
index 09a6ada0b547..8ada308d72ee 100644
--- a/drivers/crypto/ccree/cc_driver.c
+++ b/drivers/crypto/ccree/cc_driver.c
@@ -39,27 +39,37 @@ struct cc_hw_data {
 	char *name;
 	enum cc_hw_rev rev;
 	u32 sig;
+	int std_bodies;
 };
 
 /* Hardware revisions defs. */
 
+/* The 703 is a OSCCA only variant of the 713 */
+static const struct cc_hw_data cc703_hw = {
+	.name = "703", .rev = CC_HW_REV_713, .std_bodies = CC_STD_OSCCA
+};
+
 static const struct cc_hw_data cc713_hw = {
-	.name = "713", .rev = CC_HW_REV_713
+	.name = "713", .rev = CC_HW_REV_713, .std_bodies = CC_STD_ALL
 };
 
 static const struct cc_hw_data cc712_hw = {
-	.name = "712", .rev = CC_HW_REV_712, .sig =  0xDCC71200U
+	.name = "712", .rev = CC_HW_REV_712, .sig =  0xDCC71200U,
+	.std_bodies = CC_STD_ALL
 };
 
 static const struct cc_hw_data cc710_hw = {
-	.name = "710", .rev = CC_HW_REV_710, .sig =  0xDCC63200U
+	.name = "710", .rev = CC_HW_REV_710, .sig =  0xDCC63200U,
+	.std_bodies = CC_STD_ALL
 };
 
 static const struct cc_hw_data cc630p_hw = {
-	.name = "630P", .rev = CC_HW_REV_630, .sig = 0xDCC63000U
+	.name = "630P", .rev = CC_HW_REV_630, .sig = 0xDCC63000U,
+	.std_bodies = CC_STD_ALL
 };
 
 static const struct of_device_id arm_ccree_dev_of_match[] = {
+	{ .compatible = "arm,cryptocell-703-ree", .data = &cc703_hw },
 	{ .compatible = "arm,cryptocell-713-ree", .data = &cc713_hw },
 	{ .compatible = "arm,cryptocell-712-ree", .data = &cc712_hw },
 	{ .compatible = "arm,cryptocell-710-ree", .data = &cc710_hw },
@@ -209,6 +219,7 @@ static int init_cc_resources(struct platform_device *plat_dev)
 	hw_rev = (struct cc_hw_data *)dev_id->data;
 	new_drvdata->hw_rev_name = hw_rev->name;
 	new_drvdata->hw_rev = hw_rev->rev;
+	new_drvdata->std_bodies = hw_rev->std_bodies;
 
 	if (hw_rev->rev >= CC_HW_REV_712) {
 		new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP);
diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h
index 170790967854..5be7fd431b05 100644
--- a/drivers/crypto/ccree/cc_driver.h
+++ b/drivers/crypto/ccree/cc_driver.h
@@ -45,6 +45,12 @@ enum cc_hw_rev {
 	CC_HW_REV_713 = 713
 };
 
+enum cc_std_body {
+	CC_STD_NIST = 0x1,
+	CC_STD_OSCCA = 0x2,
+	CC_STD_ALL = 0x3
+};
+
 #define CC_COHERENT_CACHE_PARAMS 0xEEE
 
 /* Maximum DMA mask supported by IP */
@@ -131,6 +137,7 @@ struct cc_drvdata {
 	u32 axim_mon_offset;
 	u32 sig_offset;
 	u32 ver_offset;
+	int std_bodies;
 };
 
 struct cc_crypto_alg {
@@ -156,6 +163,7 @@ struct cc_alg_template {
 	int flow_mode; /* Note: currently, refers to the cipher mode only. */
 	int auth_mode;
 	u32 min_hw_rev;
+	enum cc_std_body std_body;
 	unsigned int data_unit;
 	struct cc_drvdata *drvdata;
 };
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index c80c9ae555d0..2c4ddc8fb76b 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -1539,6 +1539,7 @@ struct cc_hash_template {
 	int inter_digestsize;
 	struct cc_drvdata *drvdata;
 	u32 min_hw_rev;
+	enum cc_std_body std_body;
 };
 
 #define CC_STATE_SIZE(_x) \
@@ -1573,6 +1574,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA1,
 		.inter_digestsize = SHA1_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha256",
@@ -1599,6 +1601,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA256,
 		.inter_digestsize = SHA256_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha224",
@@ -1625,6 +1628,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA256,
 		.inter_digestsize = SHA256_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha384",
@@ -1651,6 +1655,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA512,
 		.inter_digestsize = SHA512_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha512",
@@ -1677,6 +1682,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA512,
 		.inter_digestsize = SHA512_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "md5",
@@ -1703,6 +1709,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_MD5,
 		.inter_digestsize = MD5_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sm3",
@@ -1727,6 +1734,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SM3,
 		.inter_digestsize = SM3_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
 	},
 	{
 		.mac_name = "xcbc(aes)",
@@ -1751,6 +1759,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_CIPHER_XCBC_MAC,
 		.inter_digestsize = AES_BLOCK_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.mac_name = "cmac(aes)",
@@ -1775,6 +1784,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_CIPHER_CMAC,
 		.inter_digestsize = AES_BLOCK_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 };
 
@@ -2001,9 +2011,11 @@ int cc_hash_alloc(struct cc_drvdata *drvdata)
 		struct cc_hash_alg *t_alg;
 		int hw_mode = driver_hash[alg].hw_mode;
 
-		/* We either support both HASH and MAC or none */
-		if (driver_hash[alg].min_hw_rev > drvdata->hw_rev)
+		/* Check that the HW revision and variants are suitable */
+		if ((driver_hash[alg].min_hw_rev > drvdata->hw_rev) ||
+		    !(drvdata->std_bodies & driver_hash[alg].std_body))
 			continue;
+
 		if (driver_hash[alg].is_mac) {
 			/* register hmac version */
 			t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true);
-- 
cgit v1.2.3-59-g8ed1b


From fefbc0b4bcb3c0cc45bed906af46e7cd627ac4bc Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Tue, 13 Nov 2018 09:40:36 +0000
Subject: dt-bindings: crypto: ccree: add dt bindings for ccree 703

Add device tree bindings associating Arm TrustZone CryptoCell 703 with the
ccree driver.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/devicetree/bindings/crypto/arm-cryptocell.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
index 0ac06ec05272..6130e6eb4af8 100644
--- a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
+++ b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
@@ -3,6 +3,7 @@ Arm TrustZone CryptoCell cryptographic engine
 Required properties:
 - compatible: Should be one of -
    "arm,cryptocell-713-ree"
+   "arm,cryptocell-703-ree"
    "arm,cryptocell-712-ree"
    "arm,cryptocell-710-ree"
    "arm,cryptocell-630p-ree"
-- 
cgit v1.2.3-59-g8ed1b


From 18596781e0beb6b4633cb5582308ba14c37eaaed Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@benyossef.com>
Date: Tue, 13 Nov 2018 09:40:37 +0000
Subject: MAINTAINERS: ccree: add co-maintainer

Add Yael Chemla as co-maintainer of Arm TrustZone CryptoCell REE
driver.

Signed-off-by: Gilad Ben-Yossef <gilad@benyossef.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f4855974f325..ab42acbe6b1d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3388,6 +3388,7 @@ F:	include/linux/spi/cc2520.h
 F:	Documentation/devicetree/bindings/net/ieee802154/cc2520.txt
 
 CCREE ARM TRUSTZONE CRYPTOCELL REE DRIVER
+M:	Yael Chemla <yael.chemla@foss.arm.com>
 M:	Gilad Ben-Yossef <gilad@benyossef.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
-- 
cgit v1.2.3-59-g8ed1b


From c97e4df573f2434b07f08d3b93673f61158d267f Mon Sep 17 00:00:00 2001
From: Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
Date: Tue, 27 Nov 2018 15:27:21 -0200
Subject: MAINTAINERS: change NX/VMX maintainers

Add Breno and Nayna as NX/VMX crypto driver maintainers. Also change my
email address to my personal account and remove Leonidas since he's not
working with the driver anymore.

Signed-off-by: Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 MAINTAINERS | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ab42acbe6b1d..6d6675bbe729 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7002,8 +7002,9 @@ F:	crypto/842.c
 F:	lib/842/
 
 IBM Power in-Nest Crypto Acceleration
-M:	Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
-M:	Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
+M:	Breno Leitão <leitao@debian.org>
+M:	Nayna Jain <nayna@linux.ibm.com>
+M:	Paulo Flabiano Smorigo <pfsmorigo@gmail.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
 F:	drivers/crypto/nx/Makefile
@@ -7069,8 +7070,9 @@ S:	Supported
 F:	drivers/scsi/ibmvscsi_tgt/
 
 IBM Power VMX Cryptographic instructions
-M:	Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
-M:	Paulo Flabiano Smorigo <pfsmorigo@linux.vnet.ibm.com>
+M:	Breno Leitão <leitao@debian.org>
+M:	Nayna Jain <nayna@linux.ibm.com>
+M:	Paulo Flabiano Smorigo <pfsmorigo@gmail.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
 F:	drivers/crypto/vmx/Makefile
-- 
cgit v1.2.3-59-g8ed1b


From 2ced26078fcff26db532d6300a1b5f8ffd11a5e1 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:16 +0000
Subject: crypto: user - made crypto_user_stat optional

Even if CRYPTO_STATS is set to n, some part of CRYPTO_STATS are
compiled.
This patch made all part of crypto_user_stat uncompiled in that case.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Makefile                      |  3 ++-
 crypto/algapi.c                      |  2 ++
 include/crypto/internal/cryptouser.h | 17 +++++++++++++++++
 include/linux/crypto.h               |  2 ++
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/crypto/Makefile b/crypto/Makefile
index 5e789dc2d4fd..799ed5e94606 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -54,7 +54,8 @@ cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
-crypto_user-y := crypto_user_base.o crypto_user_stat.o
+crypto_user-y := crypto_user_base.o
+crypto_user-$(CONFIG_CRYPTO_STATS) += crypto_user_stat.o
 obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 2545c5f89c4c..f5396c88e8cd 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -258,6 +258,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	list_add(&alg->cra_list, &crypto_alg_list);
 	list_add(&larval->alg.cra_list, &crypto_alg_list);
 
+#ifdef CONFIG_CRYPTO_STATS
 	atomic_set(&alg->encrypt_cnt, 0);
 	atomic_set(&alg->decrypt_cnt, 0);
 	atomic64_set(&alg->encrypt_tlen, 0);
@@ -265,6 +266,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	atomic_set(&alg->verify_cnt, 0);
 	atomic_set(&alg->cipher_err_cnt, 0);
 	atomic_set(&alg->sign_cnt, 0);
+#endif
 
 out:
 	return larval;
diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h
index 8db299c25566..3492ab42eefb 100644
--- a/include/crypto/internal/cryptouser.h
+++ b/include/crypto/internal/cryptouser.h
@@ -3,6 +3,23 @@
 
 struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact);
 
+#ifdef CONFIG_CRYPTO_STATS
 int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb);
 int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs);
 int crypto_dump_reportstat_done(struct netlink_callback *cb);
+#else
+static int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return -ENOTSUPP;
+}
+
+static int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs)
+{
+	return -ENOTSUPP;
+}
+
+static int crypto_dump_reportstat_done(struct netlink_callback *cb)
+{
+	return -ENOTSUPP;
+}
+#endif
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3634ad6fe202..3e05053b8d57 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -515,6 +515,7 @@ struct crypto_alg {
 	
 	struct module *cra_module;
 
+#ifdef CONFIG_CRYPTO_STATS
 	union {
 		atomic_t encrypt_cnt;
 		atomic_t compress_cnt;
@@ -552,6 +553,7 @@ struct crypto_alg {
 		atomic_t compute_shared_secret_cnt;
 	};
 	atomic_t sign_cnt;
+#endif /* CONFIG_CRYPTO_STATS */
 
 } CRYPTO_MINALIGN_ATTR;
 
-- 
cgit v1.2.3-59-g8ed1b


From a6a31385364ca0f7b98ace0bad93d793f07f97f3 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:17 +0000
Subject: crypto: user - CRYPTO_STATS should depend on CRYPTO_USER

CRYPTO_STATS is using CRYPTO_USER stuff, so it should depends on it.
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index b6376d5d973e..5994d0fa7a78 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1865,6 +1865,7 @@ config CRYPTO_USER_API_AEAD
 
 config CRYPTO_STATS
 	bool "Crypto usage statistics for User-space"
+	depends on CRYPTO_USER
 	help
 	  This option enables the gathering of crypto stats.
 	  This will collect:
-- 
cgit v1.2.3-59-g8ed1b


From 6e8e72cd206e2ba68801e4f2490f639d41808c8d Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:18 +0000
Subject: crypto: user - convert all stats from u32 to u64

All the 32-bit fields need to be 64-bit.  In some cases, UINT32_MAX crypto
operations can be done in seconds.

Reported-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c                 |  10 ++--
 crypto/crypto_user_stat.c       | 114 +++++++++++++++++++---------------------
 include/crypto/acompress.h      |   8 +--
 include/crypto/aead.h           |   8 +--
 include/crypto/akcipher.h       |  16 +++---
 include/crypto/hash.h           |   6 +--
 include/crypto/kpp.h            |  12 ++---
 include/crypto/rng.h            |   8 +--
 include/crypto/skcipher.h       |   8 +--
 include/linux/crypto.h          |  46 ++++++++--------
 include/uapi/linux/cryptouser.h |  38 +++++++-------
 11 files changed, 133 insertions(+), 141 deletions(-)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index f5396c88e8cd..42fe316f80ee 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -259,13 +259,13 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	list_add(&larval->alg.cra_list, &crypto_alg_list);
 
 #ifdef CONFIG_CRYPTO_STATS
-	atomic_set(&alg->encrypt_cnt, 0);
-	atomic_set(&alg->decrypt_cnt, 0);
+	atomic64_set(&alg->encrypt_cnt, 0);
+	atomic64_set(&alg->decrypt_cnt, 0);
 	atomic64_set(&alg->encrypt_tlen, 0);
 	atomic64_set(&alg->decrypt_tlen, 0);
-	atomic_set(&alg->verify_cnt, 0);
-	atomic_set(&alg->cipher_err_cnt, 0);
-	atomic_set(&alg->sign_cnt, 0);
+	atomic64_set(&alg->verify_cnt, 0);
+	atomic64_set(&alg->cipher_err_cnt, 0);
+	atomic64_set(&alg->sign_cnt, 0);
 #endif
 
 out:
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index a6fb2e6f618d..352569f378a0 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -35,22 +35,21 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat raead;
 	u64 v64;
-	u32 v32;
 
 	memset(&raead, 0, sizeof(raead));
 
 	strscpy(raead.type, "aead", sizeof(raead.type));
 
-	v32 = atomic_read(&alg->encrypt_cnt);
-	raead.stat_encrypt_cnt = v32;
+	v64 = atomic64_read(&alg->encrypt_cnt);
+	raead.stat_encrypt_cnt = v64;
 	v64 = atomic64_read(&alg->encrypt_tlen);
 	raead.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	raead.stat_decrypt_cnt = v32;
+	v64 = atomic64_read(&alg->decrypt_cnt);
+	raead.stat_decrypt_cnt = v64;
 	v64 = atomic64_read(&alg->decrypt_tlen);
 	raead.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->aead_err_cnt);
-	raead.stat_aead_err_cnt = v32;
+	v64 = atomic64_read(&alg->aead_err_cnt);
+	raead.stat_aead_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
@@ -59,22 +58,21 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rcipher;
 	u64 v64;
-	u32 v32;
 
 	memset(&rcipher, 0, sizeof(rcipher));
 
 	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
-	v32 = atomic_read(&alg->encrypt_cnt);
-	rcipher.stat_encrypt_cnt = v32;
+	v64 = atomic64_read(&alg->encrypt_cnt);
+	rcipher.stat_encrypt_cnt = v64;
 	v64 = atomic64_read(&alg->encrypt_tlen);
 	rcipher.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	rcipher.stat_decrypt_cnt = v32;
+	v64 = atomic64_read(&alg->decrypt_cnt);
+	rcipher.stat_decrypt_cnt = v64;
 	v64 = atomic64_read(&alg->decrypt_tlen);
 	rcipher.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	rcipher.stat_cipher_err_cnt = v32;
+	v64 = atomic64_read(&alg->cipher_err_cnt);
+	rcipher.stat_cipher_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
@@ -83,21 +81,20 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rcomp;
 	u64 v64;
-	u32 v32;
 
 	memset(&rcomp, 0, sizeof(rcomp));
 
 	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-	v32 = atomic_read(&alg->compress_cnt);
-	rcomp.stat_compress_cnt = v32;
+	v64 = atomic64_read(&alg->compress_cnt);
+	rcomp.stat_compress_cnt = v64;
 	v64 = atomic64_read(&alg->compress_tlen);
 	rcomp.stat_compress_tlen = v64;
-	v32 = atomic_read(&alg->decompress_cnt);
-	rcomp.stat_decompress_cnt = v32;
+	v64 = atomic64_read(&alg->decompress_cnt);
+	rcomp.stat_decompress_cnt = v64;
 	v64 = atomic64_read(&alg->decompress_tlen);
 	rcomp.stat_decompress_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	rcomp.stat_compress_err_cnt = v32;
+	v64 = atomic64_read(&alg->cipher_err_cnt);
+	rcomp.stat_compress_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
@@ -106,21 +103,20 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat racomp;
 	u64 v64;
-	u32 v32;
 
 	memset(&racomp, 0, sizeof(racomp));
 
 	strscpy(racomp.type, "acomp", sizeof(racomp.type));
-	v32 = atomic_read(&alg->compress_cnt);
-	racomp.stat_compress_cnt = v32;
+	v64 = atomic64_read(&alg->compress_cnt);
+	racomp.stat_compress_cnt = v64;
 	v64 = atomic64_read(&alg->compress_tlen);
 	racomp.stat_compress_tlen = v64;
-	v32 = atomic_read(&alg->decompress_cnt);
-	racomp.stat_decompress_cnt = v32;
+	v64 = atomic64_read(&alg->decompress_cnt);
+	racomp.stat_decompress_cnt = v64;
 	v64 = atomic64_read(&alg->decompress_tlen);
 	racomp.stat_decompress_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	racomp.stat_compress_err_cnt = v32;
+	v64 = atomic64_read(&alg->cipher_err_cnt);
+	racomp.stat_compress_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
@@ -129,25 +125,24 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rakcipher;
 	u64 v64;
-	u32 v32;
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
 
 	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	v32 = atomic_read(&alg->encrypt_cnt);
-	rakcipher.stat_encrypt_cnt = v32;
+	v64 = atomic64_read(&alg->encrypt_cnt);
+	rakcipher.stat_encrypt_cnt = v64;
 	v64 = atomic64_read(&alg->encrypt_tlen);
 	rakcipher.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	rakcipher.stat_decrypt_cnt = v32;
+	v64 = atomic64_read(&alg->decrypt_cnt);
+	rakcipher.stat_decrypt_cnt = v64;
 	v64 = atomic64_read(&alg->decrypt_tlen);
 	rakcipher.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->sign_cnt);
-	rakcipher.stat_sign_cnt = v32;
-	v32 = atomic_read(&alg->verify_cnt);
-	rakcipher.stat_verify_cnt = v32;
-	v32 = atomic_read(&alg->akcipher_err_cnt);
-	rakcipher.stat_akcipher_err_cnt = v32;
+	v64 = atomic64_read(&alg->sign_cnt);
+	rakcipher.stat_sign_cnt = v64;
+	v64 = atomic64_read(&alg->verify_cnt);
+	rakcipher.stat_verify_cnt = v64;
+	v64 = atomic64_read(&alg->akcipher_err_cnt);
+	rakcipher.stat_akcipher_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
 		       sizeof(rakcipher), &rakcipher);
@@ -156,19 +151,19 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rkpp;
-	u32 v;
+	u64 v;
 
 	memset(&rkpp, 0, sizeof(rkpp));
 
 	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-	v = atomic_read(&alg->setsecret_cnt);
+	v = atomic64_read(&alg->setsecret_cnt);
 	rkpp.stat_setsecret_cnt = v;
-	v = atomic_read(&alg->generate_public_key_cnt);
+	v = atomic64_read(&alg->generate_public_key_cnt);
 	rkpp.stat_generate_public_key_cnt = v;
-	v = atomic_read(&alg->compute_shared_secret_cnt);
+	v = atomic64_read(&alg->compute_shared_secret_cnt);
 	rkpp.stat_compute_shared_secret_cnt = v;
-	v = atomic_read(&alg->kpp_err_cnt);
+	v = atomic64_read(&alg->kpp_err_cnt);
 	rkpp.stat_kpp_err_cnt = v;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
@@ -178,18 +173,17 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rhash;
 	u64 v64;
-	u32 v32;
 
 	memset(&rhash, 0, sizeof(rhash));
 
 	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
-	v32 = atomic_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v32;
+	v64 = atomic64_read(&alg->hash_cnt);
+	rhash.stat_hash_cnt = v64;
 	v64 = atomic64_read(&alg->hash_tlen);
 	rhash.stat_hash_tlen = v64;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v32;
+	v64 = atomic64_read(&alg->hash_err_cnt);
+	rhash.stat_hash_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -198,18 +192,17 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rhash;
 	u64 v64;
-	u32 v32;
 
 	memset(&rhash, 0, sizeof(rhash));
 
 	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
-	v32 = atomic_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v32;
+	v64 = atomic64_read(&alg->hash_cnt);
+	rhash.stat_hash_cnt = v64;
 	v64 = atomic64_read(&alg->hash_tlen);
 	rhash.stat_hash_tlen = v64;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v32;
+	v64 = atomic64_read(&alg->hash_err_cnt);
+	rhash.stat_hash_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -218,20 +211,19 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat rrng;
 	u64 v64;
-	u32 v32;
 
 	memset(&rrng, 0, sizeof(rrng));
 
 	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	v32 = atomic_read(&alg->generate_cnt);
-	rrng.stat_generate_cnt = v32;
+	v64 = atomic64_read(&alg->generate_cnt);
+	rrng.stat_generate_cnt = v64;
 	v64 = atomic64_read(&alg->generate_tlen);
 	rrng.stat_generate_tlen = v64;
-	v32 = atomic_read(&alg->seed_cnt);
-	rrng.stat_seed_cnt = v32;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rrng.stat_rng_err_cnt = v32;
+	v64 = atomic64_read(&alg->seed_cnt);
+	rrng.stat_seed_cnt = v64;
+	v64 = atomic64_read(&alg->hash_err_cnt);
+	rrng.stat_rng_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 22e6f412c595..f79918196811 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -240,9 +240,9 @@ static inline void crypto_stat_compress(struct acomp_req *req, int ret)
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->compress_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->compress_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compress_cnt);
 		atomic64_add(req->slen, &tfm->base.__crt_alg->compress_tlen);
 	}
 #endif
@@ -254,9 +254,9 @@ static inline void crypto_stat_decompress(struct acomp_req *req, int ret)
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->compress_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->decompress_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->decompress_cnt);
 		atomic64_add(req->slen, &tfm->base.__crt_alg->decompress_tlen);
 	}
 #endif
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 0d765d7bfb82..99afd78c665d 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -312,9 +312,9 @@ static inline void crypto_stat_aead_encrypt(struct aead_request *req, int ret)
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->aead_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->encrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt);
 		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->encrypt_tlen);
 	}
 #endif
@@ -326,9 +326,9 @@ static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret)
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->aead_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->decrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt);
 		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->decrypt_tlen);
 	}
 #endif
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index afac71119396..3dc05cf7e0a9 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -278,9 +278,9 @@ static inline void crypto_stat_akcipher_encrypt(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->encrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt);
 		atomic64_add(req->src_len, &tfm->base.__crt_alg->encrypt_tlen);
 	}
 #endif
@@ -293,9 +293,9 @@ static inline void crypto_stat_akcipher_decrypt(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->decrypt_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt);
 		atomic64_add(req->src_len, &tfm->base.__crt_alg->decrypt_tlen);
 	}
 #endif
@@ -308,9 +308,9 @@ static inline void crypto_stat_akcipher_sign(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->sign_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->sign_cnt);
 #endif
 }
 
@@ -321,9 +321,9 @@ static inline void crypto_stat_akcipher_verify(struct akcipher_request *req,
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->verify_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->verify_cnt);
 #endif
 }
 
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index bc7796600338..52920bed05ba 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -418,7 +418,7 @@ static inline void crypto_stat_ahash_update(struct ahash_request *req, int ret)
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->hash_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt);
 	else
 		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
 #endif
@@ -430,9 +430,9 @@ static inline void crypto_stat_ahash_final(struct ahash_request *req, int ret)
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->hash_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->hash_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->hash_cnt);
 		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
 	}
 #endif
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index f517ba6d3a27..bd5103a80919 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -272,9 +272,9 @@ static inline void crypto_stat_kpp_set_secret(struct crypto_kpp *tfm, int ret)
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->setsecret_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->setsecret_cnt);
 #endif
 }
 
@@ -285,9 +285,9 @@ static inline void crypto_stat_kpp_generate_public_key(struct kpp_request *req,
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 
 	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->generate_public_key_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->generate_public_key_cnt);
 #endif
 }
 
@@ -298,9 +298,9 @@ static inline void crypto_stat_kpp_compute_shared_secret(struct kpp_request *req
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 
 	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt);
 #endif
 }
 
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 6d258f5b68f1..966615bba45e 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -126,9 +126,9 @@ static inline void crypto_stat_rng_seed(struct crypto_rng *tfm, int ret)
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->rng_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt);
 	else
-		atomic_inc(&tfm->base.__crt_alg->seed_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->seed_cnt);
 #endif
 }
 
@@ -137,9 +137,9 @@ static inline void crypto_stat_rng_generate(struct crypto_rng *tfm,
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->rng_err_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt);
 	} else {
-		atomic_inc(&tfm->base.__crt_alg->generate_cnt);
+		atomic64_inc(&tfm->base.__crt_alg->generate_cnt);
 		atomic64_add(dlen, &tfm->base.__crt_alg->generate_tlen);
 	}
 #endif
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 925f547cdcfa..dff54731ddf4 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -491,9 +491,9 @@ static inline void crypto_stat_skcipher_encrypt(struct skcipher_request *req,
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&alg->encrypt_cnt);
+		atomic64_inc(&alg->encrypt_cnt);
 		atomic64_add(req->cryptlen, &alg->encrypt_tlen);
 	}
 #endif
@@ -504,9 +504,9 @@ static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req,
 {
 #ifdef CONFIG_CRYPTO_STATS
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&alg->decrypt_cnt);
+		atomic64_inc(&alg->decrypt_cnt);
 		atomic64_add(req->cryptlen, &alg->decrypt_tlen);
 	}
 #endif
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3e05053b8d57..b109b50906e7 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -517,11 +517,11 @@ struct crypto_alg {
 
 #ifdef CONFIG_CRYPTO_STATS
 	union {
-		atomic_t encrypt_cnt;
-		atomic_t compress_cnt;
-		atomic_t generate_cnt;
-		atomic_t hash_cnt;
-		atomic_t setsecret_cnt;
+		atomic64_t encrypt_cnt;
+		atomic64_t compress_cnt;
+		atomic64_t generate_cnt;
+		atomic64_t hash_cnt;
+		atomic64_t setsecret_cnt;
 	};
 	union {
 		atomic64_t encrypt_tlen;
@@ -530,29 +530,29 @@ struct crypto_alg {
 		atomic64_t hash_tlen;
 	};
 	union {
-		atomic_t akcipher_err_cnt;
-		atomic_t cipher_err_cnt;
-		atomic_t compress_err_cnt;
-		atomic_t aead_err_cnt;
-		atomic_t hash_err_cnt;
-		atomic_t rng_err_cnt;
-		atomic_t kpp_err_cnt;
+		atomic64_t akcipher_err_cnt;
+		atomic64_t cipher_err_cnt;
+		atomic64_t compress_err_cnt;
+		atomic64_t aead_err_cnt;
+		atomic64_t hash_err_cnt;
+		atomic64_t rng_err_cnt;
+		atomic64_t kpp_err_cnt;
 	};
 	union {
-		atomic_t decrypt_cnt;
-		atomic_t decompress_cnt;
-		atomic_t seed_cnt;
-		atomic_t generate_public_key_cnt;
+		atomic64_t decrypt_cnt;
+		atomic64_t decompress_cnt;
+		atomic64_t seed_cnt;
+		atomic64_t generate_public_key_cnt;
 	};
 	union {
 		atomic64_t decrypt_tlen;
 		atomic64_t decompress_tlen;
 	};
 	union {
-		atomic_t verify_cnt;
-		atomic_t compute_shared_secret_cnt;
+		atomic64_t verify_cnt;
+		atomic64_t compute_shared_secret_cnt;
 	};
-	atomic_t sign_cnt;
+	atomic64_t sign_cnt;
 #endif /* CONFIG_CRYPTO_STATS */
 
 } CRYPTO_MINALIGN_ATTR;
@@ -983,9 +983,9 @@ static inline void crypto_stat_ablkcipher_encrypt(struct ablkcipher_request *req
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&crt->base->base.__crt_alg->encrypt_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->encrypt_cnt);
 		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->encrypt_tlen);
 	}
 #endif
@@ -999,9 +999,9 @@ static inline void crypto_stat_ablkcipher_decrypt(struct ablkcipher_request *req
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
 
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
 	} else {
-		atomic_inc(&crt->base->base.__crt_alg->decrypt_cnt);
+		atomic64_inc(&crt->base->base.__crt_alg->decrypt_cnt);
 		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->decrypt_tlen);
 	}
 #endif
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 6dafbc3e4414..9f8187077ce4 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -79,11 +79,11 @@ struct crypto_user_alg {
 struct crypto_stat {
 	char type[CRYPTO_MAX_NAME];
 	union {
-		__u32 stat_encrypt_cnt;
-		__u32 stat_compress_cnt;
-		__u32 stat_generate_cnt;
-		__u32 stat_hash_cnt;
-		__u32 stat_setsecret_cnt;
+		__u64 stat_encrypt_cnt;
+		__u64 stat_compress_cnt;
+		__u64 stat_generate_cnt;
+		__u64 stat_hash_cnt;
+		__u64 stat_setsecret_cnt;
 	};
 	union {
 		__u64 stat_encrypt_tlen;
@@ -92,29 +92,29 @@ struct crypto_stat {
 		__u64 stat_hash_tlen;
 	};
 	union {
-		__u32 stat_akcipher_err_cnt;
-		__u32 stat_cipher_err_cnt;
-		__u32 stat_compress_err_cnt;
-		__u32 stat_aead_err_cnt;
-		__u32 stat_hash_err_cnt;
-		__u32 stat_rng_err_cnt;
-		__u32 stat_kpp_err_cnt;
+		__u64 stat_akcipher_err_cnt;
+		__u64 stat_cipher_err_cnt;
+		__u64 stat_compress_err_cnt;
+		__u64 stat_aead_err_cnt;
+		__u64 stat_hash_err_cnt;
+		__u64 stat_rng_err_cnt;
+		__u64 stat_kpp_err_cnt;
 	};
 	union {
-		__u32 stat_decrypt_cnt;
-		__u32 stat_decompress_cnt;
-		__u32 stat_seed_cnt;
-		__u32 stat_generate_public_key_cnt;
+		__u64 stat_decrypt_cnt;
+		__u64 stat_decompress_cnt;
+		__u64 stat_seed_cnt;
+		__u64 stat_generate_public_key_cnt;
 	};
 	union {
 		__u64 stat_decrypt_tlen;
 		__u64 stat_decompress_tlen;
 	};
 	union {
-		__u32 stat_verify_cnt;
-		__u32 stat_compute_shared_secret_cnt;
+		__u64 stat_verify_cnt;
+		__u64 stat_compute_shared_secret_cnt;
 	};
-	__u32 stat_sign_cnt;
+	__u64 stat_sign_cnt;
 };
 
 struct crypto_report_larval {
-- 
cgit v1.2.3-59-g8ed1b


From 7f0a9d5c9d1ba8ab3e5b144e52553744dc0d7471 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:19 +0000
Subject: crypto: user - split user space crypto stat structures

It is cleaner to have each stat in their own structures.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_stat.c       |  20 ++++----
 include/uapi/linux/cryptouser.h | 100 +++++++++++++++++++++++++---------------
 2 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 352569f378a0..3c14be2f7a1b 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -33,7 +33,7 @@ struct crypto_dump_info {
 
 static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat raead;
+	struct crypto_stat_aead raead;
 	u64 v64;
 
 	memset(&raead, 0, sizeof(raead));
@@ -56,7 +56,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rcipher;
+	struct crypto_stat_cipher rcipher;
 	u64 v64;
 
 	memset(&rcipher, 0, sizeof(rcipher));
@@ -79,7 +79,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rcomp;
+	struct crypto_stat_compress rcomp;
 	u64 v64;
 
 	memset(&rcomp, 0, sizeof(rcomp));
@@ -101,7 +101,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat racomp;
+	struct crypto_stat_compress racomp;
 	u64 v64;
 
 	memset(&racomp, 0, sizeof(racomp));
@@ -123,7 +123,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rakcipher;
+	struct crypto_stat_akcipher rakcipher;
 	u64 v64;
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
@@ -150,7 +150,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rkpp;
+	struct crypto_stat_kpp rkpp;
 	u64 v;
 
 	memset(&rkpp, 0, sizeof(rkpp));
@@ -171,7 +171,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rhash;
+	struct crypto_stat_hash rhash;
 	u64 v64;
 
 	memset(&rhash, 0, sizeof(rhash));
@@ -190,7 +190,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rhash;
+	struct crypto_stat_hash rhash;
 	u64 v64;
 
 	memset(&rhash, 0, sizeof(rhash));
@@ -209,7 +209,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rrng;
+	struct crypto_stat_rng rrng;
 	u64 v64;
 
 	memset(&rrng, 0, sizeof(rrng));
@@ -248,7 +248,7 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 	if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
 		goto nla_put_failure;
 	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
-		struct crypto_stat rl;
+		struct crypto_stat_larval rl;
 
 		memset(&rl, 0, sizeof(rl));
 		strscpy(rl.type, "larval", sizeof(rl.type));
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 9f8187077ce4..3a70f025e27d 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -76,45 +76,69 @@ struct crypto_user_alg {
 	__u32 cru_flags;
 };
 
-struct crypto_stat {
-	char type[CRYPTO_MAX_NAME];
-	union {
-		__u64 stat_encrypt_cnt;
-		__u64 stat_compress_cnt;
-		__u64 stat_generate_cnt;
-		__u64 stat_hash_cnt;
-		__u64 stat_setsecret_cnt;
-	};
-	union {
-		__u64 stat_encrypt_tlen;
-		__u64 stat_compress_tlen;
-		__u64 stat_generate_tlen;
-		__u64 stat_hash_tlen;
-	};
-	union {
-		__u64 stat_akcipher_err_cnt;
-		__u64 stat_cipher_err_cnt;
-		__u64 stat_compress_err_cnt;
-		__u64 stat_aead_err_cnt;
-		__u64 stat_hash_err_cnt;
-		__u64 stat_rng_err_cnt;
-		__u64 stat_kpp_err_cnt;
-	};
-	union {
-		__u64 stat_decrypt_cnt;
-		__u64 stat_decompress_cnt;
-		__u64 stat_seed_cnt;
-		__u64 stat_generate_public_key_cnt;
-	};
-	union {
-		__u64 stat_decrypt_tlen;
-		__u64 stat_decompress_tlen;
-	};
-	union {
-		__u64 stat_verify_cnt;
-		__u64 stat_compute_shared_secret_cnt;
-	};
+struct crypto_stat_aead {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_aead_err_cnt;
+};
+
+struct crypto_stat_akcipher {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_verify_cnt;
 	__u64 stat_sign_cnt;
+	__u64 stat_akcipher_err_cnt;
+};
+
+struct crypto_stat_cipher {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_cipher_err_cnt;
+};
+
+struct crypto_stat_compress {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_compress_cnt;
+	__u64 stat_compress_tlen;
+	__u64 stat_decompress_cnt;
+	__u64 stat_decompress_tlen;
+	__u64 stat_compress_err_cnt;
+};
+
+struct crypto_stat_hash {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_hash_cnt;
+	__u64 stat_hash_tlen;
+	__u64 stat_hash_err_cnt;
+};
+
+struct crypto_stat_kpp {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_setsecret_cnt;
+	__u64 stat_generate_public_key_cnt;
+	__u64 stat_compute_shared_secret_cnt;
+	__u64 stat_kpp_err_cnt;
+};
+
+struct crypto_stat_rng {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_generate_cnt;
+	__u64 stat_generate_tlen;
+	__u64 stat_seed_cnt;
+	__u64 stat_rng_err_cnt;
+};
+
+struct crypto_stat_larval {
+	char type[CRYPTO_MAX_NAME];
 };
 
 struct crypto_report_larval {
-- 
cgit v1.2.3-59-g8ed1b


From 76d09ea7c22f2cabf1f66ffc287c23b19b120be9 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:20 +0000
Subject: crypto: tool: getstat: convert user space example to the new
 crypto_user_stat uapi

This patch converts the getstat example tool to the recent changes done in crypto_user_stat
- changed all stats to u64
- separated struct stats for each crypto alg

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 tools/crypto/getstat.c | 54 +++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c
index 24115173a483..57fbb94608d4 100644
--- a/tools/crypto/getstat.c
+++ b/tools/crypto/getstat.c
@@ -152,53 +152,53 @@ static int get_stat(const char *drivername)
 
 	if (tb[CRYPTOCFGA_STAT_HASH]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_HASH];
-		struct crypto_stat *rhash =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tHash\n\tHash: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_hash *rhash =
+			(struct crypto_stat_hash *)RTA_DATA(rta);
+		printf("%s\tHash\n\tHash: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rhash->stat_hash_cnt, rhash->stat_hash_tlen,
 			rhash->stat_hash_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_COMPRESS]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tCompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_compress *rblk =
+			(struct crypto_stat_compress *)RTA_DATA(rta);
+		printf("%s\tCompress\n\tCompress: %llu bytes: %llu\n\tDecompress: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_compress_cnt, rblk->stat_compress_tlen,
 			rblk->stat_decompress_cnt, rblk->stat_decompress_tlen,
 			rblk->stat_compress_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_ACOMP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP];
-		struct crypto_stat *rcomp =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tACompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_compress *rcomp =
+			(struct crypto_stat_compress *)RTA_DATA(rta);
+		printf("%s\tACompress\n\tCompress: %llu bytes: %llu\n\tDecompress: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rcomp->stat_compress_cnt, rcomp->stat_compress_tlen,
 			rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen,
 			rcomp->stat_compress_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AEAD]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD];
-		struct crypto_stat *raead =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tAEAD\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_aead *raead =
+			(struct crypto_stat_aead *)RTA_DATA(rta);
+		printf("%s\tAEAD\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			raead->stat_encrypt_cnt, raead->stat_encrypt_tlen,
 			raead->stat_decrypt_cnt, raead->stat_decrypt_tlen,
 			raead->stat_aead_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tCipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_cipher *rblk =
+			(struct crypto_stat_cipher *)RTA_DATA(rta);
+		printf("%s\tCipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
 			rblk->stat_cipher_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tAkcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tSign: %u\n\tVerify: %u\n\tErrors: %u\n",
+		struct crypto_stat_akcipher *rblk =
+			(struct crypto_stat_akcipher *)RTA_DATA(rta);
+		printf("%s\tAkcipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tSign: %llu\n\tVerify: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
@@ -206,27 +206,27 @@ static int get_stat(const char *drivername)
 			rblk->stat_akcipher_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_CIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_cipher *rblk =
+			(struct crypto_stat_cipher *)RTA_DATA(rta);
+		printf("%s\tcipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
 			rblk->stat_cipher_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_RNG]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG];
-		struct crypto_stat *rrng =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tRNG\n\tSeed: %u\n\tGenerate: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_rng *rrng =
+			(struct crypto_stat_rng *)RTA_DATA(rta);
+		printf("%s\tRNG\n\tSeed: %llu\n\tGenerate: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rrng->stat_seed_cnt,
 			rrng->stat_generate_cnt, rrng->stat_generate_tlen,
 			rrng->stat_rng_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_KPP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP];
-		struct crypto_stat *rkpp =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tKPP\n\tSetsecret: %u\n\tGenerate public key: %u\n\tCompute_shared_secret: %u\n\tErrors: %u\n",
+		struct crypto_stat_kpp *rkpp =
+			(struct crypto_stat_kpp *)RTA_DATA(rta);
+		printf("%s\tKPP\n\tSetsecret: %llu\n\tGenerate public key: %llu\n\tCompute_shared_secret: %llu\n\tErrors: %llu\n",
 			drivername,
 			rkpp->stat_setsecret_cnt,
 			rkpp->stat_generate_public_key_cnt,
-- 
cgit v1.2.3-59-g8ed1b


From f7d76e05d058b832b373237566cc1af8251371b5 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:21 +0000
Subject: crypto: user - fix use_after_free of struct xxx_request

All crypto_stats functions use the struct xxx_request for feeding stats,
but in some case this structure could already be freed.

For fixing this, the needed parameters (len and alg) will be stored
before the request being executed.
Fixes: cac5818c25d0 ("crypto: user - Implement a generic crypto statistics")
Reported-by: syzbot <syzbot+6939a606a5305e9e9799@syzkaller.appspotmail.com>

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c             |  17 +++-
 crypto/algapi.c            | 233 +++++++++++++++++++++++++++++++++++++++++++++
 crypto/rng.c               |   4 +-
 include/crypto/acompress.h |  38 ++------
 include/crypto/aead.h      |  38 ++------
 include/crypto/akcipher.h  |  74 +++-----------
 include/crypto/hash.h      |  32 +------
 include/crypto/kpp.h       |  48 ++--------
 include/crypto/rng.h       |  27 +-----
 include/crypto/skcipher.h  |  36 ++-----
 include/linux/crypto.h     | 105 +++++++++++++-------
 11 files changed, 376 insertions(+), 276 deletions(-)

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 3a348fbcf8f9..5d320a811f75 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -364,20 +364,28 @@ static int crypto_ahash_op(struct ahash_request *req,
 
 int crypto_ahash_final(struct ahash_request *req)
 {
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final);
-	crypto_stat_ahash_final(req, ret);
+	crypto_stats_ahash_final(nbytes, ret, alg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_final);
 
 int crypto_ahash_finup(struct ahash_request *req)
 {
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup);
-	crypto_stat_ahash_final(req, ret);
+	crypto_stats_ahash_final(nbytes, ret, alg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_finup);
@@ -385,13 +393,16 @@ EXPORT_SYMBOL_GPL(crypto_ahash_finup);
 int crypto_ahash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = crypto_ahash_op(req, tfm->digest);
-	crypto_stat_ahash_final(req, ret);
+	crypto_stats_ahash_final(nbytes, ret, alg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 42fe316f80ee..4c1e6079d271 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1078,6 +1078,239 @@ int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
 }
 EXPORT_SYMBOL_GPL(crypto_type_has_alg);
 
+#ifdef CONFIG_CRYPTO_STATS
+void crypto_stats_get(struct crypto_alg *alg)
+{
+	crypto_alg_get(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_get);
+
+void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret,
+				     struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->cipher_err_cnt);
+	} else {
+		atomic64_inc(&alg->encrypt_cnt);
+		atomic64_add(nbytes, &alg->encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ablkcipher_encrypt);
+
+void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret,
+				     struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->cipher_err_cnt);
+	} else {
+		atomic64_inc(&alg->decrypt_cnt);
+		atomic64_add(nbytes, &alg->decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ablkcipher_decrypt);
+
+void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg,
+			       int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->aead_err_cnt);
+	} else {
+		atomic64_inc(&alg->encrypt_cnt);
+		atomic64_add(cryptlen, &alg->encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_aead_encrypt);
+
+void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg,
+			       int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->aead_err_cnt);
+	} else {
+		atomic64_inc(&alg->decrypt_cnt);
+		atomic64_add(cryptlen, &alg->decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_aead_decrypt);
+
+void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->akcipher_err_cnt);
+	} else {
+		atomic64_inc(&alg->encrypt_cnt);
+		atomic64_add(src_len, &alg->encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_encrypt);
+
+void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->akcipher_err_cnt);
+	} else {
+		atomic64_inc(&alg->decrypt_cnt);
+		atomic64_add(src_len, &alg->decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt);
+
+void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->akcipher_err_cnt);
+	else
+		atomic64_inc(&alg->sign_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
+
+void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->akcipher_err_cnt);
+	else
+		atomic64_inc(&alg->verify_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
+
+void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->compress_err_cnt);
+	} else {
+		atomic64_inc(&alg->compress_cnt);
+		atomic64_add(slen, &alg->compress_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_compress);
+
+void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->compress_err_cnt);
+	} else {
+		atomic64_inc(&alg->decompress_cnt);
+		atomic64_add(slen, &alg->decompress_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_decompress);
+
+void crypto_stats_ahash_update(unsigned int nbytes, int ret,
+			       struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->hash_err_cnt);
+	else
+		atomic64_add(nbytes, &alg->hash_tlen);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ahash_update);
+
+void crypto_stats_ahash_final(unsigned int nbytes, int ret,
+			      struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->hash_err_cnt);
+	} else {
+		atomic64_inc(&alg->hash_cnt);
+		atomic64_add(nbytes, &alg->hash_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ahash_final);
+
+void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
+{
+	if (ret)
+		atomic64_inc(&alg->kpp_err_cnt);
+	else
+		atomic64_inc(&alg->setsecret_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
+
+void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
+{
+	if (ret)
+		atomic64_inc(&alg->kpp_err_cnt);
+	else
+		atomic64_inc(&alg->generate_public_key_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
+
+void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
+{
+	if (ret)
+		atomic64_inc(&alg->kpp_err_cnt);
+	else
+		atomic64_inc(&alg->compute_shared_secret_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
+
+void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->rng_err_cnt);
+	else
+		atomic64_inc(&alg->seed_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_rng_seed);
+
+void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
+			       int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->rng_err_cnt);
+	} else {
+		atomic64_inc(&alg->generate_cnt);
+		atomic64_add(dlen, &alg->generate_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_rng_generate);
+
+void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->cipher_err_cnt);
+	} else {
+		atomic64_inc(&alg->encrypt_cnt);
+		atomic64_add(cryptlen, &alg->encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_skcipher_encrypt);
+
+void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->cipher_err_cnt);
+	} else {
+		atomic64_inc(&alg->decrypt_cnt);
+		atomic64_add(cryptlen, &alg->decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt);
+#endif
+
 static int __init crypto_algapi_init(void)
 {
 	crypto_init_proc();
diff --git a/crypto/rng.c b/crypto/rng.c
index 2406501b90b7..33c38a72bff5 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -35,9 +35,11 @@ static int crypto_default_rng_refcnt;
 
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
+	struct crypto_alg *alg = tfm->base.__crt_alg;
 	u8 *buf = NULL;
 	int err;
 
+	crypto_stats_get(alg);
 	if (!seed && slen) {
 		buf = kmalloc(slen, GFP_KERNEL);
 		if (!buf)
@@ -50,7 +52,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 	}
 
 	err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
-	crypto_stat_rng_seed(tfm, err);
+	crypto_stats_rng_seed(alg, err);
 out:
 	kzfree(buf);
 	return err;
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index f79918196811..a3e766dff917 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -234,34 +234,6 @@ static inline void acomp_request_set_params(struct acomp_req *req,
 		req->flags |= CRYPTO_ACOMP_ALLOC_OUTPUT;
 }
 
-static inline void crypto_stat_compress(struct acomp_req *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->compress_cnt);
-		atomic64_add(req->slen, &tfm->base.__crt_alg->compress_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_decompress(struct acomp_req *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->compress_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->decompress_cnt);
-		atomic64_add(req->slen, &tfm->base.__crt_alg->decompress_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_acomp_compress() -- Invoke asynchronous compress operation
  *
@@ -274,10 +246,13 @@ static inline void crypto_stat_decompress(struct acomp_req *req, int ret)
 static inline int crypto_acomp_compress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int slen = req->slen;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = tfm->compress(req);
-	crypto_stat_compress(req, ret);
+	crypto_stats_compress(slen, ret, alg);
 	return ret;
 }
 
@@ -293,10 +268,13 @@ static inline int crypto_acomp_compress(struct acomp_req *req)
 static inline int crypto_acomp_decompress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int slen = req->slen;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = tfm->decompress(req);
-	crypto_stat_decompress(req, ret);
+	crypto_stats_decompress(slen, ret, alg);
 	return ret;
 }
 
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 99afd78c665d..b7b8d24cf765 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -306,34 +306,6 @@ static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
 	return __crypto_aead_cast(req->base.tfm);
 }
 
-static inline void crypto_stat_aead_encrypt(struct aead_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt);
-		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->aead_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt);
-		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->decrypt_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_aead_encrypt() - encrypt plaintext
  * @req: reference to the aead_request handle that holds all information
@@ -356,13 +328,16 @@ static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret)
 static inline int crypto_aead_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_alg *alg = aead->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = crypto_aead_alg(aead)->encrypt(req);
-	crypto_stat_aead_encrypt(req, ret);
+	crypto_stats_aead_encrypt(cryptlen, alg, ret);
 	return ret;
 }
 
@@ -391,15 +366,18 @@ static inline int crypto_aead_encrypt(struct aead_request *req)
 static inline int crypto_aead_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_alg *alg = aead->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else if (req->cryptlen < crypto_aead_authsize(aead))
 		ret = -EINVAL;
 	else
 		ret = crypto_aead_alg(aead)->decrypt(req);
-	crypto_stat_aead_decrypt(req, ret);
+	crypto_stats_aead_decrypt(cryptlen, alg, ret);
 	return ret;
 }
 
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index 3dc05cf7e0a9..2d690494568c 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -271,62 +271,6 @@ static inline unsigned int crypto_akcipher_maxsize(struct crypto_akcipher *tfm)
 	return alg->max_size(tfm);
 }
 
-static inline void crypto_stat_akcipher_encrypt(struct akcipher_request *req,
-						int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->encrypt_cnt);
-		atomic64_add(req->src_len, &tfm->base.__crt_alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_akcipher_decrypt(struct akcipher_request *req,
-						int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->decrypt_cnt);
-		atomic64_add(req->src_len, &tfm->base.__crt_alg->decrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_akcipher_sign(struct akcipher_request *req,
-					     int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	else
-		atomic64_inc(&tfm->base.__crt_alg->sign_cnt);
-#endif
-}
-
-static inline void crypto_stat_akcipher_verify(struct akcipher_request *req,
-					       int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	else
-		atomic64_inc(&tfm->base.__crt_alg->verify_cnt);
-#endif
-}
-
 /**
  * crypto_akcipher_encrypt() - Invoke public key encrypt operation
  *
@@ -341,10 +285,13 @@ static inline int crypto_akcipher_encrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
+	unsigned int src_len = req->src_len;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->encrypt(req);
-	crypto_stat_akcipher_encrypt(req, ret);
+	crypto_stats_akcipher_encrypt(src_len, ret, calg);
 	return ret;
 }
 
@@ -362,10 +309,13 @@ static inline int crypto_akcipher_decrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
+	unsigned int src_len = req->src_len;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->decrypt(req);
-	crypto_stat_akcipher_decrypt(req, ret);
+	crypto_stats_akcipher_decrypt(src_len, ret, calg);
 	return ret;
 }
 
@@ -383,10 +333,12 @@ static inline int crypto_akcipher_sign(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->sign(req);
-	crypto_stat_akcipher_sign(req, ret);
+	crypto_stats_akcipher_sign(ret, calg);
 	return ret;
 }
 
@@ -404,10 +356,12 @@ static inline int crypto_akcipher_verify(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->verify(req);
-	crypto_stat_akcipher_verify(req, ret);
+	crypto_stats_akcipher_verify(ret, calg);
 	return ret;
 }
 
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 52920bed05ba..3b31c1b349ae 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -412,32 +412,6 @@ static inline void *ahash_request_ctx(struct ahash_request *req)
 int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
 			unsigned int keylen);
 
-static inline void crypto_stat_ahash_update(struct ahash_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt);
-	else
-		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
-#endif
-}
-
-static inline void crypto_stat_ahash_final(struct ahash_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->hash_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->hash_cnt);
-		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_ahash_finup() - update and finalize message digest
  * @req: reference to the ahash_request handle that holds all information
@@ -552,10 +526,14 @@ static inline int crypto_ahash_init(struct ahash_request *req)
  */
 static inline int crypto_ahash_update(struct ahash_request *req)
 {
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_ahash_reqtfm(req)->update(req);
-	crypto_stat_ahash_update(req, ret);
+	crypto_stats_ahash_update(nbytes, ret, alg);
 	return ret;
 }
 
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index bd5103a80919..1a97e1601422 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -268,42 +268,6 @@ struct kpp_secret {
 	unsigned short len;
 };
 
-static inline void crypto_stat_kpp_set_secret(struct crypto_kpp *tfm, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret)
-		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
-	else
-		atomic64_inc(&tfm->base.__crt_alg->setsecret_cnt);
-#endif
-}
-
-static inline void crypto_stat_kpp_generate_public_key(struct kpp_request *req,
-						       int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-
-	if (ret)
-		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
-	else
-		atomic64_inc(&tfm->base.__crt_alg->generate_public_key_cnt);
-#endif
-}
-
-static inline void crypto_stat_kpp_compute_shared_secret(struct kpp_request *req,
-							 int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-
-	if (ret)
-		atomic64_inc(&tfm->base.__crt_alg->kpp_err_cnt);
-	else
-		atomic64_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt);
-#endif
-}
-
 /**
  * crypto_kpp_set_secret() - Invoke kpp operation
  *
@@ -323,10 +287,12 @@ static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm,
 					const void *buffer, unsigned int len)
 {
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->set_secret(tfm, buffer, len);
-	crypto_stat_kpp_set_secret(tfm, ret);
+	crypto_stats_kpp_set_secret(calg, ret);
 	return ret;
 }
 
@@ -347,10 +313,12 @@ static inline int crypto_kpp_generate_public_key(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->generate_public_key(req);
-	crypto_stat_kpp_generate_public_key(req, ret);
+	crypto_stats_kpp_generate_public_key(calg, ret);
 	return ret;
 }
 
@@ -368,10 +336,12 @@ static inline int crypto_kpp_compute_shared_secret(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->compute_shared_secret(req);
-	crypto_stat_kpp_compute_shared_secret(req, ret);
+	crypto_stats_kpp_compute_shared_secret(calg, ret);
 	return ret;
 }
 
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 966615bba45e..022a1b896b47 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -122,29 +122,6 @@ static inline void crypto_free_rng(struct crypto_rng *tfm)
 	crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm));
 }
 
-static inline void crypto_stat_rng_seed(struct crypto_rng *tfm, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt);
-	else
-		atomic64_inc(&tfm->base.__crt_alg->seed_cnt);
-#endif
-}
-
-static inline void crypto_stat_rng_generate(struct crypto_rng *tfm,
-					    unsigned int dlen, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&tfm->base.__crt_alg->rng_err_cnt);
-	} else {
-		atomic64_inc(&tfm->base.__crt_alg->generate_cnt);
-		atomic64_add(dlen, &tfm->base.__crt_alg->generate_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_rng_generate() - get random number
  * @tfm: cipher handle
@@ -163,10 +140,12 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm,
 				      const u8 *src, unsigned int slen,
 				      u8 *dst, unsigned int dlen)
 {
+	struct crypto_alg *alg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen);
-	crypto_stat_rng_generate(tfm, dlen, ret);
+	crypto_stats_rng_generate(alg, dlen, ret);
 	return ret;
 }
 
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index dff54731ddf4..480f8301a47d 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -486,32 +486,6 @@ static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm(
 	return container_of(tfm, struct crypto_sync_skcipher, base);
 }
 
-static inline void crypto_stat_skcipher_encrypt(struct skcipher_request *req,
-						int ret, struct crypto_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->cipher_err_cnt);
-	} else {
-		atomic64_inc(&alg->encrypt_cnt);
-		atomic64_add(req->cryptlen, &alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req,
-						int ret, struct crypto_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->cipher_err_cnt);
-	} else {
-		atomic64_inc(&alg->decrypt_cnt);
-		atomic64_add(req->cryptlen, &alg->decrypt_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_skcipher_encrypt() - encrypt plaintext
  * @req: reference to the skcipher_request handle that holds all information
@@ -526,13 +500,16 @@ static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req,
 static inline int crypto_skcipher_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = tfm->encrypt(req);
-	crypto_stat_skcipher_encrypt(req, ret, tfm->base.__crt_alg);
+	crypto_stats_skcipher_encrypt(cryptlen, ret, alg);
 	return ret;
 }
 
@@ -550,13 +527,16 @@ static inline int crypto_skcipher_encrypt(struct skcipher_request *req)
 static inline int crypto_skcipher_decrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = tfm->decrypt(req);
-	crypto_stat_skcipher_decrypt(req, ret, tfm->base.__crt_alg);
+	crypto_stats_skcipher_decrypt(cryptlen, ret, alg);
 	return ret;
 }
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index b109b50906e7..e2fd24714e00 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -557,6 +557,69 @@ struct crypto_alg {
 
 } CRYPTO_MINALIGN_ATTR;
 
+#ifdef CONFIG_CRYPTO_STATS
+void crypto_stats_get(struct crypto_alg *alg);
+void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
+void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
+void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg);
+void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
+void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
+void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
+void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret);
+void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret);
+void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
+void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
+void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
+void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
+#else
+static inline void crypto_stats_get(struct crypto_alg *alg)
+{}
+static inline void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
+{}
+static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
+{}
+#endif
 /*
  * A helper struct for waiting for completion of async crypto ops
  */
@@ -975,38 +1038,6 @@ static inline struct crypto_ablkcipher *crypto_ablkcipher_reqtfm(
 	return __crypto_ablkcipher_cast(req->base.tfm);
 }
 
-static inline void crypto_stat_ablkcipher_encrypt(struct ablkcipher_request *req,
-						  int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct ablkcipher_tfm *crt =
-		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
-	} else {
-		atomic64_inc(&crt->base->base.__crt_alg->encrypt_cnt);
-		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_ablkcipher_decrypt(struct ablkcipher_request *req,
-						  int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct ablkcipher_tfm *crt =
-		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
-	} else {
-		atomic64_inc(&crt->base->base.__crt_alg->decrypt_cnt);
-		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->decrypt_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_ablkcipher_encrypt() - encrypt plaintext
  * @req: reference to the ablkcipher_request handle that holds all information
@@ -1022,10 +1053,13 @@ static inline int crypto_ablkcipher_encrypt(struct ablkcipher_request *req)
 {
 	struct ablkcipher_tfm *crt =
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
+	struct crypto_alg *alg = crt->base->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crt->encrypt(req);
-	crypto_stat_ablkcipher_encrypt(req, ret);
+	crypto_stats_ablkcipher_encrypt(nbytes, ret, alg);
 	return ret;
 }
 
@@ -1044,10 +1078,13 @@ static inline int crypto_ablkcipher_decrypt(struct ablkcipher_request *req)
 {
 	struct ablkcipher_tfm *crt =
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
+	struct crypto_alg *alg = crt->base->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crt->decrypt(req);
-	crypto_stat_ablkcipher_decrypt(req, ret);
+	crypto_stats_ablkcipher_decrypt(nbytes, ret, alg);
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b0af91c14109d6c9c0d73428dc0512b780f41d94 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:22 +0000
Subject: crypto: user - Fix invalid stat reporting

Some error count use the wrong name for getting this data.
But this had not caused any reporting problem, since all error count are shared in the same
union.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_stat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 3c14be2f7a1b..838123758423 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -93,7 +93,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	rcomp.stat_decompress_cnt = v64;
 	v64 = atomic64_read(&alg->decompress_tlen);
 	rcomp.stat_decompress_tlen = v64;
-	v64 = atomic64_read(&alg->cipher_err_cnt);
+	v64 = atomic64_read(&alg->compress_err_cnt);
 	rcomp.stat_compress_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
@@ -115,7 +115,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 	racomp.stat_decompress_cnt = v64;
 	v64 = atomic64_read(&alg->decompress_tlen);
 	racomp.stat_decompress_tlen = v64;
-	v64 = atomic64_read(&alg->cipher_err_cnt);
+	v64 = atomic64_read(&alg->compress_err_cnt);
 	racomp.stat_compress_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
@@ -222,7 +222,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 	rrng.stat_generate_tlen = v64;
 	v64 = atomic64_read(&alg->seed_cnt);
 	rrng.stat_seed_cnt = v64;
-	v64 = atomic64_read(&alg->hash_err_cnt);
+	v64 = atomic64_read(&alg->rng_err_cnt);
 	rrng.stat_rng_err_cnt = v64;
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
-- 
cgit v1.2.3-59-g8ed1b


From 5fff81729f09f3d7d9be0ace50be112bd34f0bb9 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:23 +0000
Subject: crypto: user - remove intermediate variable

The use of the v64 intermediate variable is useless, and removing it
bring to much readable code.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_stat.c | 132 ++++++++++++++--------------------------------
 1 file changed, 41 insertions(+), 91 deletions(-)

diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 838123758423..7b668c659122 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -34,22 +34,16 @@ struct crypto_dump_info {
 static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_aead raead;
-	u64 v64;
 
 	memset(&raead, 0, sizeof(raead));
 
 	strscpy(raead.type, "aead", sizeof(raead.type));
 
-	v64 = atomic64_read(&alg->encrypt_cnt);
-	raead.stat_encrypt_cnt = v64;
-	v64 = atomic64_read(&alg->encrypt_tlen);
-	raead.stat_encrypt_tlen = v64;
-	v64 = atomic64_read(&alg->decrypt_cnt);
-	raead.stat_decrypt_cnt = v64;
-	v64 = atomic64_read(&alg->decrypt_tlen);
-	raead.stat_decrypt_tlen = v64;
-	v64 = atomic64_read(&alg->aead_err_cnt);
-	raead.stat_aead_err_cnt = v64;
+	raead.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt);
+	raead.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen);
+	raead.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt);
+	raead.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen);
+	raead.stat_aead_err_cnt = atomic64_read(&alg->aead_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
@@ -57,22 +51,16 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_cipher rcipher;
-	u64 v64;
 
 	memset(&rcipher, 0, sizeof(rcipher));
 
 	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
-	v64 = atomic64_read(&alg->encrypt_cnt);
-	rcipher.stat_encrypt_cnt = v64;
-	v64 = atomic64_read(&alg->encrypt_tlen);
-	rcipher.stat_encrypt_tlen = v64;
-	v64 = atomic64_read(&alg->decrypt_cnt);
-	rcipher.stat_decrypt_cnt = v64;
-	v64 = atomic64_read(&alg->decrypt_tlen);
-	rcipher.stat_decrypt_tlen = v64;
-	v64 = atomic64_read(&alg->cipher_err_cnt);
-	rcipher.stat_cipher_err_cnt = v64;
+	rcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt);
+	rcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen);
+	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->decrypt_cnt);
+	rcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen);
+	rcipher.stat_cipher_err_cnt =  atomic64_read(&alg->cipher_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
@@ -80,21 +68,15 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_compress rcomp;
-	u64 v64;
 
 	memset(&rcomp, 0, sizeof(rcomp));
 
 	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-	v64 = atomic64_read(&alg->compress_cnt);
-	rcomp.stat_compress_cnt = v64;
-	v64 = atomic64_read(&alg->compress_tlen);
-	rcomp.stat_compress_tlen = v64;
-	v64 = atomic64_read(&alg->decompress_cnt);
-	rcomp.stat_decompress_cnt = v64;
-	v64 = atomic64_read(&alg->decompress_tlen);
-	rcomp.stat_decompress_tlen = v64;
-	v64 = atomic64_read(&alg->compress_err_cnt);
-	rcomp.stat_compress_err_cnt = v64;
+	rcomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt);
+	rcomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen);
+	rcomp.stat_decompress_cnt = atomic64_read(&alg->decompress_cnt);
+	rcomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen);
+	rcomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
@@ -102,21 +84,15 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_compress racomp;
-	u64 v64;
 
 	memset(&racomp, 0, sizeof(racomp));
 
 	strscpy(racomp.type, "acomp", sizeof(racomp.type));
-	v64 = atomic64_read(&alg->compress_cnt);
-	racomp.stat_compress_cnt = v64;
-	v64 = atomic64_read(&alg->compress_tlen);
-	racomp.stat_compress_tlen = v64;
-	v64 = atomic64_read(&alg->decompress_cnt);
-	racomp.stat_decompress_cnt = v64;
-	v64 = atomic64_read(&alg->decompress_tlen);
-	racomp.stat_decompress_tlen = v64;
-	v64 = atomic64_read(&alg->compress_err_cnt);
-	racomp.stat_compress_err_cnt = v64;
+	racomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt);
+	racomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen);
+	racomp.stat_decompress_cnt =  atomic64_read(&alg->decompress_cnt);
+	racomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen);
+	racomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
@@ -124,25 +100,17 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_akcipher rakcipher;
-	u64 v64;
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
 
 	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	v64 = atomic64_read(&alg->encrypt_cnt);
-	rakcipher.stat_encrypt_cnt = v64;
-	v64 = atomic64_read(&alg->encrypt_tlen);
-	rakcipher.stat_encrypt_tlen = v64;
-	v64 = atomic64_read(&alg->decrypt_cnt);
-	rakcipher.stat_decrypt_cnt = v64;
-	v64 = atomic64_read(&alg->decrypt_tlen);
-	rakcipher.stat_decrypt_tlen = v64;
-	v64 = atomic64_read(&alg->sign_cnt);
-	rakcipher.stat_sign_cnt = v64;
-	v64 = atomic64_read(&alg->verify_cnt);
-	rakcipher.stat_verify_cnt = v64;
-	v64 = atomic64_read(&alg->akcipher_err_cnt);
-	rakcipher.stat_akcipher_err_cnt = v64;
+	rakcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt);
+	rakcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen);
+	rakcipher.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt);
+	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen);
+	rakcipher.stat_sign_cnt = atomic64_read(&alg->sign_cnt);
+	rakcipher.stat_verify_cnt = atomic64_read(&alg->verify_cnt);
+	rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->akcipher_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
 		       sizeof(rakcipher), &rakcipher);
@@ -151,20 +119,15 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_kpp rkpp;
-	u64 v;
 
 	memset(&rkpp, 0, sizeof(rkpp));
 
 	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-	v = atomic64_read(&alg->setsecret_cnt);
-	rkpp.stat_setsecret_cnt = v;
-	v = atomic64_read(&alg->generate_public_key_cnt);
-	rkpp.stat_generate_public_key_cnt = v;
-	v = atomic64_read(&alg->compute_shared_secret_cnt);
-	rkpp.stat_compute_shared_secret_cnt = v;
-	v = atomic64_read(&alg->kpp_err_cnt);
-	rkpp.stat_kpp_err_cnt = v;
+	rkpp.stat_setsecret_cnt = atomic64_read(&alg->setsecret_cnt);
+	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->generate_public_key_cnt);
+	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->compute_shared_secret_cnt);
+	rkpp.stat_kpp_err_cnt = atomic64_read(&alg->kpp_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
@@ -172,18 +135,14 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_hash rhash;
-	u64 v64;
 
 	memset(&rhash, 0, sizeof(rhash));
 
 	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
-	v64 = atomic64_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v64;
-	v64 = atomic64_read(&alg->hash_tlen);
-	rhash.stat_hash_tlen = v64;
-	v64 = atomic64_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v64;
+	rhash.stat_hash_cnt = atomic64_read(&alg->hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen);
+	rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -191,18 +150,14 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_hash rhash;
-	u64 v64;
 
 	memset(&rhash, 0, sizeof(rhash));
 
 	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
-	v64 = atomic64_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v64;
-	v64 = atomic64_read(&alg->hash_tlen);
-	rhash.stat_hash_tlen = v64;
-	v64 = atomic64_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v64;
+	rhash.stat_hash_cnt =  atomic64_read(&alg->hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen);
+	rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -210,20 +165,15 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_stat_rng rrng;
-	u64 v64;
 
 	memset(&rrng, 0, sizeof(rrng));
 
 	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	v64 = atomic64_read(&alg->generate_cnt);
-	rrng.stat_generate_cnt = v64;
-	v64 = atomic64_read(&alg->generate_tlen);
-	rrng.stat_generate_tlen = v64;
-	v64 = atomic64_read(&alg->seed_cnt);
-	rrng.stat_seed_cnt = v64;
-	v64 = atomic64_read(&alg->rng_err_cnt);
-	rrng.stat_rng_err_cnt = v64;
+	rrng.stat_generate_cnt = atomic64_read(&alg->generate_cnt);
+	rrng.stat_generate_tlen = atomic64_read(&alg->generate_tlen);
+	rrng.stat_seed_cnt = atomic64_read(&alg->seed_cnt);
+	rrng.stat_rng_err_cnt = atomic64_read(&alg->rng_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 17c18f9e33282a170458cb5ea20759bfcb0da7d8 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:24 +0000
Subject: crypto: user - Split stats in multiple structures

Like for userspace, this patch splits stats into multiple structures,
one for each algorithm class.
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c           | 108 +++++++++++++---------------
 crypto/crypto_user_stat.c |  82 ++++++++++-----------
 include/linux/crypto.h    | 180 ++++++++++++++++++++++++++++++----------------
 3 files changed, 210 insertions(+), 160 deletions(-)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index 4c1e6079d271..a8cb5aed0069 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -259,13 +259,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	list_add(&larval->alg.cra_list, &crypto_alg_list);
 
 #ifdef CONFIG_CRYPTO_STATS
-	atomic64_set(&alg->encrypt_cnt, 0);
-	atomic64_set(&alg->decrypt_cnt, 0);
-	atomic64_set(&alg->encrypt_tlen, 0);
-	atomic64_set(&alg->decrypt_tlen, 0);
-	atomic64_set(&alg->verify_cnt, 0);
-	atomic64_set(&alg->cipher_err_cnt, 0);
-	atomic64_set(&alg->sign_cnt, 0);
+	memset(&alg->stats, 0, sizeof(alg->stats));
 #endif
 
 out:
@@ -1089,10 +1083,10 @@ void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret,
 				     struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
 	} else {
-		atomic64_inc(&alg->encrypt_cnt);
-		atomic64_add(nbytes, &alg->encrypt_tlen);
+		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
+		atomic64_add(nbytes, &alg->stats.cipher.encrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1102,10 +1096,10 @@ void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret,
 				     struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
 	} else {
-		atomic64_inc(&alg->decrypt_cnt);
-		atomic64_add(nbytes, &alg->decrypt_tlen);
+		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
+		atomic64_add(nbytes, &alg->stats.cipher.decrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1115,10 +1109,10 @@ void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->aead_err_cnt);
+		atomic64_inc(&alg->stats.aead.aead_err_cnt);
 	} else {
-		atomic64_inc(&alg->encrypt_cnt);
-		atomic64_add(cryptlen, &alg->encrypt_tlen);
+		atomic64_inc(&alg->stats.aead.encrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1128,10 +1122,10 @@ void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->aead_err_cnt);
+		atomic64_inc(&alg->stats.aead.aead_err_cnt);
 	} else {
-		atomic64_inc(&alg->decrypt_cnt);
-		atomic64_add(cryptlen, &alg->decrypt_tlen);
+		atomic64_inc(&alg->stats.aead.decrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1141,10 +1135,10 @@ void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
 	} else {
-		atomic64_inc(&alg->encrypt_cnt);
-		atomic64_add(src_len, &alg->encrypt_tlen);
+		atomic64_inc(&alg->stats.akcipher.encrypt_cnt);
+		atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1154,10 +1148,10 @@ void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
 	} else {
-		atomic64_inc(&alg->decrypt_cnt);
-		atomic64_add(src_len, &alg->decrypt_tlen);
+		atomic64_inc(&alg->stats.akcipher.decrypt_cnt);
+		atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1166,9 +1160,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt);
 void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
 	else
-		atomic64_inc(&alg->sign_cnt);
+		atomic64_inc(&alg->stats.akcipher.sign_cnt);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
@@ -1176,9 +1170,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
 void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
 	else
-		atomic64_inc(&alg->verify_cnt);
+		atomic64_inc(&alg->stats.akcipher.verify_cnt);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
@@ -1186,10 +1180,10 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
 void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->compress_err_cnt);
+		atomic64_inc(&alg->stats.compress.compress_err_cnt);
 	} else {
-		atomic64_inc(&alg->compress_cnt);
-		atomic64_add(slen, &alg->compress_tlen);
+		atomic64_inc(&alg->stats.compress.compress_cnt);
+		atomic64_add(slen, &alg->stats.compress.compress_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1198,10 +1192,10 @@ EXPORT_SYMBOL_GPL(crypto_stats_compress);
 void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->compress_err_cnt);
+		atomic64_inc(&alg->stats.compress.compress_err_cnt);
 	} else {
-		atomic64_inc(&alg->decompress_cnt);
-		atomic64_add(slen, &alg->decompress_tlen);
+		atomic64_inc(&alg->stats.compress.decompress_cnt);
+		atomic64_add(slen, &alg->stats.compress.decompress_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1211,9 +1205,9 @@ void crypto_stats_ahash_update(unsigned int nbytes, int ret,
 			       struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->hash_err_cnt);
+		atomic64_inc(&alg->stats.hash.hash_err_cnt);
 	else
-		atomic64_add(nbytes, &alg->hash_tlen);
+		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_ahash_update);
@@ -1222,10 +1216,10 @@ void crypto_stats_ahash_final(unsigned int nbytes, int ret,
 			      struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->hash_err_cnt);
+		atomic64_inc(&alg->stats.hash.hash_err_cnt);
 	} else {
-		atomic64_inc(&alg->hash_cnt);
-		atomic64_add(nbytes, &alg->hash_tlen);
+		atomic64_inc(&alg->stats.hash.hash_cnt);
+		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1234,9 +1228,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_ahash_final);
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
 	else
-		atomic64_inc(&alg->setsecret_cnt);
+		atomic64_inc(&alg->stats.kpp.setsecret_cnt);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
@@ -1244,9 +1238,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
 void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
 	else
-		atomic64_inc(&alg->generate_public_key_cnt);
+		atomic64_inc(&alg->stats.kpp.generate_public_key_cnt);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
@@ -1254,9 +1248,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
 void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
 	else
-		atomic64_inc(&alg->compute_shared_secret_cnt);
+		atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
@@ -1264,9 +1258,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
 void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->rng_err_cnt);
+		atomic64_inc(&alg->stats.rng.rng_err_cnt);
 	else
-		atomic64_inc(&alg->seed_cnt);
+		atomic64_inc(&alg->stats.rng.seed_cnt);
 	crypto_alg_put(alg);
 }
 EXPORT_SYMBOL_GPL(crypto_stats_rng_seed);
@@ -1275,10 +1269,10 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->rng_err_cnt);
+		atomic64_inc(&alg->stats.rng.rng_err_cnt);
 	} else {
-		atomic64_inc(&alg->generate_cnt);
-		atomic64_add(dlen, &alg->generate_tlen);
+		atomic64_inc(&alg->stats.rng.generate_cnt);
+		atomic64_add(dlen, &alg->stats.rng.generate_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1288,10 +1282,10 @@ void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
 	} else {
-		atomic64_inc(&alg->encrypt_cnt);
-		atomic64_add(cryptlen, &alg->encrypt_tlen);
+		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
@@ -1301,10 +1295,10 @@ void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
 	} else {
-		atomic64_inc(&alg->decrypt_cnt);
-		atomic64_add(cryptlen, &alg->decrypt_tlen);
+		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen);
 	}
 	crypto_alg_put(alg);
 }
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 7b668c659122..113bf1691560 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -39,11 +39,11 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(raead.type, "aead", sizeof(raead.type));
 
-	raead.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt);
-	raead.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen);
-	raead.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt);
-	raead.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen);
-	raead.stat_aead_err_cnt = atomic64_read(&alg->aead_err_cnt);
+	raead.stat_encrypt_cnt = atomic64_read(&alg->stats.aead.encrypt_cnt);
+	raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen);
+	raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt);
+	raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen);
+	raead.stat_aead_err_cnt = atomic64_read(&alg->stats.aead.aead_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
@@ -56,11 +56,11 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
-	rcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt);
-	rcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen);
-	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->decrypt_cnt);
-	rcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen);
-	rcipher.stat_cipher_err_cnt =  atomic64_read(&alg->cipher_err_cnt);
+	rcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.cipher.encrypt_cnt);
+	rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen);
+	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->stats.cipher.decrypt_cnt);
+	rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen);
+	rcipher.stat_cipher_err_cnt =  atomic64_read(&alg->stats.cipher.cipher_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
@@ -72,11 +72,11 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	memset(&rcomp, 0, sizeof(rcomp));
 
 	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-	rcomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt);
-	rcomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen);
-	rcomp.stat_decompress_cnt = atomic64_read(&alg->decompress_cnt);
-	rcomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen);
-	rcomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt);
+	rcomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt);
+	rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
+	rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt);
+	rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
+	rcomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
@@ -88,11 +88,11 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 	memset(&racomp, 0, sizeof(racomp));
 
 	strscpy(racomp.type, "acomp", sizeof(racomp.type));
-	racomp.stat_compress_cnt = atomic64_read(&alg->compress_cnt);
-	racomp.stat_compress_tlen = atomic64_read(&alg->compress_tlen);
-	racomp.stat_decompress_cnt =  atomic64_read(&alg->decompress_cnt);
-	racomp.stat_decompress_tlen = atomic64_read(&alg->decompress_tlen);
-	racomp.stat_compress_err_cnt = atomic64_read(&alg->compress_err_cnt);
+	racomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt);
+	racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
+	racomp.stat_decompress_cnt =  atomic64_read(&alg->stats.compress.decompress_cnt);
+	racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
+	racomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
@@ -104,13 +104,13 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 	memset(&rakcipher, 0, sizeof(rakcipher));
 
 	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	rakcipher.stat_encrypt_cnt = atomic64_read(&alg->encrypt_cnt);
-	rakcipher.stat_encrypt_tlen = atomic64_read(&alg->encrypt_tlen);
-	rakcipher.stat_decrypt_cnt = atomic64_read(&alg->decrypt_cnt);
-	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->decrypt_tlen);
-	rakcipher.stat_sign_cnt = atomic64_read(&alg->sign_cnt);
-	rakcipher.stat_verify_cnt = atomic64_read(&alg->verify_cnt);
-	rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->akcipher_err_cnt);
+	rakcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.akcipher.encrypt_cnt);
+	rakcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.akcipher.encrypt_tlen);
+	rakcipher.stat_decrypt_cnt = atomic64_read(&alg->stats.akcipher.decrypt_cnt);
+	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen);
+	rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt);
+	rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt);
+	rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->stats.akcipher.akcipher_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
 		       sizeof(rakcipher), &rakcipher);
@@ -124,10 +124,10 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-	rkpp.stat_setsecret_cnt = atomic64_read(&alg->setsecret_cnt);
-	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->generate_public_key_cnt);
-	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->compute_shared_secret_cnt);
-	rkpp.stat_kpp_err_cnt = atomic64_read(&alg->kpp_err_cnt);
+	rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt);
+	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt);
+	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt);
+	rkpp.stat_kpp_err_cnt = atomic64_read(&alg->stats.kpp.kpp_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
@@ -140,9 +140,9 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
-	rhash.stat_hash_cnt = atomic64_read(&alg->hash_cnt);
-	rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen);
-	rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt);
+	rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
+	rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -155,9 +155,9 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
-	rhash.stat_hash_cnt =  atomic64_read(&alg->hash_cnt);
-	rhash.stat_hash_tlen = atomic64_read(&alg->hash_tlen);
-	rhash.stat_hash_err_cnt = atomic64_read(&alg->hash_err_cnt);
+	rhash.stat_hash_cnt =  atomic64_read(&alg->stats.hash.hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
+	rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -170,10 +170,10 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	rrng.stat_generate_cnt = atomic64_read(&alg->generate_cnt);
-	rrng.stat_generate_tlen = atomic64_read(&alg->generate_tlen);
-	rrng.stat_seed_cnt = atomic64_read(&alg->seed_cnt);
-	rrng.stat_rng_err_cnt = atomic64_read(&alg->rng_err_cnt);
+	rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt);
+	rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen);
+	rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt);
+	rrng.stat_rng_err_cnt = atomic64_read(&alg->stats.rng.rng_err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index e2fd24714e00..8a46ab35479e 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -369,6 +369,115 @@ struct compress_alg {
 			      unsigned int slen, u8 *dst, unsigned int *dlen);
 };
 
+#ifdef CONFIG_CRYPTO_STATS
+/*
+ * struct crypto_istat_aead - statistics for AEAD algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @aead_err_cnt:	number of error for AEAD requests
+ */
+struct crypto_istat_aead {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t aead_err_cnt;
+};
+
+/*
+ * struct crypto_istat_akcipher - statistics for akcipher algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @verify_cnt:		number of verify operation
+ * @sign_cnt:		number of sign requests
+ * @akcipher_err_cnt:	number of error for akcipher requests
+ */
+struct crypto_istat_akcipher {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t verify_cnt;
+	atomic64_t sign_cnt;
+	atomic64_t akcipher_err_cnt;
+};
+
+/*
+ * struct crypto_istat_cipher - statistics for cipher algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @cipher_err_cnt:	number of error for cipher requests
+ */
+struct crypto_istat_cipher {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t cipher_err_cnt;
+};
+
+/*
+ * struct crypto_istat_compress - statistics for compress algorithm
+ * @compress_cnt:	number of compress requests
+ * @compress_tlen:	total data size handled by compress requests
+ * @decompress_cnt:	number of decompress requests
+ * @decompress_tlen:	total data size handled by decompress requests
+ * @compress_err_cnt:	number of error for compress requests
+ */
+struct crypto_istat_compress {
+	atomic64_t compress_cnt;
+	atomic64_t compress_tlen;
+	atomic64_t decompress_cnt;
+	atomic64_t decompress_tlen;
+	atomic64_t compress_err_cnt;
+};
+
+/*
+ * struct crypto_istat_hash - statistics for has algorithm
+ * @hash_cnt:		number of hash requests
+ * @hash_tlen:		total data size hashed
+ * @hash_err_cnt:	number of error for hash requests
+ */
+struct crypto_istat_hash {
+	atomic64_t hash_cnt;
+	atomic64_t hash_tlen;
+	atomic64_t hash_err_cnt;
+};
+
+/*
+ * struct crypto_istat_kpp - statistics for KPP algorithm
+ * @setsecret_cnt:		number of setsecrey operation
+ * @generate_public_key_cnt:	number of generate_public_key operation
+ * @compute_shared_secret_cnt:	number of compute_shared_secret operation
+ * @kpp_err_cnt:		number of error for KPP requests
+ */
+struct crypto_istat_kpp {
+	atomic64_t setsecret_cnt;
+	atomic64_t generate_public_key_cnt;
+	atomic64_t compute_shared_secret_cnt;
+	atomic64_t kpp_err_cnt;
+};
+
+/*
+ * struct crypto_istat_rng: statistics for RNG algorithm
+ * @generate_cnt:	number of RNG generate requests
+ * @generate_tlen:	total data size of generated data by the RNG
+ * @seed_cnt:		number of times the RNG was seeded
+ * @rng_err_cnt:	number of error for RNG requests
+ */
+struct crypto_istat_rng {
+	atomic64_t generate_cnt;
+	atomic64_t generate_tlen;
+	atomic64_t seed_cnt;
+	atomic64_t rng_err_cnt;
+};
+#endif /* CONFIG_CRYPTO_STATS */
 
 #define cra_ablkcipher	cra_u.ablkcipher
 #define cra_blkcipher	cra_u.blkcipher
@@ -454,32 +563,7 @@ struct compress_alg {
  * @cra_refcnt: internally used
  * @cra_destroy: internally used
  *
- * All following statistics are for this crypto_alg
- * @encrypt_cnt:	number of encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @compress_cnt:	number of compress requests
- * @decompress_cnt:	number of decompress requests
- * @generate_cnt:	number of RNG generate requests
- * @seed_cnt:		number of times the rng was seeded
- * @hash_cnt:		number of hash requests
- * @sign_cnt:		number of sign requests
- * @setsecret_cnt:	number of setsecrey operation
- * @generate_public_key_cnt:	number of generate_public_key operation
- * @verify_cnt:			number of verify operation
- * @compute_shared_secret_cnt:	number of compute_shared_secret operation
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @compress_tlen:	total data size handled by compress requests
- * @decompress_tlen:	total data size handled by decompress requests
- * @generate_tlen:	total data size of generated data by the RNG
- * @hash_tlen:		total data size hashed
- * @akcipher_err_cnt:	number of error for akcipher requests
- * @cipher_err_cnt:	number of error for akcipher requests
- * @compress_err_cnt:	number of error for akcipher requests
- * @aead_err_cnt:	number of error for akcipher requests
- * @hash_err_cnt:	number of error for akcipher requests
- * @rng_err_cnt:	number of error for akcipher requests
- * @kpp_err_cnt:	number of error for akcipher requests
+ * @stats: union of all possible crypto_istat_xxx structures
  *
  * The struct crypto_alg describes a generic Crypto API algorithm and is common
  * for all of the transformations. Any variable not documented here shall not
@@ -517,42 +601,14 @@ struct crypto_alg {
 
 #ifdef CONFIG_CRYPTO_STATS
 	union {
-		atomic64_t encrypt_cnt;
-		atomic64_t compress_cnt;
-		atomic64_t generate_cnt;
-		atomic64_t hash_cnt;
-		atomic64_t setsecret_cnt;
-	};
-	union {
-		atomic64_t encrypt_tlen;
-		atomic64_t compress_tlen;
-		atomic64_t generate_tlen;
-		atomic64_t hash_tlen;
-	};
-	union {
-		atomic64_t akcipher_err_cnt;
-		atomic64_t cipher_err_cnt;
-		atomic64_t compress_err_cnt;
-		atomic64_t aead_err_cnt;
-		atomic64_t hash_err_cnt;
-		atomic64_t rng_err_cnt;
-		atomic64_t kpp_err_cnt;
-	};
-	union {
-		atomic64_t decrypt_cnt;
-		atomic64_t decompress_cnt;
-		atomic64_t seed_cnt;
-		atomic64_t generate_public_key_cnt;
-	};
-	union {
-		atomic64_t decrypt_tlen;
-		atomic64_t decompress_tlen;
-	};
-	union {
-		atomic64_t verify_cnt;
-		atomic64_t compute_shared_secret_cnt;
-	};
-	atomic64_t sign_cnt;
+		struct crypto_istat_aead aead;
+		struct crypto_istat_akcipher akcipher;
+		struct crypto_istat_cipher cipher;
+		struct crypto_istat_compress compress;
+		struct crypto_istat_hash hash;
+		struct crypto_istat_rng rng;
+		struct crypto_istat_kpp kpp;
+	} stats;
 #endif /* CONFIG_CRYPTO_STATS */
 
 } CRYPTO_MINALIGN_ATTR;
-- 
cgit v1.2.3-59-g8ed1b


From 44f13133cb03ec32fc88a533673248ef5c0617e3 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:25 +0000
Subject: crypto: user - rename err_cnt parameter

Since now all crypto stats are on their own structures, it is now
useless to have the algorithm name in the err_cnt member.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c                 | 38 +++++++++++++++++++-------------------
 crypto/crypto_user_stat.c       | 18 +++++++++---------
 include/linux/crypto.h          | 28 ++++++++++++++--------------
 include/uapi/linux/cryptouser.h | 14 +++++++-------
 tools/crypto/getstat.c          | 18 +++++++++---------
 5 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index a8cb5aed0069..c0d4f9ef6b0f 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -1083,7 +1083,7 @@ void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret,
 				     struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
 		atomic64_add(nbytes, &alg->stats.cipher.encrypt_tlen);
@@ -1096,7 +1096,7 @@ void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret,
 				     struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
 		atomic64_add(nbytes, &alg->stats.cipher.decrypt_tlen);
@@ -1109,7 +1109,7 @@ void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.aead.aead_err_cnt);
+		atomic64_inc(&alg->stats.aead.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.aead.encrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen);
@@ -1122,7 +1122,7 @@ void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.aead.aead_err_cnt);
+		atomic64_inc(&alg->stats.aead.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.aead.decrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen);
@@ -1135,7 +1135,7 @@ void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.akcipher.encrypt_cnt);
 		atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen);
@@ -1148,7 +1148,7 @@ void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.akcipher.decrypt_cnt);
 		atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen);
@@ -1160,7 +1160,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt);
 void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	else
 		atomic64_inc(&alg->stats.akcipher.sign_cnt);
 	crypto_alg_put(alg);
@@ -1170,7 +1170,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
 void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.akcipher.akcipher_err_cnt);
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
 	else
 		atomic64_inc(&alg->stats.akcipher.verify_cnt);
 	crypto_alg_put(alg);
@@ -1180,7 +1180,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
 void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.compress.compress_err_cnt);
+		atomic64_inc(&alg->stats.compress.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.compress.compress_cnt);
 		atomic64_add(slen, &alg->stats.compress.compress_tlen);
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_compress);
 void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.compress.compress_err_cnt);
+		atomic64_inc(&alg->stats.compress.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.compress.decompress_cnt);
 		atomic64_add(slen, &alg->stats.compress.decompress_tlen);
@@ -1205,7 +1205,7 @@ void crypto_stats_ahash_update(unsigned int nbytes, int ret,
 			       struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.hash.hash_err_cnt);
+		atomic64_inc(&alg->stats.hash.err_cnt);
 	else
 		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
 	crypto_alg_put(alg);
@@ -1216,7 +1216,7 @@ void crypto_stats_ahash_final(unsigned int nbytes, int ret,
 			      struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.hash.hash_err_cnt);
+		atomic64_inc(&alg->stats.hash.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.hash.hash_cnt);
 		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
@@ -1228,7 +1228,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_ahash_final);
 void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.err_cnt);
 	else
 		atomic64_inc(&alg->stats.kpp.setsecret_cnt);
 	crypto_alg_put(alg);
@@ -1238,7 +1238,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
 void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.err_cnt);
 	else
 		atomic64_inc(&alg->stats.kpp.generate_public_key_cnt);
 	crypto_alg_put(alg);
@@ -1248,7 +1248,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
 void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
 {
 	if (ret)
-		atomic64_inc(&alg->stats.kpp.kpp_err_cnt);
+		atomic64_inc(&alg->stats.kpp.err_cnt);
 	else
 		atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt);
 	crypto_alg_put(alg);
@@ -1258,7 +1258,7 @@ EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
 void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic64_inc(&alg->stats.rng.rng_err_cnt);
+		atomic64_inc(&alg->stats.rng.err_cnt);
 	else
 		atomic64_inc(&alg->stats.rng.seed_cnt);
 	crypto_alg_put(alg);
@@ -1269,7 +1269,7 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
 			       int ret)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.rng.rng_err_cnt);
+		atomic64_inc(&alg->stats.rng.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.rng.generate_cnt);
 		atomic64_add(dlen, &alg->stats.rng.generate_tlen);
@@ -1282,7 +1282,7 @@ void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen);
@@ -1295,7 +1295,7 @@ void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret,
 				   struct crypto_alg *alg)
 {
 	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic64_inc(&alg->stats.cipher.cipher_err_cnt);
+		atomic64_inc(&alg->stats.cipher.err_cnt);
 	} else {
 		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
 		atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen);
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 113bf1691560..0ba00aaeb810 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -43,7 +43,7 @@ static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 	raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen);
 	raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt);
 	raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen);
-	raead.stat_aead_err_cnt = atomic64_read(&alg->stats.aead.aead_err_cnt);
+	raead.stat_err_cnt = atomic64_read(&alg->stats.aead.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
@@ -60,7 +60,7 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 	rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen);
 	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->stats.cipher.decrypt_cnt);
 	rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen);
-	rcipher.stat_cipher_err_cnt =  atomic64_read(&alg->stats.cipher.cipher_err_cnt);
+	rcipher.stat_err_cnt =  atomic64_read(&alg->stats.cipher.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
@@ -76,7 +76,7 @@ static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 	rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
 	rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt);
 	rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
-	rcomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt);
+	rcomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
@@ -92,7 +92,7 @@ static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 	racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
 	racomp.stat_decompress_cnt =  atomic64_read(&alg->stats.compress.decompress_cnt);
 	racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
-	racomp.stat_compress_err_cnt = atomic64_read(&alg->stats.compress.compress_err_cnt);
+	racomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
@@ -110,7 +110,7 @@ static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen);
 	rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt);
 	rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt);
-	rakcipher.stat_akcipher_err_cnt = atomic64_read(&alg->stats.akcipher.akcipher_err_cnt);
+	rakcipher.stat_err_cnt = atomic64_read(&alg->stats.akcipher.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
 		       sizeof(rakcipher), &rakcipher);
@@ -127,7 +127,7 @@ static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 	rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt);
 	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt);
 	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt);
-	rkpp.stat_kpp_err_cnt = atomic64_read(&alg->stats.kpp.kpp_err_cnt);
+	rkpp.stat_err_cnt = atomic64_read(&alg->stats.kpp.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
@@ -142,7 +142,7 @@ static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt);
 	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
-	rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt);
+	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -157,7 +157,7 @@ static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 
 	rhash.stat_hash_cnt =  atomic64_read(&alg->stats.hash.hash_cnt);
 	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
-	rhash.stat_hash_err_cnt = atomic64_read(&alg->stats.hash.hash_err_cnt);
+	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
@@ -173,7 +173,7 @@ static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 	rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt);
 	rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen);
 	rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt);
-	rrng.stat_rng_err_cnt = atomic64_read(&alg->stats.rng.rng_err_cnt);
+	rrng.stat_err_cnt = atomic64_read(&alg->stats.rng.err_cnt);
 
 	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 8a46ab35479e..a2967c1a08b1 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -376,14 +376,14 @@ struct compress_alg {
  * @encrypt_tlen:	total data size handled by encrypt requests
  * @decrypt_cnt:	number of decrypt requests
  * @decrypt_tlen:	total data size handled by decrypt requests
- * @aead_err_cnt:	number of error for AEAD requests
+ * @err_cnt:		number of error for AEAD requests
  */
 struct crypto_istat_aead {
 	atomic64_t encrypt_cnt;
 	atomic64_t encrypt_tlen;
 	atomic64_t decrypt_cnt;
 	atomic64_t decrypt_tlen;
-	atomic64_t aead_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -394,7 +394,7 @@ struct crypto_istat_aead {
  * @decrypt_tlen:	total data size handled by decrypt requests
  * @verify_cnt:		number of verify operation
  * @sign_cnt:		number of sign requests
- * @akcipher_err_cnt:	number of error for akcipher requests
+ * @err_cnt:		number of error for akcipher requests
  */
 struct crypto_istat_akcipher {
 	atomic64_t encrypt_cnt;
@@ -403,7 +403,7 @@ struct crypto_istat_akcipher {
 	atomic64_t decrypt_tlen;
 	atomic64_t verify_cnt;
 	atomic64_t sign_cnt;
-	atomic64_t akcipher_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -412,14 +412,14 @@ struct crypto_istat_akcipher {
  * @encrypt_tlen:	total data size handled by encrypt requests
  * @decrypt_cnt:	number of decrypt requests
  * @decrypt_tlen:	total data size handled by decrypt requests
- * @cipher_err_cnt:	number of error for cipher requests
+ * @err_cnt:		number of error for cipher requests
  */
 struct crypto_istat_cipher {
 	atomic64_t encrypt_cnt;
 	atomic64_t encrypt_tlen;
 	atomic64_t decrypt_cnt;
 	atomic64_t decrypt_tlen;
-	atomic64_t cipher_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -428,26 +428,26 @@ struct crypto_istat_cipher {
  * @compress_tlen:	total data size handled by compress requests
  * @decompress_cnt:	number of decompress requests
  * @decompress_tlen:	total data size handled by decompress requests
- * @compress_err_cnt:	number of error for compress requests
+ * @err_cnt:		number of error for compress requests
  */
 struct crypto_istat_compress {
 	atomic64_t compress_cnt;
 	atomic64_t compress_tlen;
 	atomic64_t decompress_cnt;
 	atomic64_t decompress_tlen;
-	atomic64_t compress_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
  * struct crypto_istat_hash - statistics for has algorithm
  * @hash_cnt:		number of hash requests
  * @hash_tlen:		total data size hashed
- * @hash_err_cnt:	number of error for hash requests
+ * @err_cnt:		number of error for hash requests
  */
 struct crypto_istat_hash {
 	atomic64_t hash_cnt;
 	atomic64_t hash_tlen;
-	atomic64_t hash_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -455,13 +455,13 @@ struct crypto_istat_hash {
  * @setsecret_cnt:		number of setsecrey operation
  * @generate_public_key_cnt:	number of generate_public_key operation
  * @compute_shared_secret_cnt:	number of compute_shared_secret operation
- * @kpp_err_cnt:		number of error for KPP requests
+ * @err_cnt:			number of error for KPP requests
  */
 struct crypto_istat_kpp {
 	atomic64_t setsecret_cnt;
 	atomic64_t generate_public_key_cnt;
 	atomic64_t compute_shared_secret_cnt;
-	atomic64_t kpp_err_cnt;
+	atomic64_t err_cnt;
 };
 
 /*
@@ -469,13 +469,13 @@ struct crypto_istat_kpp {
  * @generate_cnt:	number of RNG generate requests
  * @generate_tlen:	total data size of generated data by the RNG
  * @seed_cnt:		number of times the RNG was seeded
- * @rng_err_cnt:	number of error for RNG requests
+ * @err_cnt:		number of error for RNG requests
  */
 struct crypto_istat_rng {
 	atomic64_t generate_cnt;
 	atomic64_t generate_tlen;
 	atomic64_t seed_cnt;
-	atomic64_t rng_err_cnt;
+	atomic64_t err_cnt;
 };
 #endif /* CONFIG_CRYPTO_STATS */
 
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 3a70f025e27d..4dc1603919ce 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -82,7 +82,7 @@ struct crypto_stat_aead {
 	__u64 stat_encrypt_tlen;
 	__u64 stat_decrypt_cnt;
 	__u64 stat_decrypt_tlen;
-	__u64 stat_aead_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_akcipher {
@@ -93,7 +93,7 @@ struct crypto_stat_akcipher {
 	__u64 stat_decrypt_tlen;
 	__u64 stat_verify_cnt;
 	__u64 stat_sign_cnt;
-	__u64 stat_akcipher_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_cipher {
@@ -102,7 +102,7 @@ struct crypto_stat_cipher {
 	__u64 stat_encrypt_tlen;
 	__u64 stat_decrypt_cnt;
 	__u64 stat_decrypt_tlen;
-	__u64 stat_cipher_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_compress {
@@ -111,14 +111,14 @@ struct crypto_stat_compress {
 	__u64 stat_compress_tlen;
 	__u64 stat_decompress_cnt;
 	__u64 stat_decompress_tlen;
-	__u64 stat_compress_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_hash {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_hash_cnt;
 	__u64 stat_hash_tlen;
-	__u64 stat_hash_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_kpp {
@@ -126,7 +126,7 @@ struct crypto_stat_kpp {
 	__u64 stat_setsecret_cnt;
 	__u64 stat_generate_public_key_cnt;
 	__u64 stat_compute_shared_secret_cnt;
-	__u64 stat_kpp_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_rng {
@@ -134,7 +134,7 @@ struct crypto_stat_rng {
 	__u64 stat_generate_cnt;
 	__u64 stat_generate_tlen;
 	__u64 stat_seed_cnt;
-	__u64 stat_rng_err_cnt;
+	__u64 stat_err_cnt;
 };
 
 struct crypto_stat_larval {
diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c
index 57fbb94608d4..9e8ff76420fa 100644
--- a/tools/crypto/getstat.c
+++ b/tools/crypto/getstat.c
@@ -157,7 +157,7 @@ static int get_stat(const char *drivername)
 		printf("%s\tHash\n\tHash: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rhash->stat_hash_cnt, rhash->stat_hash_tlen,
-			rhash->stat_hash_err_cnt);
+			rhash->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_COMPRESS]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS];
 		struct crypto_stat_compress *rblk =
@@ -166,7 +166,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rblk->stat_compress_cnt, rblk->stat_compress_tlen,
 			rblk->stat_decompress_cnt, rblk->stat_decompress_tlen,
-			rblk->stat_compress_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_ACOMP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP];
 		struct crypto_stat_compress *rcomp =
@@ -175,7 +175,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rcomp->stat_compress_cnt, rcomp->stat_compress_tlen,
 			rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen,
-			rcomp->stat_compress_err_cnt);
+			rcomp->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AEAD]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD];
 		struct crypto_stat_aead *raead =
@@ -184,7 +184,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			raead->stat_encrypt_cnt, raead->stat_encrypt_tlen,
 			raead->stat_decrypt_cnt, raead->stat_decrypt_tlen,
-			raead->stat_aead_err_cnt);
+			raead->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER];
 		struct crypto_stat_cipher *rblk =
@@ -193,7 +193,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
-			rblk->stat_cipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER];
 		struct crypto_stat_akcipher *rblk =
@@ -203,7 +203,7 @@ static int get_stat(const char *drivername)
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
 			rblk->stat_sign_cnt, rblk->stat_verify_cnt,
-			rblk->stat_akcipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_CIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER];
 		struct crypto_stat_cipher *rblk =
@@ -212,7 +212,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
-			rblk->stat_cipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_RNG]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG];
 		struct crypto_stat_rng *rrng =
@@ -221,7 +221,7 @@ static int get_stat(const char *drivername)
 			drivername,
 			rrng->stat_seed_cnt,
 			rrng->stat_generate_cnt, rrng->stat_generate_tlen,
-			rrng->stat_rng_err_cnt);
+			rrng->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_KPP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP];
 		struct crypto_stat_kpp *rkpp =
@@ -231,7 +231,7 @@ static int get_stat(const char *drivername)
 			rkpp->stat_setsecret_cnt,
 			rkpp->stat_generate_public_key_cnt,
 			rkpp->stat_compute_shared_secret_cnt,
-			rkpp->stat_kpp_err_cnt);
+			rkpp->stat_err_cnt);
 	} else {
 		fprintf(stderr, "%s is of an unknown algorithm\n", drivername);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1f6669b9716c6c98391b0f756e060892b32b8ca7 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 29 Nov 2018 14:42:26 +0000
Subject: crypto: user - Add crypto_stats_init

This patch add the crypto_stats_init() function.
This will permit to remove some ifdef from __crypto_register_alg().

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c        | 10 +++++++---
 include/linux/crypto.h |  3 +++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/crypto/algapi.c b/crypto/algapi.c
index c0d4f9ef6b0f..8b65ada33e5d 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -258,9 +258,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	list_add(&alg->cra_list, &crypto_alg_list);
 	list_add(&larval->alg.cra_list, &crypto_alg_list);
 
-#ifdef CONFIG_CRYPTO_STATS
-	memset(&alg->stats, 0, sizeof(alg->stats));
-#endif
+	crypto_stats_init(alg);
 
 out:
 	return larval;
@@ -1073,6 +1071,12 @@ int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
 EXPORT_SYMBOL_GPL(crypto_type_has_alg);
 
 #ifdef CONFIG_CRYPTO_STATS
+void crypto_stats_init(struct crypto_alg *alg)
+{
+	memset(&alg->stats, 0, sizeof(alg->stats));
+}
+EXPORT_SYMBOL_GPL(crypto_stats_init);
+
 void crypto_stats_get(struct crypto_alg *alg)
 {
 	crypto_alg_get(alg);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index a2967c1a08b1..9850b41e38ae 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -614,6 +614,7 @@ struct crypto_alg {
 } CRYPTO_MINALIGN_ATTR;
 
 #ifdef CONFIG_CRYPTO_STATS
+void crypto_stats_init(struct crypto_alg *alg);
 void crypto_stats_get(struct crypto_alg *alg);
 void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg);
 void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg);
@@ -635,6 +636,8 @@ void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int re
 void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
 void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
 #else
+static inline void crypto_stats_init(struct crypto_alg *alg)
+{}
 static inline void crypto_stats_get(struct crypto_alg *alg)
 {}
 static inline void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg)
-- 
cgit v1.2.3-59-g8ed1b


From c35828ea906a7c76632a0211e59c392903cd4615 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Fri, 30 Nov 2018 14:31:48 +0530
Subject: crypto: chcr - small packet Tx stalls the queue

Immediate packets sent to hardware should include the work
request length in calculating the flits. WR occupy one flit and
if not accounted result in invalid request which stalls the HW
queue.

Cc: stable@vger.kernel.org
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_ipsec.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c
index 461b97e2f1fd..1ff8738631a3 100644
--- a/drivers/crypto/chelsio/chcr_ipsec.c
+++ b/drivers/crypto/chelsio/chcr_ipsec.c
@@ -303,7 +303,10 @@ static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 
 static inline int is_eth_imm(const struct sk_buff *skb, unsigned int kctx_len)
 {
-	int hdrlen = sizeof(struct chcr_ipsec_req) + kctx_len;
+	int hdrlen;
+
+	hdrlen = sizeof(struct fw_ulptx_wr) +
+		 sizeof(struct chcr_ipsec_req) + kctx_len;
 
 	hdrlen += sizeof(struct cpl_tx_pkt);
 	if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
-- 
cgit v1.2.3-59-g8ed1b


From 8362ea16f69fe59c4d012f0748e586ad09391f41 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Fri, 30 Nov 2018 14:32:09 +0530
Subject: crypto: chcr - ESN for Inline IPSec Tx

Send SPI, 64b seq nos and 64b IV with aadiv drop for inline crypto.
This information is added in outgoing packet after the CPL TX PKT XT
and removed by hardware.
The aad, auth and cipher offsets are then adjusted for ESN enabled tunnel.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_core.h  |   9 ++
 drivers/crypto/chelsio/chcr_ipsec.c | 175 ++++++++++++++++++++++++++++--------
 2 files changed, 148 insertions(+), 36 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h
index de3a9c085daf..4616663e25d7 100644
--- a/drivers/crypto/chelsio/chcr_core.h
+++ b/drivers/crypto/chelsio/chcr_core.h
@@ -159,8 +159,17 @@ struct chcr_ipsec_wr {
 	struct chcr_ipsec_req req;
 };
 
+#define ESN_IV_INSERT_OFFSET 12
+struct chcr_ipsec_aadiv {
+	__be32 spi;
+	u8 seq_no[8];
+	u8 iv[8];
+};
+
 struct ipsec_sa_entry {
 	int hmac_ctrl;
+	u16 esn;
+	u16 imm;
 	unsigned int enckey_len;
 	unsigned int kctx_len;
 	unsigned int authsize;
diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c
index 1ff8738631a3..9321d2b1a5ab 100644
--- a/drivers/crypto/chelsio/chcr_ipsec.c
+++ b/drivers/crypto/chelsio/chcr_ipsec.c
@@ -76,12 +76,14 @@ static int chcr_xfrm_add_state(struct xfrm_state *x);
 static void chcr_xfrm_del_state(struct xfrm_state *x);
 static void chcr_xfrm_free_state(struct xfrm_state *x);
 static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
+static void chcr_advance_esn_state(struct xfrm_state *x);
 
 static const struct xfrmdev_ops chcr_xfrmdev_ops = {
 	.xdo_dev_state_add      = chcr_xfrm_add_state,
 	.xdo_dev_state_delete   = chcr_xfrm_del_state,
 	.xdo_dev_state_free     = chcr_xfrm_free_state,
 	.xdo_dev_offload_ok     = chcr_ipsec_offload_ok,
+	.xdo_dev_state_advance_esn = chcr_advance_esn_state,
 };
 
 /* Add offload xfrms to Chelsio Interface */
@@ -210,10 +212,6 @@ static int chcr_xfrm_add_state(struct xfrm_state *x)
 		pr_debug("CHCR: Cannot offload compressed xfrm states\n");
 		return -EINVAL;
 	}
-	if (x->props.flags & XFRM_STATE_ESN) {
-		pr_debug("CHCR: Cannot offload ESN xfrm states\n");
-		return -EINVAL;
-	}
 	if (x->props.family != AF_INET &&
 	    x->props.family != AF_INET6) {
 		pr_debug("CHCR: Only IPv4/6 xfrm state offloaded\n");
@@ -266,6 +264,8 @@ static int chcr_xfrm_add_state(struct xfrm_state *x)
 	}
 
 	sa_entry->hmac_ctrl = chcr_ipsec_setauthsize(x, sa_entry);
+	if (x->props.flags & XFRM_STATE_ESN)
+		sa_entry->esn = 1;
 	chcr_ipsec_setkey(x, sa_entry);
 	x->xso.offload_handle = (unsigned long)sa_entry;
 	try_module_get(THIS_MODULE);
@@ -294,31 +294,57 @@ static void chcr_xfrm_free_state(struct xfrm_state *x)
 
 static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 {
-	/* Offload with IP options is not supported yet */
-	if (ip_hdr(skb)->ihl > 5)
-		return false;
-
+	if (x->props.family == AF_INET) {
+		/* Offload with IP options is not supported yet */
+		if (ip_hdr(skb)->ihl > 5)
+			return false;
+	} else {
+		/* Offload with IPv6 extension headers is not support yet */
+		if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr))
+			return false;
+	}
 	return true;
 }
 
-static inline int is_eth_imm(const struct sk_buff *skb, unsigned int kctx_len)
+static void chcr_advance_esn_state(struct xfrm_state *x)
+{
+	/* do nothing */
+	if (!x->xso.offload_handle)
+		return;
+}
+
+static inline int is_eth_imm(const struct sk_buff *skb,
+			     struct ipsec_sa_entry *sa_entry)
 {
+	unsigned int kctx_len;
 	int hdrlen;
 
+	kctx_len = sa_entry->kctx_len;
 	hdrlen = sizeof(struct fw_ulptx_wr) +
 		 sizeof(struct chcr_ipsec_req) + kctx_len;
 
 	hdrlen += sizeof(struct cpl_tx_pkt);
+	if (sa_entry->esn)
+		hdrlen += (DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), 16)
+			   << 4);
 	if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
 		return hdrlen;
 	return 0;
 }
 
 static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb,
-					     unsigned int kctx_len)
+					     struct ipsec_sa_entry *sa_entry)
 {
+	unsigned int kctx_len;
 	unsigned int flits;
-	int hdrlen = is_eth_imm(skb, kctx_len);
+	int aadivlen;
+	int hdrlen;
+
+	kctx_len = sa_entry->kctx_len;
+	hdrlen = is_eth_imm(skb, sa_entry);
+	aadivlen = sa_entry->esn ? DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv),
+						16) : 0;
+	aadivlen <<= 4;
 
 	/* If the skb is small enough, we can pump it out as a work request
 	 * with only immediate data.  In that case we just have to have the
@@ -341,13 +367,69 @@ static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb,
 	flits += (sizeof(struct fw_ulptx_wr) +
 		  sizeof(struct chcr_ipsec_req) +
 		  kctx_len +
-		  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
+		  sizeof(struct cpl_tx_pkt_core) +
+		  aadivlen) / sizeof(__be64);
 	return flits;
 }
 
+inline void *copy_esn_pktxt(struct sk_buff *skb,
+			    struct net_device *dev,
+			    void *pos,
+			    struct ipsec_sa_entry *sa_entry)
+{
+	struct chcr_ipsec_aadiv *aadiv;
+	struct ulptx_idata *sc_imm;
+	struct ip_esp_hdr *esphdr;
+	struct xfrm_offload *xo;
+	struct sge_eth_txq *q;
+	struct adapter *adap;
+	struct port_info *pi;
+	__be64 seqno;
+	u32 qidx;
+	u32 seqlo;
+	u8 *iv;
+	int eoq;
+	int len;
+
+	pi = netdev_priv(dev);
+	adap = pi->adapter;
+	qidx = skb->queue_mapping;
+	q = &adap->sge.ethtxq[qidx + pi->first_qset];
+
+	/* end of queue, reset pos to start of queue */
+	eoq = (void *)q->q.stat - pos;
+	if (!eoq)
+		pos = q->q.desc;
+
+	len = DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), 16) << 4;
+	memset(pos, 0, len);
+	aadiv = (struct chcr_ipsec_aadiv *)pos;
+	esphdr = (struct ip_esp_hdr *)skb_transport_header(skb);
+	iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr);
+	xo = xfrm_offload(skb);
+
+	aadiv->spi = (esphdr->spi);
+	seqlo = htonl(esphdr->seq_no);
+	seqno = cpu_to_be64(seqlo + ((u64)xo->seq.hi << 32));
+	memcpy(aadiv->seq_no, &seqno, 8);
+	iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr);
+	memcpy(aadiv->iv, iv, 8);
+
+	if (sa_entry->imm) {
+		sc_imm = (struct ulptx_idata *)(pos +
+			  (DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv),
+					sizeof(__be64)) << 3));
+		sc_imm->cmd_more = FILL_CMD_MORE(!sa_entry->imm);
+		sc_imm->len = cpu_to_be32(sa_entry->imm);
+	}
+	pos += len;
+	return pos;
+}
+
 inline void *copy_cpltx_pktxt(struct sk_buff *skb,
-				struct net_device *dev,
-				void *pos)
+			      struct net_device *dev,
+			      void *pos,
+			      struct ipsec_sa_entry *sa_entry)
 {
 	struct cpl_tx_pkt_core *cpl;
 	struct sge_eth_txq *q;
@@ -382,6 +464,9 @@ inline void *copy_cpltx_pktxt(struct sk_buff *skb,
 	cpl->ctrl1 = cpu_to_be64(cntrl);
 
 	pos += sizeof(struct cpl_tx_pkt_core);
+	/* Copy ESN info for HW */
+	if (sa_entry->esn)
+		pos = copy_esn_pktxt(skb, dev, pos, sa_entry);
 	return pos;
 }
 
@@ -428,7 +513,7 @@ inline void *copy_key_cpltx_pktxt(struct sk_buff *skb,
 		pos = (u8 *)q->q.desc + (key_len - left);
 	}
 	/* Copy CPL TX PKT XT */
-	pos = copy_cpltx_pktxt(skb, dev, pos);
+	pos = copy_cpltx_pktxt(skb, dev, pos, sa_entry);
 
 	return pos;
 }
@@ -441,10 +526,16 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adap = pi->adapter;
-	unsigned int immdatalen = 0;
 	unsigned int ivsize = GCM_ESP_IV_SIZE;
 	struct chcr_ipsec_wr *wr;
+	u16 immdatalen = 0;
 	unsigned int flits;
+	u32 ivinoffset;
+	u32 aadstart;
+	u32 aadstop;
+	u32 ciphstart;
+	u32 ivdrop = 0;
+	u32 esnlen = 0;
 	u32 wr_mid;
 	int qidx = skb_get_queue_mapping(skb);
 	struct sge_eth_txq *q = &adap->sge.ethtxq[qidx + pi->first_qset];
@@ -453,10 +544,17 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 
 	atomic_inc(&adap->chcr_stats.ipsec_cnt);
 
-	flits = calc_tx_sec_flits(skb, kctx_len);
+	flits = calc_tx_sec_flits(skb, sa_entry);
+	if (sa_entry->esn)
+		ivdrop = 1;
 
-	if (is_eth_imm(skb, kctx_len))
+	if (is_eth_imm(skb, sa_entry)) {
 		immdatalen = skb->len;
+		sa_entry->imm = immdatalen;
+	}
+
+	if (sa_entry->esn)
+		esnlen = sizeof(struct chcr_ipsec_aadiv);
 
 	/* WR Header */
 	wr = (struct chcr_ipsec_wr *)pos;
@@ -481,33 +579,38 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 					 sizeof(wr->req.key_ctx) +
 					 kctx_len +
 					 sizeof(struct cpl_tx_pkt_core) +
-					 immdatalen);
+					 esnlen +
+					 (esnlen ? 0 : immdatalen));
 
 	/* CPL_SEC_PDU */
+	ivinoffset = sa_entry->esn ? (ESN_IV_INSERT_OFFSET + 1) :
+				     (skb_transport_offset(skb) +
+				      sizeof(struct ip_esp_hdr) + 1);
 	wr->req.sec_cpl.op_ivinsrtofst = htonl(
 				CPL_TX_SEC_PDU_OPCODE_V(CPL_TX_SEC_PDU) |
 				CPL_TX_SEC_PDU_CPLLEN_V(2) |
 				CPL_TX_SEC_PDU_PLACEHOLDER_V(1) |
 				CPL_TX_SEC_PDU_IVINSRTOFST_V(
-				(skb_transport_offset(skb) +
-				sizeof(struct ip_esp_hdr) + 1)));
+							     ivinoffset));
 
-	wr->req.sec_cpl.pldlen = htonl(skb->len);
+	wr->req.sec_cpl.pldlen = htonl(skb->len + esnlen);
+	aadstart = sa_entry->esn ? 1 : (skb_transport_offset(skb) + 1);
+	aadstop = sa_entry->esn ? ESN_IV_INSERT_OFFSET :
+				  (skb_transport_offset(skb) +
+				   sizeof(struct ip_esp_hdr));
+	ciphstart = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr) +
+		    GCM_ESP_IV_SIZE + 1;
+	ciphstart += sa_entry->esn ?  esnlen : 0;
 
 	wr->req.sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-				(skb_transport_offset(skb) + 1),
-				(skb_transport_offset(skb) +
-				 sizeof(struct ip_esp_hdr)),
-				(skb_transport_offset(skb) +
-				 sizeof(struct ip_esp_hdr) +
-				 GCM_ESP_IV_SIZE + 1), 0);
+							aadstart,
+							aadstop,
+							ciphstart, 0);
 
 	wr->req.sec_cpl.cipherstop_lo_authinsert =
-		FILL_SEC_CPL_AUTHINSERT(0, skb_transport_offset(skb) +
-					   sizeof(struct ip_esp_hdr) +
-					   GCM_ESP_IV_SIZE + 1,
-					   sa_entry->authsize,
-					   sa_entry->authsize);
+		FILL_SEC_CPL_AUTHINSERT(0, ciphstart,
+					sa_entry->authsize,
+					 sa_entry->authsize);
 	wr->req.sec_cpl.seqno_numivs =
 		FILL_SEC_CPL_SCMD0_SEQNO(CHCR_ENCRYPT_OP, 1,
 					 CHCR_SCMD_CIPHER_MODE_AES_GCM,
@@ -515,7 +618,7 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 					 sa_entry->hmac_ctrl,
 					 ivsize >> 1);
 	wr->req.sec_cpl.ivgen_hdrlen =  FILL_SEC_CPL_IVGEN_HDRLEN(0, 0, 1,
-								  0, 0, 0);
+								  0, ivdrop, 0);
 
 	pos += sizeof(struct fw_ulptx_wr) +
 	       sizeof(struct ulp_txpkt) +
@@ -593,7 +696,7 @@ out_free:       dev_kfree_skb_any(skb);
 
 	cxgb4_reclaim_completed_tx(adap, &q->q, true);
 
-	flits = calc_tx_sec_flits(skb, sa_entry->kctx_len);
+	flits = calc_tx_sec_flits(skb, sa_entry);
 	ndesc = flits_to_desc(flits);
 	credits = txq_avail(&q->q) - ndesc;
 
@@ -606,7 +709,7 @@ out_free:       dev_kfree_skb_any(skb);
 		return NETDEV_TX_BUSY;
 	}
 
-	if (is_eth_imm(skb, kctx_len))
+	if (is_eth_imm(skb, sa_entry))
 		immediate = true;
 
 	if (!immediate &&
-- 
cgit v1.2.3-59-g8ed1b


From 88d905e20b11f7ad841e3afddaf1d59b6693c4a1 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sat, 1 Dec 2018 04:56:30 -0500
Subject: crypto: cavium/nitrox - convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/nitrox_debugfs.c | 48 +++++----------------------
 1 file changed, 9 insertions(+), 39 deletions(-)

diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.c b/drivers/crypto/cavium/nitrox/nitrox_debugfs.c
index 5f3cd5fafe04..0196b992280f 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_debugfs.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.c
@@ -13,18 +13,7 @@ static int firmware_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int firmware_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, firmware_show, inode->i_private);
-}
-
-static const struct file_operations firmware_fops = {
-	.owner = THIS_MODULE,
-	.open = firmware_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(firmware);
 
 static int device_show(struct seq_file *s, void *v)
 {
@@ -41,18 +30,7 @@ static int device_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int nitrox_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, device_show, inode->i_private);
-}
-
-static const struct file_operations nitrox_fops = {
-	.owner = THIS_MODULE,
-	.open = nitrox_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(device);
 
 static int stats_show(struct seq_file *s, void *v)
 {
@@ -69,18 +47,7 @@ static int stats_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int nitrox_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, stats_show, inode->i_private);
-}
-
-static const struct file_operations nitrox_stats_fops = {
-	.owner = THIS_MODULE,
-	.open = nitrox_stats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(stats);
 
 void nitrox_debugfs_exit(struct nitrox_device *ndev)
 {
@@ -97,13 +64,16 @@ int nitrox_debugfs_init(struct nitrox_device *ndev)
 		return -ENOMEM;
 
 	ndev->debugfs_dir = dir;
-	f = debugfs_create_file("firmware", 0400, dir, ndev, &firmware_fops);
+	f = debugfs_create_file("firmware", 0400, dir, ndev,
+				&firmware_fops);
 	if (!f)
 		goto err;
-	f = debugfs_create_file("device", 0400, dir, ndev, &nitrox_fops);
+	f = debugfs_create_file("device", 0400, dir, ndev,
+				&device_fops);
 	if (!f)
 		goto err;
-	f = debugfs_create_file("stats", 0400, dir, ndev, &nitrox_stats_fops);
+	f = debugfs_create_file("stats", 0400, dir, ndev,
+				&stats_fops);
 	if (!f)
 		goto err;
 
-- 
cgit v1.2.3-59-g8ed1b


From a00fa0c88774bea9a102fc616598d9ee52765451 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 3 Dec 2018 19:52:49 -0800
Subject: crypto: arm64/nhpoly1305 - add NEON-accelerated NHPoly1305
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an ARM64 NEON implementation of NHPoly1305, an ε-almost-∆-universal
hash function used in the Adiantum encryption mode.  For now, only the
NH portion is actually NEON-accelerated; the Poly1305 part is less
performance-critical so is just implemented in C.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> # big-endian
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/Kconfig                |   5 ++
 arch/arm64/crypto/Makefile               |   3 +
 arch/arm64/crypto/nh-neon-core.S         | 103 +++++++++++++++++++++++++++++++
 arch/arm64/crypto/nhpoly1305-neon-glue.c |  77 +++++++++++++++++++++++
 4 files changed, 188 insertions(+)
 create mode 100644 arch/arm64/crypto/nh-neon-core.S
 create mode 100644 arch/arm64/crypto/nhpoly1305-neon-glue.c

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index a5606823ed4d..3f5aeb786192 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -106,6 +106,11 @@ config CRYPTO_CHACHA20_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 
+config CRYPTO_NHPOLY1305_NEON
+	tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_NHPOLY1305
+
 config CRYPTO_AES_ARM64_BS
 	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index f476fede09ba..125dbb10a93e 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -53,6 +53,9 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
 chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
 
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
 aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
 
diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S
new file mode 100644
index 000000000000..e05570c38de7
--- /dev/null
+++ b/arch/arm64/crypto/nh-neon-core.S
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+	KEY		.req	x0
+	MESSAGE		.req	x1
+	MESSAGE_LEN	.req	x2
+	HASH		.req	x3
+
+	PASS0_SUMS	.req	v0
+	PASS1_SUMS	.req	v1
+	PASS2_SUMS	.req	v2
+	PASS3_SUMS	.req	v3
+	K0		.req	v4
+	K1		.req	v5
+	K2		.req	v6
+	K3		.req	v7
+	T0		.req	v8
+	T1		.req	v9
+	T2		.req	v10
+	T3		.req	v11
+	T4		.req	v12
+	T5		.req	v13
+	T6		.req	v14
+	T7		.req	v15
+
+.macro _nh_stride	k0, k1, k2, k3
+
+	// Load next message stride
+	ld1		{T3.16b}, [MESSAGE], #16
+
+	// Load next key stride
+	ld1		{\k3\().4s}, [KEY], #16
+
+	// Add message words to key words
+	add		T0.4s, T3.4s, \k0\().4s
+	add		T1.4s, T3.4s, \k1\().4s
+	add		T2.4s, T3.4s, \k2\().4s
+	add		T3.4s, T3.4s, \k3\().4s
+
+	// Multiply 32x32 => 64 and accumulate
+	mov		T4.d[0], T0.d[1]
+	mov		T5.d[0], T1.d[1]
+	mov		T6.d[0], T2.d[1]
+	mov		T7.d[0], T3.d[1]
+	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
+	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
+	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
+	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_neon)
+
+	ld1		{K0.4s,K1.4s}, [KEY], #32
+	  movi		PASS0_SUMS.2d, #0
+	  movi		PASS1_SUMS.2d, #0
+	ld1		{K2.4s}, [KEY], #16
+	  movi		PASS2_SUMS.2d, #0
+	  movi		PASS3_SUMS.2d, #0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	blt		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3
+	_nh_stride	K1, K2, K3, K0
+	_nh_stride	K2, K3, K0, K1
+	_nh_stride	K3, K0, K1, K2
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	bge		.Lloop4
+
+.Lloop4_done:
+	ands		MESSAGE_LEN, MESSAGE_LEN, #63
+	beq		.Ldone
+	_nh_stride	K0, K1, K2, K3
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K1, K2, K3, K0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K2, K3, K0, K1
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
+	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
+	st1		{T0.16b,T1.16b}, [HASH]
+	ret
+ENDPROC(nh_neon)
diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 000000000000..22cc32ac9448
--- /dev/null
+++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (ARM64 NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_neon(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !may_use_simd())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_neon_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+		kernel_neon_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-neon",
+	.base.cra_priority	= 200,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_neon_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
-- 
cgit v1.2.3-59-g8ed1b


From cc7cf991e9eb54cac7733dc7d8f3a8591ba6e1c3 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 3 Dec 2018 19:52:50 -0800
Subject: crypto: arm64/chacha20 - add XChaCha20 support

Add an XChaCha20 implementation that is hooked up to the ARM64 NEON
implementation of ChaCha20.  This can be used by Adiantum.

A NEON implementation of single-block HChaCha20 is also added so that
XChaCha20 can use it rather than the generic implementation.  This
required refactoring the ChaCha20 permutation into its own function.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/Kconfig              |   2 +-
 arch/arm64/crypto/chacha20-neon-core.S |  65 +++++++++++++++------
 arch/arm64/crypto/chacha20-neon-glue.c | 101 +++++++++++++++++++++++++--------
 3 files changed, 125 insertions(+), 43 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 3f5aeb786192..d54ddb8468ef 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	select CRYPTO_SIMD
 
 config CRYPTO_CHACHA20_NEON
-	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	tristate "ChaCha20 and XChaCha20 stream ciphers using NEON instructions"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S
index 13c85e272c2a..0571e45a1a0a 100644
--- a/arch/arm64/crypto/chacha20-neon-core.S
+++ b/arch/arm64/crypto/chacha20-neon-core.S
@@ -23,25 +23,20 @@
 	.text
 	.align		6
 
-ENTRY(chacha20_block_xor_neon)
-	// x0: Input state matrix, s
-	// x1: 1 data block output, o
-	// x2: 1 data block input, i
-
-	//
-	// This function encrypts one ChaCha20 block by loading the state matrix
-	// in four NEON registers. It performs matrix operation on four words in
-	// parallel, but requires shuffling to rearrange the words after each
-	// round.
-	//
-
-	// x0..3 = s0..3
-	adr		x3, ROT8
-	ld1		{v0.4s-v3.4s}, [x0]
-	ld1		{v8.4s-v11.4s}, [x0]
-	ld1		{v12.4s}, [x3]
+/*
+ * chacha20_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * Clobbers: x3, x10, v4, v12
+ */
+chacha20_permute:
 
 	mov		x3, #10
+	adr		x10, ROT8
+	ld1		{v12.4s}, [x10]
 
 .Ldoubleround:
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -105,6 +100,23 @@ ENTRY(chacha20_block_xor_neon)
 	subs		x3, x3, #1
 	b.ne		.Ldoubleround
 
+	ret
+ENDPROC(chacha20_permute)
+
+ENTRY(chacha20_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	// x0..3 = s0..3
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+
+	bl		chacha20_permute
+
 	ld1		{v4.16b-v7.16b}, [x2]
 
 	// o0 = i0 ^ (x0 + s0)
@@ -125,9 +137,28 @@ ENTRY(chacha20_block_xor_neon)
 
 	st1		{v0.16b-v3.16b}, [x1]
 
+	ldp		x29, x30, [sp], #16
 	ret
 ENDPROC(chacha20_block_xor_neon)
 
+ENTRY(hchacha20_block_neon)
+	// x0: Input state matrix, s
+	// x1: output (8 32-bit words)
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	ld1		{v0.4s-v3.4s}, [x0]
+
+	bl		chacha20_permute
+
+	st1		{v0.16b}, [x1], #16
+	st1		{v3.16b}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+ENDPROC(hchacha20_block_neon)
+
 	.align		6
 ENTRY(chacha20_4block_xor_neon)
 	// x0: Input state matrix, s
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
index 96e0cfb8c3f5..a5b9cbc0c4de 100644
--- a/arch/arm64/crypto/chacha20-neon-glue.c
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -30,6 +30,7 @@
 
 asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
 asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
 
 static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
@@ -65,20 +66,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 	kernel_neon_end();
 }
 
-static int chacha20_neon(struct skcipher_request *req)
+static int chacha20_neon_stream_xor(struct skcipher_request *req,
+				    struct chacha_ctx *ctx, u8 *iv)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	struct skcipher_walk walk;
 	u32 state[16];
 	int err;
 
-	if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
-		return crypto_chacha_crypt(req);
-
 	err = skcipher_walk_virt(&walk, req, false);
 
-	crypto_chacha_init(state, ctx, walk.iv);
+	crypto_chacha_init(state, ctx, iv);
 
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
@@ -94,22 +91,73 @@ static int chacha20_neon(struct skcipher_request *req)
 	return err;
 }
 
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-neon",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA_KEY_SIZE,
-	.max_keysize		= CHACHA_KEY_SIZE,
-	.ivsize			= CHACHA_IV_SIZE,
-	.chunksize		= CHACHA_BLOCK_SIZE,
-	.walksize		= 4 * CHACHA_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_neon,
-	.decrypt		= chacha20_neon,
+static int chacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
+
+	return chacha20_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_xchacha_crypt(req);
+
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_neon_begin();
+	hchacha20_block_neon(state, subctx.key);
+	kernel_neon_end();
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	return chacha20_neon_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha20_neon,
+		.decrypt		= chacha20_neon,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha20_neon,
+		.decrypt		= xchacha20_neon,
+	}
 };
 
 static int __init chacha20_simd_mod_init(void)
@@ -117,12 +165,12 @@ static int __init chacha20_simd_mod_init(void)
 	if (!(elf_hwcap & HWCAP_ASIMD))
 		return -ENODEV;
 
-	return crypto_register_skcipher(&alg);
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 static void __exit chacha20_simd_mod_fini(void)
 {
-	crypto_unregister_skcipher(&alg);
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 module_init(chacha20_simd_mod_init);
@@ -131,3 +179,6 @@ module_exit(chacha20_simd_mod_fini);
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
-- 
cgit v1.2.3-59-g8ed1b


From 95a34b779e2a45b14e73cee1e7eec11870efb2ea Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 3 Dec 2018 19:52:51 -0800
Subject: crypto: arm64/chacha20 - refactor to allow varying number of rounds

In preparation for adding XChaCha12 support, rename/refactor the ARM64
NEON implementation of ChaCha20 to support different numbers of rounds.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/Makefile             |   4 +-
 arch/arm64/crypto/chacha-neon-core.S   | 484 +++++++++++++++++++++++++++++++++
 arch/arm64/crypto/chacha-neon-glue.c   | 189 +++++++++++++
 arch/arm64/crypto/chacha20-neon-core.S | 481 --------------------------------
 arch/arm64/crypto/chacha20-neon-glue.c | 184 -------------
 5 files changed, 675 insertions(+), 667 deletions(-)
 create mode 100644 arch/arm64/crypto/chacha-neon-core.S
 create mode 100644 arch/arm64/crypto/chacha-neon-glue.c
 delete mode 100644 arch/arm64/crypto/chacha20-neon-core.S
 delete mode 100644 arch/arm64/crypto/chacha20-neon-glue.c

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 125dbb10a93e..a4ffd9fe3265 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -50,8 +50,8 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
 
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
new file mode 100644
index 000000000000..3d3a12db5204
--- /dev/null
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -0,0 +1,484 @@
+/*
+ * ChaCha/XChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.align		6
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+chacha_permute:
+
+	adr		x10, ROT8
+	ld1		{v12.4s}, [x10]
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	ext		v1.16b, v1.16b, v1.16b, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	ext		v3.16b, v3.16b, v3.16b, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	ext		v1.16b, v1.16b, v1.16b, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	ext		v3.16b, v3.16b, v3.16b, #4
+
+	subs		w3, w3, #2
+	b.ne		.Ldoubleround
+
+	ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+	// w3: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	// x0..3 = s0..3
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+
+	bl		chacha_permute
+
+	ld1		{v4.16b-v7.16b}, [x2]
+
+	// o0 = i0 ^ (x0 + s0)
+	add		v0.4s, v0.4s, v8.4s
+	eor		v0.16b, v0.16b, v4.16b
+
+	// o1 = i1 ^ (x1 + s1)
+	add		v1.4s, v1.4s, v9.4s
+	eor		v1.16b, v1.16b, v5.16b
+
+	// o2 = i2 ^ (x2 + s2)
+	add		v2.4s, v2.4s, v10.4s
+	eor		v2.16b, v2.16b, v6.16b
+
+	// o3 = i3 ^ (x3 + s3)
+	add		v3.4s, v3.4s, v11.4s
+	eor		v3.16b, v3.16b, v7.16b
+
+	st1		{v0.16b-v3.16b}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+	// x0: Input state matrix, s
+	// x1: output (8 32-bit words)
+	// w2: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	ld1		{v0.4s-v3.4s}, [x0]
+
+	mov		w3, w2
+	bl		chacha_permute
+
+	st1		{v0.16b}, [x1], #16
+	st1		{v3.16b}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+ENDPROC(hchacha_block_neon)
+
+	.align		6
+ENTRY(chacha_4block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 4 data blocks output, o
+	// x2: 4 data blocks input, i
+	// w3: nrounds
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+	adr		x9, CTRINC		// ... and ROT8
+	ld1		{v30.4s-v31.4s}, [x9]
+
+	// x0..15[0-3] = s0..3[0..3]
+	mov		x4, x0
+	ld4r		{ v0.4s- v3.4s}, [x4], #16
+	ld4r		{ v4.4s- v7.4s}, [x4], #16
+	ld4r		{ v8.4s-v11.4s}, [x4], #16
+	ld4r		{v12.4s-v15.4s}, [x4]
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+	rev32		v15.8h, v15.8h
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v16.16b, v4.16b, v8.16b
+	eor		v17.16b, v5.16b, v9.16b
+	eor		v18.16b, v6.16b, v10.16b
+	eor		v19.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v16.4s, #12
+	shl		v5.4s, v17.4s, #12
+	shl		v6.4s, v18.4s, #12
+	shl		v7.4s, v19.4s, #12
+
+	sri		v4.4s, v16.4s, #20
+	sri		v5.4s, v17.4s, #20
+	sri		v6.4s, v18.4s, #20
+	sri		v7.4s, v19.4s, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	tbl		v12.16b, {v12.16b}, v31.16b
+	tbl		v13.16b, {v13.16b}, v31.16b
+	tbl		v14.16b, {v14.16b}, v31.16b
+	tbl		v15.16b, {v15.16b}, v31.16b
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v16.16b, v4.16b, v8.16b
+	eor		v17.16b, v5.16b, v9.16b
+	eor		v18.16b, v6.16b, v10.16b
+	eor		v19.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v16.4s, #7
+	shl		v5.4s, v17.4s, #7
+	shl		v6.4s, v18.4s, #7
+	shl		v7.4s, v19.4s, #7
+
+	sri		v4.4s, v16.4s, #25
+	sri		v5.4s, v17.4s, #25
+	sri		v6.4s, v18.4s, #25
+	sri		v7.4s, v19.4s, #25
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	rev32		v15.8h, v15.8h
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v16.16b, v5.16b, v10.16b
+	eor		v17.16b, v6.16b, v11.16b
+	eor		v18.16b, v7.16b, v8.16b
+	eor		v19.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v16.4s, #12
+	shl		v6.4s, v17.4s, #12
+	shl		v7.4s, v18.4s, #12
+	shl		v4.4s, v19.4s, #12
+
+	sri		v5.4s, v16.4s, #20
+	sri		v6.4s, v17.4s, #20
+	sri		v7.4s, v18.4s, #20
+	sri		v4.4s, v19.4s, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	tbl		v15.16b, {v15.16b}, v31.16b
+	tbl		v12.16b, {v12.16b}, v31.16b
+	tbl		v13.16b, {v13.16b}, v31.16b
+	tbl		v14.16b, {v14.16b}, v31.16b
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v16.16b, v5.16b, v10.16b
+	eor		v17.16b, v6.16b, v11.16b
+	eor		v18.16b, v7.16b, v8.16b
+	eor		v19.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v16.4s, #7
+	shl		v6.4s, v17.4s, #7
+	shl		v7.4s, v18.4s, #7
+	shl		v4.4s, v19.4s, #7
+
+	sri		v5.4s, v16.4s, #25
+	sri		v6.4s, v17.4s, #25
+	sri		v7.4s, v18.4s, #25
+	sri		v4.4s, v19.4s, #25
+
+	subs		w3, w3, #2
+	b.ne		.Ldoubleround4
+
+	ld4r		{v16.4s-v19.4s}, [x0], #16
+	ld4r		{v20.4s-v23.4s}, [x0], #16
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+	add		v0.4s, v0.4s, v16.4s
+	add		v1.4s, v1.4s, v17.4s
+	add		v2.4s, v2.4s, v18.4s
+	add		v3.4s, v3.4s, v19.4s
+
+	ld4r		{v24.4s-v27.4s}, [x0], #16
+	ld4r		{v28.4s-v31.4s}, [x0]
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	add		v4.4s, v4.4s, v20.4s
+	add		v5.4s, v5.4s, v21.4s
+	add		v6.4s, v6.4s, v22.4s
+	add		v7.4s, v7.4s, v23.4s
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	add		v8.4s, v8.4s, v24.4s
+	add		v9.4s, v9.4s, v25.4s
+	add		v10.4s, v10.4s, v26.4s
+	add		v11.4s, v11.4s, v27.4s
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	add		v12.4s, v12.4s, v28.4s
+	add		v13.4s, v13.4s, v29.4s
+	add		v14.4s, v14.4s, v30.4s
+	add		v15.4s, v15.4s, v31.4s
+
+	// interleave 32-bit words in state n, n+1
+	zip1		v16.4s, v0.4s, v1.4s
+	zip2		v17.4s, v0.4s, v1.4s
+	zip1		v18.4s, v2.4s, v3.4s
+	zip2		v19.4s, v2.4s, v3.4s
+	zip1		v20.4s, v4.4s, v5.4s
+	zip2		v21.4s, v4.4s, v5.4s
+	zip1		v22.4s, v6.4s, v7.4s
+	zip2		v23.4s, v6.4s, v7.4s
+	zip1		v24.4s, v8.4s, v9.4s
+	zip2		v25.4s, v8.4s, v9.4s
+	zip1		v26.4s, v10.4s, v11.4s
+	zip2		v27.4s, v10.4s, v11.4s
+	zip1		v28.4s, v12.4s, v13.4s
+	zip2		v29.4s, v12.4s, v13.4s
+	zip1		v30.4s, v14.4s, v15.4s
+	zip2		v31.4s, v14.4s, v15.4s
+
+	// interleave 64-bit words in state n, n+2
+	zip1		v0.2d, v16.2d, v18.2d
+	zip2		v4.2d, v16.2d, v18.2d
+	zip1		v8.2d, v17.2d, v19.2d
+	zip2		v12.2d, v17.2d, v19.2d
+	ld1		{v16.16b-v19.16b}, [x2], #64
+
+	zip1		v1.2d, v20.2d, v22.2d
+	zip2		v5.2d, v20.2d, v22.2d
+	zip1		v9.2d, v21.2d, v23.2d
+	zip2		v13.2d, v21.2d, v23.2d
+	ld1		{v20.16b-v23.16b}, [x2], #64
+
+	zip1		v2.2d, v24.2d, v26.2d
+	zip2		v6.2d, v24.2d, v26.2d
+	zip1		v10.2d, v25.2d, v27.2d
+	zip2		v14.2d, v25.2d, v27.2d
+	ld1		{v24.16b-v27.16b}, [x2], #64
+
+	zip1		v3.2d, v28.2d, v30.2d
+	zip2		v7.2d, v28.2d, v30.2d
+	zip1		v11.2d, v29.2d, v31.2d
+	zip2		v15.2d, v29.2d, v31.2d
+	ld1		{v28.16b-v31.16b}, [x2]
+
+	// xor with corresponding input, write to output
+	eor		v16.16b, v16.16b, v0.16b
+	eor		v17.16b, v17.16b, v1.16b
+	eor		v18.16b, v18.16b, v2.16b
+	eor		v19.16b, v19.16b, v3.16b
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+	eor		v24.16b, v24.16b, v8.16b
+	eor		v25.16b, v25.16b, v9.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+	eor		v26.16b, v26.16b, v10.16b
+	eor		v27.16b, v27.16b, v11.16b
+	eor		v28.16b, v28.16b, v12.16b
+	st1		{v24.16b-v27.16b}, [x1], #64
+	eor		v29.16b, v29.16b, v13.16b
+	eor		v30.16b, v30.16b, v14.16b
+	eor		v31.16b, v31.16b, v15.16b
+	st1		{v28.16b-v31.16b}, [x1]
+
+	ret
+ENDPROC(chacha_4block_xor_neon)
+
+CTRINC:	.word		0, 1, 2, 3
+ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
new file mode 100644
index 000000000000..4d992029b912
--- /dev/null
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -0,0 +1,189 @@
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
+				       int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	u8 buf[CHACHA_BLOCK_SIZE];
+
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		kernel_neon_begin();
+		chacha_4block_xor_neon(state, dst, src, nrounds);
+		kernel_neon_end();
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+
+	if (!bytes)
+		return;
+
+	kernel_neon_begin();
+	while (bytes >= CHACHA_BLOCK_SIZE) {
+		chacha_block_xor_neon(state, dst, src, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE;
+		src += CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+	}
+	kernel_neon_end();
+}
+
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
+
+	return chacha_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_xchacha_crypt(req);
+
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_neon_begin();
+	hchacha_block_neon(state, subctx.key, ctx->nrounds);
+	kernel_neon_end();
+	subctx.nrounds = ctx->nrounds;
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	return chacha_neon_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_neon,
+		.decrypt		= chacha_neon,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
+	}
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S
deleted file mode 100644
index 0571e45a1a0a..000000000000
--- a/arch/arm64/crypto/chacha20-neon-core.S
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
-	.text
-	.align		6
-
-/*
- * chacha20_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers v0-v3.  It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * Clobbers: x3, x10, v4, v12
- */
-chacha20_permute:
-
-	mov		x3, #10
-	adr		x10, ROT8
-	ld1		{v12.4s}, [x10]
-
-.Ldoubleround:
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	add		v0.4s, v0.4s, v1.4s
-	eor		v3.16b, v3.16b, v0.16b
-	rev32		v3.8h, v3.8h
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	add		v2.4s, v2.4s, v3.4s
-	eor		v4.16b, v1.16b, v2.16b
-	shl		v1.4s, v4.4s, #12
-	sri		v1.4s, v4.4s, #20
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	add		v0.4s, v0.4s, v1.4s
-	eor		v3.16b, v3.16b, v0.16b
-	tbl		v3.16b, {v3.16b}, v12.16b
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	add		v2.4s, v2.4s, v3.4s
-	eor		v4.16b, v1.16b, v2.16b
-	shl		v1.4s, v4.4s, #7
-	sri		v1.4s, v4.4s, #25
-
-	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	ext		v1.16b, v1.16b, v1.16b, #4
-	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	ext		v2.16b, v2.16b, v2.16b, #8
-	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	ext		v3.16b, v3.16b, v3.16b, #12
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	add		v0.4s, v0.4s, v1.4s
-	eor		v3.16b, v3.16b, v0.16b
-	rev32		v3.8h, v3.8h
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	add		v2.4s, v2.4s, v3.4s
-	eor		v4.16b, v1.16b, v2.16b
-	shl		v1.4s, v4.4s, #12
-	sri		v1.4s, v4.4s, #20
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	add		v0.4s, v0.4s, v1.4s
-	eor		v3.16b, v3.16b, v0.16b
-	tbl		v3.16b, {v3.16b}, v12.16b
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	add		v2.4s, v2.4s, v3.4s
-	eor		v4.16b, v1.16b, v2.16b
-	shl		v1.4s, v4.4s, #7
-	sri		v1.4s, v4.4s, #25
-
-	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	ext		v1.16b, v1.16b, v1.16b, #12
-	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	ext		v2.16b, v2.16b, v2.16b, #8
-	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	ext		v3.16b, v3.16b, v3.16b, #4
-
-	subs		x3, x3, #1
-	b.ne		.Ldoubleround
-
-	ret
-ENDPROC(chacha20_permute)
-
-ENTRY(chacha20_block_xor_neon)
-	// x0: Input state matrix, s
-	// x1: 1 data block output, o
-	// x2: 1 data block input, i
-
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
-
-	// x0..3 = s0..3
-	ld1		{v0.4s-v3.4s}, [x0]
-	ld1		{v8.4s-v11.4s}, [x0]
-
-	bl		chacha20_permute
-
-	ld1		{v4.16b-v7.16b}, [x2]
-
-	// o0 = i0 ^ (x0 + s0)
-	add		v0.4s, v0.4s, v8.4s
-	eor		v0.16b, v0.16b, v4.16b
-
-	// o1 = i1 ^ (x1 + s1)
-	add		v1.4s, v1.4s, v9.4s
-	eor		v1.16b, v1.16b, v5.16b
-
-	// o2 = i2 ^ (x2 + s2)
-	add		v2.4s, v2.4s, v10.4s
-	eor		v2.16b, v2.16b, v6.16b
-
-	// o3 = i3 ^ (x3 + s3)
-	add		v3.4s, v3.4s, v11.4s
-	eor		v3.16b, v3.16b, v7.16b
-
-	st1		{v0.16b-v3.16b}, [x1]
-
-	ldp		x29, x30, [sp], #16
-	ret
-ENDPROC(chacha20_block_xor_neon)
-
-ENTRY(hchacha20_block_neon)
-	// x0: Input state matrix, s
-	// x1: output (8 32-bit words)
-
-	stp		x29, x30, [sp, #-16]!
-	mov		x29, sp
-
-	ld1		{v0.4s-v3.4s}, [x0]
-
-	bl		chacha20_permute
-
-	st1		{v0.16b}, [x1], #16
-	st1		{v3.16b}, [x1]
-
-	ldp		x29, x30, [sp], #16
-	ret
-ENDPROC(hchacha20_block_neon)
-
-	.align		6
-ENTRY(chacha20_4block_xor_neon)
-	// x0: Input state matrix, s
-	// x1: 4 data blocks output, o
-	// x2: 4 data blocks input, i
-
-	//
-	// This function encrypts four consecutive ChaCha20 blocks by loading
-	// the state matrix in NEON registers four times. The algorithm performs
-	// each operation on the corresponding word of each state matrix, hence
-	// requires no word shuffling. For final XORing step we transpose the
-	// matrix by interleaving 32- and then 64-bit words, which allows us to
-	// do XOR in NEON registers.
-	//
-	adr		x3, CTRINC		// ... and ROT8
-	ld1		{v30.4s-v31.4s}, [x3]
-
-	// x0..15[0-3] = s0..3[0..3]
-	mov		x4, x0
-	ld4r		{ v0.4s- v3.4s}, [x4], #16
-	ld4r		{ v4.4s- v7.4s}, [x4], #16
-	ld4r		{ v8.4s-v11.4s}, [x4], #16
-	ld4r		{v12.4s-v15.4s}, [x4]
-
-	// x12 += counter values 0-3
-	add		v12.4s, v12.4s, v30.4s
-
-	mov		x3, #10
-
-.Ldoubleround4:
-	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	add		v0.4s, v0.4s, v4.4s
-	add		v1.4s, v1.4s, v5.4s
-	add		v2.4s, v2.4s, v6.4s
-	add		v3.4s, v3.4s, v7.4s
-
-	eor		v12.16b, v12.16b, v0.16b
-	eor		v13.16b, v13.16b, v1.16b
-	eor		v14.16b, v14.16b, v2.16b
-	eor		v15.16b, v15.16b, v3.16b
-
-	rev32		v12.8h, v12.8h
-	rev32		v13.8h, v13.8h
-	rev32		v14.8h, v14.8h
-	rev32		v15.8h, v15.8h
-
-	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	add		v8.4s, v8.4s, v12.4s
-	add		v9.4s, v9.4s, v13.4s
-	add		v10.4s, v10.4s, v14.4s
-	add		v11.4s, v11.4s, v15.4s
-
-	eor		v16.16b, v4.16b, v8.16b
-	eor		v17.16b, v5.16b, v9.16b
-	eor		v18.16b, v6.16b, v10.16b
-	eor		v19.16b, v7.16b, v11.16b
-
-	shl		v4.4s, v16.4s, #12
-	shl		v5.4s, v17.4s, #12
-	shl		v6.4s, v18.4s, #12
-	shl		v7.4s, v19.4s, #12
-
-	sri		v4.4s, v16.4s, #20
-	sri		v5.4s, v17.4s, #20
-	sri		v6.4s, v18.4s, #20
-	sri		v7.4s, v19.4s, #20
-
-	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	add		v0.4s, v0.4s, v4.4s
-	add		v1.4s, v1.4s, v5.4s
-	add		v2.4s, v2.4s, v6.4s
-	add		v3.4s, v3.4s, v7.4s
-
-	eor		v12.16b, v12.16b, v0.16b
-	eor		v13.16b, v13.16b, v1.16b
-	eor		v14.16b, v14.16b, v2.16b
-	eor		v15.16b, v15.16b, v3.16b
-
-	tbl		v12.16b, {v12.16b}, v31.16b
-	tbl		v13.16b, {v13.16b}, v31.16b
-	tbl		v14.16b, {v14.16b}, v31.16b
-	tbl		v15.16b, {v15.16b}, v31.16b
-
-	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	add		v8.4s, v8.4s, v12.4s
-	add		v9.4s, v9.4s, v13.4s
-	add		v10.4s, v10.4s, v14.4s
-	add		v11.4s, v11.4s, v15.4s
-
-	eor		v16.16b, v4.16b, v8.16b
-	eor		v17.16b, v5.16b, v9.16b
-	eor		v18.16b, v6.16b, v10.16b
-	eor		v19.16b, v7.16b, v11.16b
-
-	shl		v4.4s, v16.4s, #7
-	shl		v5.4s, v17.4s, #7
-	shl		v6.4s, v18.4s, #7
-	shl		v7.4s, v19.4s, #7
-
-	sri		v4.4s, v16.4s, #25
-	sri		v5.4s, v17.4s, #25
-	sri		v6.4s, v18.4s, #25
-	sri		v7.4s, v19.4s, #25
-
-	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	add		v0.4s, v0.4s, v5.4s
-	add		v1.4s, v1.4s, v6.4s
-	add		v2.4s, v2.4s, v7.4s
-	add		v3.4s, v3.4s, v4.4s
-
-	eor		v15.16b, v15.16b, v0.16b
-	eor		v12.16b, v12.16b, v1.16b
-	eor		v13.16b, v13.16b, v2.16b
-	eor		v14.16b, v14.16b, v3.16b
-
-	rev32		v15.8h, v15.8h
-	rev32		v12.8h, v12.8h
-	rev32		v13.8h, v13.8h
-	rev32		v14.8h, v14.8h
-
-	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	add		v10.4s, v10.4s, v15.4s
-	add		v11.4s, v11.4s, v12.4s
-	add		v8.4s, v8.4s, v13.4s
-	add		v9.4s, v9.4s, v14.4s
-
-	eor		v16.16b, v5.16b, v10.16b
-	eor		v17.16b, v6.16b, v11.16b
-	eor		v18.16b, v7.16b, v8.16b
-	eor		v19.16b, v4.16b, v9.16b
-
-	shl		v5.4s, v16.4s, #12
-	shl		v6.4s, v17.4s, #12
-	shl		v7.4s, v18.4s, #12
-	shl		v4.4s, v19.4s, #12
-
-	sri		v5.4s, v16.4s, #20
-	sri		v6.4s, v17.4s, #20
-	sri		v7.4s, v18.4s, #20
-	sri		v4.4s, v19.4s, #20
-
-	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	add		v0.4s, v0.4s, v5.4s
-	add		v1.4s, v1.4s, v6.4s
-	add		v2.4s, v2.4s, v7.4s
-	add		v3.4s, v3.4s, v4.4s
-
-	eor		v15.16b, v15.16b, v0.16b
-	eor		v12.16b, v12.16b, v1.16b
-	eor		v13.16b, v13.16b, v2.16b
-	eor		v14.16b, v14.16b, v3.16b
-
-	tbl		v15.16b, {v15.16b}, v31.16b
-	tbl		v12.16b, {v12.16b}, v31.16b
-	tbl		v13.16b, {v13.16b}, v31.16b
-	tbl		v14.16b, {v14.16b}, v31.16b
-
-	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	add		v10.4s, v10.4s, v15.4s
-	add		v11.4s, v11.4s, v12.4s
-	add		v8.4s, v8.4s, v13.4s
-	add		v9.4s, v9.4s, v14.4s
-
-	eor		v16.16b, v5.16b, v10.16b
-	eor		v17.16b, v6.16b, v11.16b
-	eor		v18.16b, v7.16b, v8.16b
-	eor		v19.16b, v4.16b, v9.16b
-
-	shl		v5.4s, v16.4s, #7
-	shl		v6.4s, v17.4s, #7
-	shl		v7.4s, v18.4s, #7
-	shl		v4.4s, v19.4s, #7
-
-	sri		v5.4s, v16.4s, #25
-	sri		v6.4s, v17.4s, #25
-	sri		v7.4s, v18.4s, #25
-	sri		v4.4s, v19.4s, #25
-
-	subs		x3, x3, #1
-	b.ne		.Ldoubleround4
-
-	ld4r		{v16.4s-v19.4s}, [x0], #16
-	ld4r		{v20.4s-v23.4s}, [x0], #16
-
-	// x12 += counter values 0-3
-	add		v12.4s, v12.4s, v30.4s
-
-	// x0[0-3] += s0[0]
-	// x1[0-3] += s0[1]
-	// x2[0-3] += s0[2]
-	// x3[0-3] += s0[3]
-	add		v0.4s, v0.4s, v16.4s
-	add		v1.4s, v1.4s, v17.4s
-	add		v2.4s, v2.4s, v18.4s
-	add		v3.4s, v3.4s, v19.4s
-
-	ld4r		{v24.4s-v27.4s}, [x0], #16
-	ld4r		{v28.4s-v31.4s}, [x0]
-
-	// x4[0-3] += s1[0]
-	// x5[0-3] += s1[1]
-	// x6[0-3] += s1[2]
-	// x7[0-3] += s1[3]
-	add		v4.4s, v4.4s, v20.4s
-	add		v5.4s, v5.4s, v21.4s
-	add		v6.4s, v6.4s, v22.4s
-	add		v7.4s, v7.4s, v23.4s
-
-	// x8[0-3] += s2[0]
-	// x9[0-3] += s2[1]
-	// x10[0-3] += s2[2]
-	// x11[0-3] += s2[3]
-	add		v8.4s, v8.4s, v24.4s
-	add		v9.4s, v9.4s, v25.4s
-	add		v10.4s, v10.4s, v26.4s
-	add		v11.4s, v11.4s, v27.4s
-
-	// x12[0-3] += s3[0]
-	// x13[0-3] += s3[1]
-	// x14[0-3] += s3[2]
-	// x15[0-3] += s3[3]
-	add		v12.4s, v12.4s, v28.4s
-	add		v13.4s, v13.4s, v29.4s
-	add		v14.4s, v14.4s, v30.4s
-	add		v15.4s, v15.4s, v31.4s
-
-	// interleave 32-bit words in state n, n+1
-	zip1		v16.4s, v0.4s, v1.4s
-	zip2		v17.4s, v0.4s, v1.4s
-	zip1		v18.4s, v2.4s, v3.4s
-	zip2		v19.4s, v2.4s, v3.4s
-	zip1		v20.4s, v4.4s, v5.4s
-	zip2		v21.4s, v4.4s, v5.4s
-	zip1		v22.4s, v6.4s, v7.4s
-	zip2		v23.4s, v6.4s, v7.4s
-	zip1		v24.4s, v8.4s, v9.4s
-	zip2		v25.4s, v8.4s, v9.4s
-	zip1		v26.4s, v10.4s, v11.4s
-	zip2		v27.4s, v10.4s, v11.4s
-	zip1		v28.4s, v12.4s, v13.4s
-	zip2		v29.4s, v12.4s, v13.4s
-	zip1		v30.4s, v14.4s, v15.4s
-	zip2		v31.4s, v14.4s, v15.4s
-
-	// interleave 64-bit words in state n, n+2
-	zip1		v0.2d, v16.2d, v18.2d
-	zip2		v4.2d, v16.2d, v18.2d
-	zip1		v8.2d, v17.2d, v19.2d
-	zip2		v12.2d, v17.2d, v19.2d
-	ld1		{v16.16b-v19.16b}, [x2], #64
-
-	zip1		v1.2d, v20.2d, v22.2d
-	zip2		v5.2d, v20.2d, v22.2d
-	zip1		v9.2d, v21.2d, v23.2d
-	zip2		v13.2d, v21.2d, v23.2d
-	ld1		{v20.16b-v23.16b}, [x2], #64
-
-	zip1		v2.2d, v24.2d, v26.2d
-	zip2		v6.2d, v24.2d, v26.2d
-	zip1		v10.2d, v25.2d, v27.2d
-	zip2		v14.2d, v25.2d, v27.2d
-	ld1		{v24.16b-v27.16b}, [x2], #64
-
-	zip1		v3.2d, v28.2d, v30.2d
-	zip2		v7.2d, v28.2d, v30.2d
-	zip1		v11.2d, v29.2d, v31.2d
-	zip2		v15.2d, v29.2d, v31.2d
-	ld1		{v28.16b-v31.16b}, [x2]
-
-	// xor with corresponding input, write to output
-	eor		v16.16b, v16.16b, v0.16b
-	eor		v17.16b, v17.16b, v1.16b
-	eor		v18.16b, v18.16b, v2.16b
-	eor		v19.16b, v19.16b, v3.16b
-	eor		v20.16b, v20.16b, v4.16b
-	eor		v21.16b, v21.16b, v5.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
-	eor		v22.16b, v22.16b, v6.16b
-	eor		v23.16b, v23.16b, v7.16b
-	eor		v24.16b, v24.16b, v8.16b
-	eor		v25.16b, v25.16b, v9.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
-	eor		v26.16b, v26.16b, v10.16b
-	eor		v27.16b, v27.16b, v11.16b
-	eor		v28.16b, v28.16b, v12.16b
-	st1		{v24.16b-v27.16b}, [x1], #64
-	eor		v29.16b, v29.16b, v13.16b
-	eor		v30.16b, v30.16b, v14.16b
-	eor		v31.16b, v31.16b, v15.16b
-	st1		{v28.16b-v31.16b}, [x1]
-
-	ret
-ENDPROC(chacha20_4block_xor_neon)
-
-CTRINC:	.word		0, 1, 2, 3
-ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
deleted file mode 100644
index a5b9cbc0c4de..000000000000
--- a/arch/arm64/crypto/chacha20-neon-glue.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-	u8 buf[CHACHA_BLOCK_SIZE];
-
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		kernel_neon_begin();
-		chacha20_4block_xor_neon(state, dst, src);
-		kernel_neon_end();
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-
-	if (!bytes)
-		return;
-
-	kernel_neon_begin();
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha20_block_xor_neon(state, dst, src);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_neon(state, buf, buf);
-		memcpy(dst, buf, bytes);
-	}
-	kernel_neon_end();
-}
-
-static int chacha20_neon_stream_xor(struct skcipher_request *req,
-				    struct chacha_ctx *ctx, u8 *iv)
-{
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	crypto_chacha_init(state, ctx, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int chacha20_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha_crypt(req);
-
-	return chacha20_neon_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha20_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 state[16];
-	u8 real_iv[16];
-
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
-		return crypto_xchacha_crypt(req);
-
-	crypto_chacha_init(state, ctx, req->iv);
-
-	kernel_neon_begin();
-	hchacha20_block_neon(state, subctx.key);
-	kernel_neon_end();
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha20_neon_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= chacha20_neon,
-		.decrypt		= chacha20_neon,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-neon",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= xchacha20_neon,
-		.decrypt		= xchacha20_neon,
-	}
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_ASIMD))
-		return -ENODEV;
-
-	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-neon");
-- 
cgit v1.2.3-59-g8ed1b


From 19c11c97c39f5c6280b4d523ea170ef9a8f7ed12 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 3 Dec 2018 19:52:52 -0800
Subject: crypto: arm64/chacha - add XChaCha12 support

Now that the ARM64 NEON implementation of ChaCha20 and XChaCha20 has
been refactored to support varying the number of rounds, add support for
XChaCha12.  This is identical to XChaCha20 except for the number of
rounds, which is 12 instead of 20.  This can be used by Adiantum.

Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/Kconfig            |  2 +-
 arch/arm64/crypto/chacha-neon-glue.c | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d54ddb8468ef..d9a523ecdd83 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	select CRYPTO_SIMD
 
 config CRYPTO_CHACHA20_NEON
-	tristate "ChaCha20 and XChaCha20 stream ciphers using NEON instructions"
+	tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 4d992029b912..346eb85498a1 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -161,6 +161,22 @@ static struct skcipher_alg algs[] = {
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= xchacha_neon,
 		.decrypt		= xchacha_neon,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
 	}
 };
 
@@ -187,3 +203,5 @@ MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-neon");
 MODULE_ALIAS_CRYPTO("xchacha20");
 MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
-- 
cgit v1.2.3-59-g8ed1b


From cf718eaa8f9b2cb8a372dcfd5ef701188e233558 Mon Sep 17 00:00:00 2001
From: "Srikanth, Jampala" <Jampala.Srikanth@cavium.com>
Date: Tue, 4 Dec 2018 12:55:54 +0000
Subject: crypto: cavium/nitrox - Enabled Mailbox support

Enabled the PF->VF Mailbox support. Mailbox message are interpreted
as {type, opcode, data}. Supported message types are REQ, ACK and NACK.

Signed-off-by: Srikanth Jampala <Jampala.Srikanth@cavium.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/Makefile         |   3 +-
 drivers/crypto/cavium/nitrox/nitrox_csr.h     |  12 +-
 drivers/crypto/cavium/nitrox/nitrox_debugfs.h |  22 +++
 drivers/crypto/cavium/nitrox/nitrox_dev.h     |  61 ++++++--
 drivers/crypto/cavium/nitrox/nitrox_hal.c     | 114 ++++++++++----
 drivers/crypto/cavium/nitrox/nitrox_hal.h     |   2 +
 drivers/crypto/cavium/nitrox/nitrox_isr.c     |   8 +-
 drivers/crypto/cavium/nitrox/nitrox_main.c    |   3 +-
 drivers/crypto/cavium/nitrox/nitrox_mbx.c     | 204 ++++++++++++++++++++++++++
 drivers/crypto/cavium/nitrox/nitrox_mbx.h     |   9 ++
 drivers/crypto/cavium/nitrox/nitrox_sriov.c   |  57 ++++++-
 11 files changed, 441 insertions(+), 54 deletions(-)
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_debugfs.h
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.c
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_mbx.h

diff --git a/drivers/crypto/cavium/nitrox/Makefile b/drivers/crypto/cavium/nitrox/Makefile
index e12954791673..ad0546630ad8 100644
--- a/drivers/crypto/cavium/nitrox/Makefile
+++ b/drivers/crypto/cavium/nitrox/Makefile
@@ -6,7 +6,8 @@ n5pf-objs := nitrox_main.o \
 	nitrox_lib.o \
 	nitrox_hal.o \
 	nitrox_reqmgr.o \
-	nitrox_algs.o
+	nitrox_algs.o	\
+	nitrox_mbx.o
 
 n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o
 n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o
diff --git a/drivers/crypto/cavium/nitrox/nitrox_csr.h b/drivers/crypto/cavium/nitrox/nitrox_csr.h
index 1ad27b1a87c5..a2a452642b38 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_csr.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_csr.h
@@ -54,7 +54,13 @@
 #define NPS_STATS_PKT_DMA_WR_CNT	0x1000190
 
 /* NPS packet registers */
-#define NPS_PKT_INT				0x1040018
+#define NPS_PKT_INT			0x1040018
+#define NPS_PKT_MBOX_INT_LO		0x1040020
+#define NPS_PKT_MBOX_INT_LO_ENA_W1C	0x1040030
+#define NPS_PKT_MBOX_INT_LO_ENA_W1S	0x1040038
+#define NPS_PKT_MBOX_INT_HI		0x1040040
+#define NPS_PKT_MBOX_INT_HI_ENA_W1C	0x1040050
+#define NPS_PKT_MBOX_INT_HI_ENA_W1S	0x1040058
 #define NPS_PKT_IN_RERR_HI		0x1040108
 #define NPS_PKT_IN_RERR_HI_ENA_W1S	0x1040120
 #define NPS_PKT_IN_RERR_LO		0x1040128
@@ -74,6 +80,10 @@
 #define NPS_PKT_SLC_RERR_LO_ENA_W1S	0x1040240
 #define NPS_PKT_SLC_ERR_TYPE		0x1040248
 #define NPS_PKT_SLC_ERR_TYPE_ENA_W1S	0x1040260
+/* Mailbox PF->VF PF Accessible Data registers */
+#define NPS_PKT_MBOX_PF_VF_PFDATAX(_i)	(0x1040800 + ((_i) * 0x8))
+#define NPS_PKT_MBOX_VF_PF_PFDATAX(_i)	(0x1040C00 + ((_i) * 0x8))
+
 #define NPS_PKT_SLC_CTLX(_i)		(0x10000 + ((_i) * 0x40000))
 #define NPS_PKT_SLC_CNTSX(_i)		(0x10008 + ((_i) * 0x40000))
 #define NPS_PKT_SLC_INT_LEVELSX(_i)	(0x10010 + ((_i) * 0x40000))
diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
new file mode 100644
index 000000000000..7b701ea6227a
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __NITROX_DEBUGFS_H
+#define __NITROX_DEBUGFS_H
+
+#include "nitrox_dev.h"
+
+#ifdef CONFIG_DEBUG_FS
+int nitrox_debugfs_init(struct nitrox_device *ndev);
+void nitrox_debugfs_exit(struct nitrox_device *ndev);
+#else
+static inline int nitrox_debugfs_init(struct nitrox_device *ndev)
+{
+	return 0;
+}
+
+static inline int nitrox_sriov_debugfs_init(struct nitrox_device *ndev)
+{
+	return 0;
+}
+#endif /* !CONFIG_DEBUG_FS */
+
+#endif /* __NITROX_DEBUGFS_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h b/drivers/crypto/cavium/nitrox/nitrox_dev.h
index 247df32f687c..0338877b828f 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_dev.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h
@@ -8,6 +8,8 @@
 #include <linux/if.h>
 
 #define VERSION_LEN 32
+/* Maximum queues in PF mode */
+#define MAX_PF_QUEUES	64
 
 /**
  * struct nitrox_cmdq - NITROX command queue
@@ -103,13 +105,58 @@ struct nitrox_q_vector {
 	};
 };
 
+/**
+ * mbox_msg - Mailbox message data
+ * @type: message type
+ * @opcode: message opcode
+ * @data: message data
+ */
+union mbox_msg {
+	u64 value;
+	struct {
+		u64 type: 2;
+		u64 opcode: 6;
+		u64 data: 58;
+	};
+	struct {
+		u64 type: 2;
+		u64 opcode: 6;
+		u64 chipid: 8;
+		u64 vfid: 8;
+	} id;
+};
+
+/**
+ * nitrox_vfdev - NITROX VF device instance in PF
+ * @state: VF device state
+ * @vfno: VF number
+ * @nr_queues: number of queues enabled in VF
+ * @ring: ring to communicate with VF
+ * @msg: Mailbox message data from VF
+ * @mbx_resp: Mailbox counters
+ */
+struct nitrox_vfdev {
+	atomic_t state;
+	int vfno;
+	int nr_queues;
+	int ring;
+	union mbox_msg msg;
+	atomic64_t mbx_resp;
+};
+
 /**
  * struct nitrox_iov - SR-IOV information
  * @num_vfs: number of VF(s) enabled
- * @msix: MSI-X for PF in SR-IOV case
+ * @max_vf_queues: Maximum number of queues allowed for VF
+ * @vfdev: VF(s) devices
+ * @pf2vf_wq: workqueue for PF2VF communication
+ * @msix: MSI-X entry for PF in SR-IOV case
  */
 struct nitrox_iov {
 	int num_vfs;
+	int max_vf_queues;
+	struct nitrox_vfdev *vfdev;
+	struct workqueue_struct *pf2vf_wq;
 	struct msix_entry msix;
 };
 
@@ -226,17 +273,9 @@ static inline bool nitrox_ready(struct nitrox_device *ndev)
 	return atomic_read(&ndev->state) == __NDEV_READY;
 }
 
-#ifdef CONFIG_DEBUG_FS
-int nitrox_debugfs_init(struct nitrox_device *ndev);
-void nitrox_debugfs_exit(struct nitrox_device *ndev);
-#else
-static inline int nitrox_debugfs_init(struct nitrox_device *ndev)
+static inline bool nitrox_vfdev_ready(struct nitrox_vfdev *vfdev)
 {
-	return 0;
+	return atomic_read(&vfdev->state) == __NDEV_READY;
 }
 
-static inline void nitrox_debugfs_exit(struct nitrox_device *ndev)
-{ }
-#endif
-
 #endif /* __NITROX_DEV_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_hal.c b/drivers/crypto/cavium/nitrox/nitrox_hal.c
index a9b82387cf53..c08d9f33a3b1 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_hal.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_hal.c
@@ -5,10 +5,11 @@
 #include "nitrox_csr.h"
 
 #define PLL_REF_CLK 50
+#define MAX_CSR_RETRIES 10
 
 /**
  * emu_enable_cores - Enable EMU cluster cores.
- * @ndev: N5 device
+ * @ndev: NITROX device
  */
 static void emu_enable_cores(struct nitrox_device *ndev)
 {
@@ -33,7 +34,7 @@ static void emu_enable_cores(struct nitrox_device *ndev)
 
 /**
  * nitrox_config_emu_unit - configure EMU unit.
- * @ndev: N5 device
+ * @ndev: NITROX device
  */
 void nitrox_config_emu_unit(struct nitrox_device *ndev)
 {
@@ -63,29 +64,26 @@ void nitrox_config_emu_unit(struct nitrox_device *ndev)
 static void reset_pkt_input_ring(struct nitrox_device *ndev, int ring)
 {
 	union nps_pkt_in_instr_ctl pkt_in_ctl;
-	union nps_pkt_in_instr_baoff_dbell pkt_in_dbell;
 	union nps_pkt_in_done_cnts pkt_in_cnts;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
+	/* step 1: disable the ring, clear enable bit */
 	offset = NPS_PKT_IN_INSTR_CTLX(ring);
-	/* disable the ring */
 	pkt_in_ctl.value = nitrox_read_csr(ndev, offset);
 	pkt_in_ctl.s.enb = 0;
 	nitrox_write_csr(ndev, offset, pkt_in_ctl.value);
-	usleep_range(100, 150);
 
-	/* wait to clear [ENB] */
+	/* step 2: wait to clear [ENB] */
+	usleep_range(100, 150);
 	do {
 		pkt_in_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (pkt_in_ctl.s.enb);
-
-	/* clear off door bell counts */
-	offset = NPS_PKT_IN_INSTR_BAOFF_DBELLX(ring);
-	pkt_in_dbell.value = 0;
-	pkt_in_dbell.s.dbell = 0xffffffff;
-	nitrox_write_csr(ndev, offset, pkt_in_dbell.value);
+		if (!pkt_in_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 
-	/* clear done counts */
+	/* step 3: clear done counts */
 	offset = NPS_PKT_IN_DONE_CNTSX(ring);
 	pkt_in_cnts.value = nitrox_read_csr(ndev, offset);
 	nitrox_write_csr(ndev, offset, pkt_in_cnts.value);
@@ -95,6 +93,7 @@ static void reset_pkt_input_ring(struct nitrox_device *ndev, int ring)
 void enable_pkt_input_ring(struct nitrox_device *ndev, int ring)
 {
 	union nps_pkt_in_instr_ctl pkt_in_ctl;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
 	/* 64-byte instruction size */
@@ -107,12 +106,15 @@ void enable_pkt_input_ring(struct nitrox_device *ndev, int ring)
 	/* wait for set [ENB] */
 	do {
 		pkt_in_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (!pkt_in_ctl.s.enb);
+		if (pkt_in_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 }
 
 /**
  * nitrox_config_pkt_input_rings - configure Packet Input Rings
- * @ndev: N5 device
+ * @ndev: NITROX device
  */
 void nitrox_config_pkt_input_rings(struct nitrox_device *ndev)
 {
@@ -121,11 +123,14 @@ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev)
 	for (i = 0; i < ndev->nr_queues; i++) {
 		struct nitrox_cmdq *cmdq = &ndev->pkt_inq[i];
 		union nps_pkt_in_instr_rsize pkt_in_rsize;
+		union nps_pkt_in_instr_baoff_dbell pkt_in_dbell;
 		u64 offset;
 
 		reset_pkt_input_ring(ndev, i);
 
-		/* configure ring base address 16-byte aligned,
+		/**
+		 * step 4:
+		 * configure ring base address 16-byte aligned,
 		 * size and interrupt threshold.
 		 */
 		offset = NPS_PKT_IN_INSTR_BADDRX(i);
@@ -141,6 +146,13 @@ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev)
 		offset = NPS_PKT_IN_INT_LEVELSX(i);
 		nitrox_write_csr(ndev, offset, 0xffffffff);
 
+		/* step 5: clear off door bell counts */
+		offset = NPS_PKT_IN_INSTR_BAOFF_DBELLX(i);
+		pkt_in_dbell.value = 0;
+		pkt_in_dbell.s.dbell = 0xffffffff;
+		nitrox_write_csr(ndev, offset, pkt_in_dbell.value);
+
+		/* enable the ring */
 		enable_pkt_input_ring(ndev, i);
 	}
 }
@@ -149,21 +161,26 @@ static void reset_pkt_solicit_port(struct nitrox_device *ndev, int port)
 {
 	union nps_pkt_slc_ctl pkt_slc_ctl;
 	union nps_pkt_slc_cnts pkt_slc_cnts;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
-	/* disable slc port */
+	/* step 1: disable slc port */
 	offset = NPS_PKT_SLC_CTLX(port);
 	pkt_slc_ctl.value = nitrox_read_csr(ndev, offset);
 	pkt_slc_ctl.s.enb = 0;
 	nitrox_write_csr(ndev, offset, pkt_slc_ctl.value);
-	usleep_range(100, 150);
 
+	/* step 2 */
+	usleep_range(100, 150);
 	/* wait to clear [ENB] */
 	do {
 		pkt_slc_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (pkt_slc_ctl.s.enb);
+		if (!pkt_slc_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 
-	/* clear slc counters */
+	/* step 3: clear slc counters */
 	offset = NPS_PKT_SLC_CNTSX(port);
 	pkt_slc_cnts.value = nitrox_read_csr(ndev, offset);
 	nitrox_write_csr(ndev, offset, pkt_slc_cnts.value);
@@ -173,12 +190,12 @@ static void reset_pkt_solicit_port(struct nitrox_device *ndev, int port)
 void enable_pkt_solicit_port(struct nitrox_device *ndev, int port)
 {
 	union nps_pkt_slc_ctl pkt_slc_ctl;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
 	offset = NPS_PKT_SLC_CTLX(port);
 	pkt_slc_ctl.value = 0;
 	pkt_slc_ctl.s.enb = 1;
-
 	/*
 	 * 8 trailing 0x00 bytes will be added
 	 * to the end of the outgoing packet.
@@ -191,23 +208,27 @@ void enable_pkt_solicit_port(struct nitrox_device *ndev, int port)
 	/* wait to set [ENB] */
 	do {
 		pkt_slc_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (!pkt_slc_ctl.s.enb);
+		if (pkt_slc_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 }
 
-static void config_single_pkt_solicit_port(struct nitrox_device *ndev,
-					   int port)
+static void config_pkt_solicit_port(struct nitrox_device *ndev, int port)
 {
 	union nps_pkt_slc_int_levels pkt_slc_int;
 	u64 offset;
 
 	reset_pkt_solicit_port(ndev, port);
 
+	/* step 4: configure interrupt levels */
 	offset = NPS_PKT_SLC_INT_LEVELSX(port);
 	pkt_slc_int.value = 0;
 	/* time interrupt threshold */
 	pkt_slc_int.s.timet = 0x3fffff;
 	nitrox_write_csr(ndev, offset, pkt_slc_int.value);
 
+	/* enable the solicit port */
 	enable_pkt_solicit_port(ndev, port);
 }
 
@@ -216,12 +237,12 @@ void nitrox_config_pkt_solicit_ports(struct nitrox_device *ndev)
 	int i;
 
 	for (i = 0; i < ndev->nr_queues; i++)
-		config_single_pkt_solicit_port(ndev, i);
+		config_pkt_solicit_port(ndev, i);
 }
 
 /**
  * enable_nps_interrupts - enable NPS interrutps
- * @ndev: N5 device.
+ * @ndev: NITROX device.
  *
  * This includes NPS core, packet in and slc interrupts.
  */
@@ -284,8 +305,8 @@ void nitrox_config_pom_unit(struct nitrox_device *ndev)
 }
 
 /**
- * nitrox_config_rand_unit - enable N5 random number unit
- * @ndev: N5 device
+ * nitrox_config_rand_unit - enable NITROX random number unit
+ * @ndev: NITROX device
  */
 void nitrox_config_rand_unit(struct nitrox_device *ndev)
 {
@@ -361,6 +382,7 @@ void invalidate_lbc(struct nitrox_device *ndev)
 {
 	union lbc_inval_ctl lbc_ctl;
 	union lbc_inval_status lbc_stat;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
 	/* invalidate LBC */
@@ -370,10 +392,12 @@ void invalidate_lbc(struct nitrox_device *ndev)
 	nitrox_write_csr(ndev, offset, lbc_ctl.value);
 
 	offset = LBC_INVAL_STATUS;
-
 	do {
 		lbc_stat.value = nitrox_read_csr(ndev, offset);
-	} while (!lbc_stat.s.done);
+		if (lbc_stat.s.done)
+			break;
+		udelay(50);
+	} while (max_retries--);
 }
 
 void nitrox_config_lbc_unit(struct nitrox_device *ndev)
@@ -467,3 +491,31 @@ void nitrox_get_hwinfo(struct nitrox_device *ndev)
 	/* copy partname */
 	strncpy(ndev->hw.partname, name, sizeof(ndev->hw.partname));
 }
+
+void enable_pf2vf_mbox_interrupts(struct nitrox_device *ndev)
+{
+	u64 value = ~0ULL;
+	u64 reg_addr;
+
+	/* Mailbox interrupt low enable set register */
+	reg_addr = NPS_PKT_MBOX_INT_LO_ENA_W1S;
+	nitrox_write_csr(ndev, reg_addr, value);
+
+	/* Mailbox interrupt high enable set register */
+	reg_addr = NPS_PKT_MBOX_INT_HI_ENA_W1S;
+	nitrox_write_csr(ndev, reg_addr, value);
+}
+
+void disable_pf2vf_mbox_interrupts(struct nitrox_device *ndev)
+{
+	u64 value = ~0ULL;
+	u64 reg_addr;
+
+	/* Mailbox interrupt low enable clear register */
+	reg_addr = NPS_PKT_MBOX_INT_LO_ENA_W1C;
+	nitrox_write_csr(ndev, reg_addr, value);
+
+	/* Mailbox interrupt high enable clear register */
+	reg_addr = NPS_PKT_MBOX_INT_HI_ENA_W1C;
+	nitrox_write_csr(ndev, reg_addr, value);
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_hal.h b/drivers/crypto/cavium/nitrox/nitrox_hal.h
index 489ee64c119e..d6606418ba38 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_hal.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_hal.h
@@ -19,5 +19,7 @@ void enable_pkt_input_ring(struct nitrox_device *ndev, int ring);
 void enable_pkt_solicit_port(struct nitrox_device *ndev, int port);
 void config_nps_core_vfcfg_mode(struct nitrox_device *ndev, enum vf_mode mode);
 void nitrox_get_hwinfo(struct nitrox_device *ndev);
+void enable_pf2vf_mbox_interrupts(struct nitrox_device *ndev);
+void disable_pf2vf_mbox_interrupts(struct nitrox_device *ndev);
 
 #endif /* __NITROX_HAL_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.c b/drivers/crypto/cavium/nitrox/nitrox_isr.c
index c5b797808680..3dec570a190a 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_isr.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_isr.c
@@ -7,6 +7,7 @@
 #include "nitrox_csr.h"
 #include "nitrox_common.h"
 #include "nitrox_hal.h"
+#include "nitrox_mbx.h"
 
 /**
  * One vector for each type of ring
@@ -220,7 +221,8 @@ static void nps_core_int_tasklet(unsigned long data)
  */
 static irqreturn_t nps_core_int_isr(int irq, void *data)
 {
-	struct nitrox_device *ndev = data;
+	struct nitrox_q_vector *qvec = data;
+	struct nitrox_device *ndev = qvec->ndev;
 	union nps_core_int_active core_int;
 
 	core_int.value = nitrox_read_csr(ndev, NPS_CORE_INT_ACTIVE);
@@ -246,6 +248,10 @@ static irqreturn_t nps_core_int_isr(int irq, void *data)
 	if (core_int.s.bmi)
 		clear_bmi_err_intr(ndev);
 
+	/* Mailbox interrupt */
+	if (core_int.s.mbox)
+		nitrox_pf2vf_mbox_handler(ndev);
+
 	/* If more work callback the ISR, set resend */
 	core_int.s.resend = 1;
 	nitrox_write_csr(ndev, NPS_CORE_INT_ACTIVE, core_int.value);
diff --git a/drivers/crypto/cavium/nitrox/nitrox_main.c b/drivers/crypto/cavium/nitrox/nitrox_main.c
index 6595c95af9f1..014e9863c20e 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_main.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_main.c
@@ -1,6 +1,5 @@
 #include <linux/aer.h>
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/firmware.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -13,9 +12,9 @@
 #include "nitrox_csr.h"
 #include "nitrox_hal.h"
 #include "nitrox_isr.h"
+#include "nitrox_debugfs.h"
 
 #define CNN55XX_DEV_ID	0x12
-#define MAX_PF_QUEUES	64
 #define UCODE_HLEN 48
 #define SE_GROUP 0
 
diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.c b/drivers/crypto/cavium/nitrox/nitrox_mbx.c
new file mode 100644
index 000000000000..02ee95064841
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/workqueue.h>
+
+#include "nitrox_csr.h"
+#include "nitrox_hal.h"
+#include "nitrox_dev.h"
+
+#define RING_TO_VFNO(_x, _y)	((_x) / (_y))
+
+/**
+ * mbx_msg_type - Mailbox message types
+ */
+enum mbx_msg_type {
+	MBX_MSG_TYPE_NOP,
+	MBX_MSG_TYPE_REQ,
+	MBX_MSG_TYPE_ACK,
+	MBX_MSG_TYPE_NACK,
+};
+
+/**
+ * mbx_msg_opcode - Mailbox message opcodes
+ */
+enum mbx_msg_opcode {
+	MSG_OP_VF_MODE = 1,
+	MSG_OP_VF_UP,
+	MSG_OP_VF_DOWN,
+	MSG_OP_CHIPID_VFID,
+};
+
+struct pf2vf_work {
+	struct nitrox_vfdev *vfdev;
+	struct nitrox_device *ndev;
+	struct work_struct pf2vf_resp;
+};
+
+static inline u64 pf2vf_read_mbox(struct nitrox_device *ndev, int ring)
+{
+	u64 reg_addr;
+
+	reg_addr = NPS_PKT_MBOX_VF_PF_PFDATAX(ring);
+	return nitrox_read_csr(ndev, reg_addr);
+}
+
+static inline void pf2vf_write_mbox(struct nitrox_device *ndev, u64 value,
+				    int ring)
+{
+	u64 reg_addr;
+
+	reg_addr = NPS_PKT_MBOX_PF_VF_PFDATAX(ring);
+	nitrox_write_csr(ndev, reg_addr, value);
+}
+
+static void pf2vf_send_response(struct nitrox_device *ndev,
+				struct nitrox_vfdev *vfdev)
+{
+	union mbox_msg msg;
+
+	msg.value = vfdev->msg.value;
+
+	switch (vfdev->msg.opcode) {
+	case MSG_OP_VF_MODE:
+		msg.data = ndev->mode;
+		break;
+	case MSG_OP_VF_UP:
+		vfdev->nr_queues = vfdev->msg.data;
+		atomic_set(&vfdev->state, __NDEV_READY);
+		break;
+	case MSG_OP_CHIPID_VFID:
+		msg.id.chipid = ndev->idx;
+		msg.id.vfid = vfdev->vfno;
+		break;
+	case MSG_OP_VF_DOWN:
+		vfdev->nr_queues = 0;
+		atomic_set(&vfdev->state, __NDEV_NOT_READY);
+		break;
+	default:
+		msg.type = MBX_MSG_TYPE_NOP;
+		break;
+	}
+
+	if (msg.type == MBX_MSG_TYPE_NOP)
+		return;
+
+	/* send ACK to VF */
+	msg.type = MBX_MSG_TYPE_ACK;
+	pf2vf_write_mbox(ndev, msg.value, vfdev->ring);
+
+	vfdev->msg.value = 0;
+	atomic64_inc(&vfdev->mbx_resp);
+}
+
+static void pf2vf_resp_handler(struct work_struct *work)
+{
+	struct pf2vf_work *pf2vf_resp = container_of(work, struct pf2vf_work,
+						     pf2vf_resp);
+	struct nitrox_vfdev *vfdev = pf2vf_resp->vfdev;
+	struct nitrox_device *ndev = pf2vf_resp->ndev;
+
+	switch (vfdev->msg.type) {
+	case MBX_MSG_TYPE_REQ:
+		/* process the request from VF */
+		pf2vf_send_response(ndev, vfdev);
+		break;
+	case MBX_MSG_TYPE_ACK:
+	case MBX_MSG_TYPE_NACK:
+		break;
+	};
+
+	kfree(pf2vf_resp);
+}
+
+void nitrox_pf2vf_mbox_handler(struct nitrox_device *ndev)
+{
+	struct nitrox_vfdev *vfdev;
+	struct pf2vf_work *pfwork;
+	u64 value, reg_addr;
+	u32 i;
+	int vfno;
+
+	/* loop for VF(0..63) */
+	reg_addr = NPS_PKT_MBOX_INT_LO;
+	value = nitrox_read_csr(ndev, reg_addr);
+	for_each_set_bit(i, (const unsigned long *)&value, BITS_PER_LONG) {
+		/* get the vfno from ring */
+		vfno = RING_TO_VFNO(i, ndev->iov.max_vf_queues);
+		vfdev = ndev->iov.vfdev + vfno;
+		vfdev->ring = i;
+		/* fill the vf mailbox data */
+		vfdev->msg.value = pf2vf_read_mbox(ndev, vfdev->ring);
+		pfwork = kzalloc(sizeof(*pfwork), GFP_ATOMIC);
+		if (!pfwork)
+			continue;
+
+		pfwork->vfdev = vfdev;
+		pfwork->ndev = ndev;
+		INIT_WORK(&pfwork->pf2vf_resp, pf2vf_resp_handler);
+		queue_work(ndev->iov.pf2vf_wq, &pfwork->pf2vf_resp);
+		/* clear the corresponding vf bit */
+		nitrox_write_csr(ndev, reg_addr, BIT_ULL(i));
+	}
+
+	/* loop for VF(64..127) */
+	reg_addr = NPS_PKT_MBOX_INT_HI;
+	value = nitrox_read_csr(ndev, reg_addr);
+	for_each_set_bit(i, (const unsigned long *)&value, BITS_PER_LONG) {
+		/* get the vfno from ring */
+		vfno = RING_TO_VFNO(i + 64, ndev->iov.max_vf_queues);
+		vfdev = ndev->iov.vfdev + vfno;
+		vfdev->ring = (i + 64);
+		/* fill the vf mailbox data */
+		vfdev->msg.value = pf2vf_read_mbox(ndev, vfdev->ring);
+
+		pfwork = kzalloc(sizeof(*pfwork), GFP_ATOMIC);
+		if (!pfwork)
+			continue;
+
+		pfwork->vfdev = vfdev;
+		pfwork->ndev = ndev;
+		INIT_WORK(&pfwork->pf2vf_resp, pf2vf_resp_handler);
+		queue_work(ndev->iov.pf2vf_wq, &pfwork->pf2vf_resp);
+		/* clear the corresponding vf bit */
+		nitrox_write_csr(ndev, reg_addr, BIT_ULL(i));
+	}
+}
+
+int nitrox_mbox_init(struct nitrox_device *ndev)
+{
+	struct nitrox_vfdev *vfdev;
+	int i;
+
+	ndev->iov.vfdev = kcalloc(ndev->iov.num_vfs,
+				  sizeof(struct nitrox_vfdev), GFP_KERNEL);
+	if (!ndev->iov.vfdev)
+		return -ENOMEM;
+
+	for (i = 0; i < ndev->iov.num_vfs; i++) {
+		vfdev = ndev->iov.vfdev + i;
+		vfdev->vfno = i;
+	}
+
+	/* allocate pf2vf response workqueue */
+	ndev->iov.pf2vf_wq = alloc_workqueue("nitrox_pf2vf", 0, 0);
+	if (!ndev->iov.pf2vf_wq) {
+		kfree(ndev->iov.vfdev);
+		return -ENOMEM;
+	}
+	/* enable pf2vf mailbox interrupts */
+	enable_pf2vf_mbox_interrupts(ndev);
+
+	return 0;
+}
+
+void nitrox_mbox_cleanup(struct nitrox_device *ndev)
+{
+	/* disable pf2vf mailbox interrupts */
+	disable_pf2vf_mbox_interrupts(ndev);
+	/* destroy workqueue */
+	if (ndev->iov.pf2vf_wq)
+		destroy_workqueue(ndev->iov.pf2vf_wq);
+
+	kfree(ndev->iov.vfdev);
+	ndev->iov.pf2vf_wq = NULL;
+	ndev->iov.vfdev = NULL;
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.h b/drivers/crypto/cavium/nitrox/nitrox_mbx.h
new file mode 100644
index 000000000000..5008399775a9
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.h
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __NITROX_MBX_H
+#define __NITROX_MBX_H
+
+int nitrox_mbox_init(struct nitrox_device *ndev);
+void nitrox_mbox_cleanup(struct nitrox_device *ndev);
+void nitrox_pf2vf_mbox_handler(struct nitrox_device *ndev);
+
+#endif /* __NITROX_MBX_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_sriov.c b/drivers/crypto/cavium/nitrox/nitrox_sriov.c
index 7ba0cc5d6d02..bf439d8256ba 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_sriov.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_sriov.c
@@ -6,6 +6,7 @@
 #include "nitrox_hal.h"
 #include "nitrox_common.h"
 #include "nitrox_isr.h"
+#include "nitrox_mbx.h"
 
 /**
  * num_vfs_valid - validate VF count
@@ -52,6 +53,31 @@ static inline enum vf_mode num_vfs_to_mode(int num_vfs)
 	return mode;
 }
 
+static inline int vf_mode_to_nr_queues(enum vf_mode mode)
+{
+	int nr_queues = 0;
+
+	switch (mode) {
+	case __NDEV_MODE_PF:
+		nr_queues = MAX_PF_QUEUES;
+		break;
+	case __NDEV_MODE_VF16:
+		nr_queues = 8;
+		break;
+	case __NDEV_MODE_VF32:
+		nr_queues = 4;
+		break;
+	case __NDEV_MODE_VF64:
+		nr_queues = 2;
+		break;
+	case __NDEV_MODE_VF128:
+		nr_queues = 1;
+		break;
+	}
+
+	return nr_queues;
+}
+
 static void nitrox_pf_cleanup(struct nitrox_device *ndev)
 {
 	 /* PF has no queues in SR-IOV mode */
@@ -94,16 +120,31 @@ static int nitrox_pf_reinit(struct nitrox_device *ndev)
 	return nitrox_crypto_register();
 }
 
-static int nitrox_sriov_init(struct nitrox_device *ndev)
-{
-	/* register interrupts for PF in SR-IOV */
-	return nitrox_sriov_register_interupts(ndev);
-}
-
 static void nitrox_sriov_cleanup(struct nitrox_device *ndev)
 {
 	/* unregister interrupts for PF in SR-IOV */
 	nitrox_sriov_unregister_interrupts(ndev);
+	nitrox_mbox_cleanup(ndev);
+}
+
+static int nitrox_sriov_init(struct nitrox_device *ndev)
+{
+	int ret;
+
+	/* register interrupts for PF in SR-IOV */
+	ret = nitrox_sriov_register_interupts(ndev);
+	if (ret)
+		return ret;
+
+	ret = nitrox_mbox_init(ndev);
+	if (ret)
+		goto sriov_init_fail;
+
+	return 0;
+
+sriov_init_fail:
+	nitrox_sriov_cleanup(ndev);
+	return ret;
 }
 
 static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs)
@@ -126,8 +167,9 @@ static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs)
 	}
 	dev_info(DEV(ndev), "Enabled VF(s) %d\n", num_vfs);
 
-	ndev->iov.num_vfs = num_vfs;
 	ndev->mode = num_vfs_to_mode(num_vfs);
+	ndev->iov.num_vfs = num_vfs;
+	ndev->iov.max_vf_queues = vf_mode_to_nr_queues(ndev->mode);
 	/* set bit in flags */
 	set_bit(__NDEV_SRIOV_BIT, &ndev->flags);
 
@@ -169,6 +211,7 @@ static int nitrox_sriov_disable(struct pci_dev *pdev)
 	clear_bit(__NDEV_SRIOV_BIT, &ndev->flags);
 
 	ndev->iov.num_vfs = 0;
+	ndev->iov.max_vf_queues = 0;
 	ndev->mode = __NDEV_MODE_PF;
 
 	/* cleanup PF SR-IOV resources */
-- 
cgit v1.2.3-59-g8ed1b


From ee5bbc9fd3a1fb81e9f6103d6c52ab88926a9603 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 4 Dec 2018 14:13:31 +0100
Subject: crypto: tcrypt - add block size of 1472 to skcipher template

In order to have better coverage of algorithms operating on block
sizes that are in the ballpark of a VPN  packet, add 1472 to the
block_sizes array.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/tcrypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 0590a9204562..e7fb87e114a5 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -81,7 +81,7 @@ static char *check[] = {
 	NULL
 };
 
-static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 };
+static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 };
 static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 };
 
 #define XBUFSIZE 8
-- 
cgit v1.2.3-59-g8ed1b


From f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 4 Dec 2018 14:13:32 +0100
Subject: crypto: arm64/chacha - optimize for arbitrary length inputs

Update the 4-way NEON ChaCha routine so it can handle input of any
length >64 bytes in its entirety, rather than having to call into
the 1-way routine and/or memcpy()s via temp buffers to handle the
tail of a ChaCha invocation that is not a multiple of 256 bytes.

On inputs that are a multiple of 256 bytes (and thus in tcrypt
benchmarks), performance drops by around 1% on Cortex-A57, while
performance for inputs drawn randomly from the range [64, 1024)
increases by around 30%.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/chacha-neon-core.S | 183 ++++++++++++++++++++++++++++++++---
 arch/arm64/crypto/chacha-neon-glue.c |  38 +++-----
 2 files changed, 184 insertions(+), 37 deletions(-)

diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 3d3a12db5204..8f9c2e83f6f0 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -19,6 +19,8 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
 
 	.text
 	.align		6
@@ -36,7 +38,7 @@
  */
 chacha_permute:
 
-	adr		x10, ROT8
+	adr_l		x10, ROT8
 	ld1		{v12.4s}, [x10]
 
 .Ldoubleround:
@@ -169,6 +171,12 @@ ENTRY(chacha_4block_xor_neon)
 	// x1: 4 data blocks output, o
 	// x2: 4 data blocks input, i
 	// w3: nrounds
+	// x4: byte count
+
+	adr_l		x10, .Lpermute
+	and		x5, x4, #63
+	add		x10, x10, x5
+	add		x11, x10, #64
 
 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -178,15 +186,15 @@ ENTRY(chacha_4block_xor_neon)
 	// matrix by interleaving 32- and then 64-bit words, which allows us to
 	// do XOR in NEON registers.
 	//
-	adr		x9, CTRINC		// ... and ROT8
+	adr_l		x9, CTRINC		// ... and ROT8
 	ld1		{v30.4s-v31.4s}, [x9]
 
 	// x0..15[0-3] = s0..3[0..3]
-	mov		x4, x0
-	ld4r		{ v0.4s- v3.4s}, [x4], #16
-	ld4r		{ v4.4s- v7.4s}, [x4], #16
-	ld4r		{ v8.4s-v11.4s}, [x4], #16
-	ld4r		{v12.4s-v15.4s}, [x4]
+	add		x8, x0, #16
+	ld4r		{ v0.4s- v3.4s}, [x0]
+	ld4r		{ v4.4s- v7.4s}, [x8], #16
+	ld4r		{ v8.4s-v11.4s}, [x8], #16
+	ld4r		{v12.4s-v15.4s}, [x8]
 
 	// x12 += counter values 0-3
 	add		v12.4s, v12.4s, v30.4s
@@ -430,24 +438,47 @@ ENTRY(chacha_4block_xor_neon)
 	zip1		v30.4s, v14.4s, v15.4s
 	zip2		v31.4s, v14.4s, v15.4s
 
+	mov		x3, #64
+	subs		x5, x4, #64
+	add		x6, x5, x2
+	csel		x3, x3, xzr, ge
+	csel		x2, x2, x6, ge
+
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
 	zip2		v4.2d, v16.2d, v18.2d
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
-	ld1		{v16.16b-v19.16b}, [x2], #64
+	ld1		{v16.16b-v19.16b}, [x2], x3
+
+	subs		x6, x4, #128
+	ccmp		x3, xzr, #4, lt
+	add		x7, x6, x2
+	csel		x3, x3, xzr, eq
+	csel		x2, x2, x7, eq
 
 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
-	ld1		{v20.16b-v23.16b}, [x2], #64
+	ld1		{v20.16b-v23.16b}, [x2], x3
+
+	subs		x7, x4, #192
+	ccmp		x3, xzr, #4, lt
+	add		x8, x7, x2
+	csel		x3, x3, xzr, eq
+	csel		x2, x2, x8, eq
 
 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
-	ld1		{v24.16b-v27.16b}, [x2], #64
+	ld1		{v24.16b-v27.16b}, [x2], x3
+
+	subs		x8, x4, #256
+	ccmp		x3, xzr, #4, lt
+	add		x9, x8, x2
+	csel		x2, x2, x9, eq
 
 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
@@ -456,29 +487,155 @@ ENTRY(chacha_4block_xor_neon)
 	ld1		{v28.16b-v31.16b}, [x2]
 
 	// xor with corresponding input, write to output
+	tbnz		x5, #63, 0f
 	eor		v16.16b, v16.16b, v0.16b
 	eor		v17.16b, v17.16b, v1.16b
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+
+	tbnz		x6, #63, 1f
 	eor		v20.16b, v20.16b, v4.16b
 	eor		v21.16b, v21.16b, v5.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+
+	tbnz		x7, #63, 2f
 	eor		v24.16b, v24.16b, v8.16b
 	eor		v25.16b, v25.16b, v9.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
-	eor		v28.16b, v28.16b, v12.16b
 	st1		{v24.16b-v27.16b}, [x1], #64
+
+	tbnz		x8, #63, 3f
+	eor		v28.16b, v28.16b, v12.16b
 	eor		v29.16b, v29.16b, v13.16b
 	eor		v30.16b, v30.16b, v14.16b
 	eor		v31.16b, v31.16b, v15.16b
 	st1		{v28.16b-v31.16b}, [x1]
 
 	ret
+
+	// fewer than 64 bytes of in/output
+0:	ld1		{v8.16b}, [x10]
+	ld1		{v9.16b}, [x11]
+	movi		v10.16b, #16
+	sub		x2, x1, #64
+	add		x1, x1, x5
+	ld1		{v16.16b-v19.16b}, [x2]
+	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
+
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+	st1		{v20.16b-v23.16b}, [x1]
+	ret
+
+	// fewer than 128 bytes of in/output
+1:	ld1		{v8.16b}, [x10]
+	ld1		{v9.16b}, [x11]
+	movi		v10.16b, #16
+	add		x1, x1, x6
+	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
+
+	eor		v20.16b, v20.16b, v0.16b
+	eor		v21.16b, v21.16b, v1.16b
+	eor		v22.16b, v22.16b, v2.16b
+	eor		v23.16b, v23.16b, v3.16b
+	st1		{v20.16b-v23.16b}, [x1]
+	ret
+
+	// fewer than 192 bytes of in/output
+2:	ld1		{v4.16b}, [x10]
+	ld1		{v5.16b}, [x11]
+	movi		v6.16b, #16
+	add		x1, x1, x7
+	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b
+
+	eor		v24.16b, v24.16b, v0.16b
+	eor		v25.16b, v25.16b, v1.16b
+	eor		v26.16b, v26.16b, v2.16b
+	eor		v27.16b, v27.16b, v3.16b
+	st1		{v24.16b-v27.16b}, [x1]
+	ret
+
+	// fewer than 256 bytes of in/output
+3:	ld1		{v4.16b}, [x10]
+	ld1		{v5.16b}, [x11]
+	movi		v6.16b, #16
+	add		x1, x1, x8
+	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x1]
+	ret
 ENDPROC(chacha_4block_xor_neon)
 
+	.section	".rodata", "a", %progbits
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.set		.Li, 0
+	.rept		192
+	.byte		(.Li - 64)
+	.set		.Li, .Li + 1
+	.endr
+
 CTRINC:	.word		0, 1, 2, 3
 ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 346eb85498a1..67f8feb0c717 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -32,41 +32,29 @@
 asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
 				      int nrounds);
 asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
-				       int nrounds);
+				       int nrounds, int bytes);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
+			  int bytes, int nrounds)
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		kernel_neon_begin();
-		chacha_4block_xor_neon(state, dst, src, nrounds);
-		kernel_neon_end();
+	if (bytes < CHACHA_BLOCK_SIZE) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+		return;
+	}
+
+	while (bytes > 0) {
+		chacha_4block_xor_neon(state, dst, src, nrounds,
+				       min(bytes, CHACHA_BLOCK_SIZE * 4));
 		bytes -= CHACHA_BLOCK_SIZE * 4;
 		src += CHACHA_BLOCK_SIZE * 4;
 		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-
-	if (!bytes)
-		return;
-
-	kernel_neon_begin();
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
-	}
-	kernel_neon_end();
 }
 
 static int chacha_neon_stream_xor(struct skcipher_request *req,
@@ -86,8 +74,10 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
+		kernel_neon_begin();
 		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
 			      nbytes, ctx->nrounds);
+		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 2fe55987b2624a86a5c709a8df65d4de2608dc07 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 4 Dec 2018 14:13:33 +0100
Subject: crypto: arm64/chacha - use combined SIMD/ALU routine for more speed

To some degree, most known AArch64 micro-architectures appear to be
able to issue ALU instructions in parellel to SIMD instructions
without affecting the SIMD throughput. This means we can use the ALU
to process a fifth ChaCha block while the SIMD is processing four
blocks in parallel.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/arm64/crypto/chacha-neon-core.S | 235 ++++++++++++++++++++++++++++++++---
 arch/arm64/crypto/chacha-neon-glue.c |  39 +++---
 2 files changed, 239 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 8f9c2e83f6f0..021bb9e9784b 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -1,13 +1,13 @@
 /*
  * ChaCha/XChaCha NEON helper functions
  *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * Based on:
+ * Originally based on:
  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
  *
  * Copyright (C) 2015 Martin Willi
@@ -165,8 +165,27 @@ ENTRY(hchacha_block_neon)
 	ret
 ENDPROC(hchacha_block_neon)
 
+	a0		.req	w12
+	a1		.req	w13
+	a2		.req	w14
+	a3		.req	w15
+	a4		.req	w16
+	a5		.req	w17
+	a6		.req	w19
+	a7		.req	w20
+	a8		.req	w21
+	a9		.req	w22
+	a10		.req	w23
+	a11		.req	w24
+	a12		.req	w25
+	a13		.req	w26
+	a14		.req	w27
+	a15		.req	w28
+
 	.align		6
 ENTRY(chacha_4block_xor_neon)
+	frame_push	10
+
 	// x0: Input state matrix, s
 	// x1: 4 data blocks output, o
 	// x2: 4 data blocks input, i
@@ -186,6 +205,9 @@ ENTRY(chacha_4block_xor_neon)
 	// matrix by interleaving 32- and then 64-bit words, which allows us to
 	// do XOR in NEON registers.
 	//
+	// At the same time, a fifth block is encrypted in parallel using
+	// scalar registers
+	//
 	adr_l		x9, CTRINC		// ... and ROT8
 	ld1		{v30.4s-v31.4s}, [x9]
 
@@ -196,7 +218,24 @@ ENTRY(chacha_4block_xor_neon)
 	ld4r		{ v8.4s-v11.4s}, [x8], #16
 	ld4r		{v12.4s-v15.4s}, [x8]
 
-	// x12 += counter values 0-3
+	mov		a0, v0.s[0]
+	mov		a1, v1.s[0]
+	mov		a2, v2.s[0]
+	mov		a3, v3.s[0]
+	mov		a4, v4.s[0]
+	mov		a5, v5.s[0]
+	mov		a6, v6.s[0]
+	mov		a7, v7.s[0]
+	mov		a8, v8.s[0]
+	mov		a9, v9.s[0]
+	mov		a10, v10.s[0]
+	mov		a11, v11.s[0]
+	mov		a12, v12.s[0]
+	mov		a13, v13.s[0]
+	mov		a14, v14.s[0]
+	mov		a15, v15.s[0]
+
+	// x12 += counter values 1-4
 	add		v12.4s, v12.4s, v30.4s
 
 .Ldoubleround4:
@@ -205,33 +244,53 @@ ENTRY(chacha_4block_xor_neon)
 	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
 	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
 	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
 	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
 	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
 	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
 
 	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
 	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
 	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
 	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
 
 	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
 	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
 	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
 	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
 
 	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
 	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
 	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
 	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
 	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
 	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
 	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
 	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
 
 	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
 	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
 	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
 	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
 
 	shl		v4.4s, v16.4s, #12
 	shl		v5.4s, v17.4s, #12
@@ -239,42 +298,66 @@ ENTRY(chacha_4block_xor_neon)
 	shl		v7.4s, v19.4s, #12
 
 	sri		v4.4s, v16.4s, #20
+	  ror		a4, a4, #20
 	sri		v5.4s, v17.4s, #20
+	  ror		a5, a5, #20
 	sri		v6.4s, v18.4s, #20
+	  ror		a6, a6, #20
 	sri		v7.4s, v19.4s, #20
+	  ror		a7, a7, #20
 
 	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
 	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
 	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
 	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
 	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
 	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
 	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
 	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
 
 	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
 	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
 	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
 	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
 
 	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
 	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
 	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
 	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
 
 	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
 	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
 	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
 	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
 	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
 	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
 	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
 	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
 
 	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
 	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
 	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
 	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
 
 	shl		v4.4s, v16.4s, #7
 	shl		v5.4s, v17.4s, #7
@@ -282,42 +365,66 @@ ENTRY(chacha_4block_xor_neon)
 	shl		v7.4s, v19.4s, #7
 
 	sri		v4.4s, v16.4s, #25
+	  ror		a4, a4, #25
 	sri		v5.4s, v17.4s, #25
+	  ror		a5, a5, #25
 	sri		v6.4s, v18.4s, #25
+	 ror		a6, a6, #25
 	sri		v7.4s, v19.4s, #25
+	  ror		a7, a7, #25
 
 	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
 	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
 	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
 	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
 	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
 	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
 	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
 	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
 
 	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
 	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
 	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
 	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
 
 	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
 	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
 	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
 	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
 
 	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
 	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
 	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
 	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
 	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
 	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
 	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
 	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
 
 	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
 	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
 	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
 	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
 
 	shl		v5.4s, v16.4s, #12
 	shl		v6.4s, v17.4s, #12
@@ -325,42 +432,66 @@ ENTRY(chacha_4block_xor_neon)
 	shl		v4.4s, v19.4s, #12
 
 	sri		v5.4s, v16.4s, #20
+	  ror		a5, a5, #20
 	sri		v6.4s, v17.4s, #20
+	  ror		a6, a6, #20
 	sri		v7.4s, v18.4s, #20
+	  ror		a7, a7, #20
 	sri		v4.4s, v19.4s, #20
+	  ror		a4, a4, #20
 
 	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
 	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
 	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
 	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
 	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
 	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
 	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
 	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
 
 	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
 	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
 	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
 	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
 
 	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
 	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
 	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
 	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
 
 	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
 	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
 	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
 	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
 	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
 	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
 	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
 	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
 
 	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
 	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
 	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
 	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
 
 	shl		v5.4s, v16.4s, #7
 	shl		v6.4s, v17.4s, #7
@@ -368,9 +499,13 @@ ENTRY(chacha_4block_xor_neon)
 	shl		v4.4s, v19.4s, #7
 
 	sri		v5.4s, v16.4s, #25
+	  ror		a5, a5, #25
 	sri		v6.4s, v17.4s, #25
+	  ror		a6, a6, #25
 	sri		v7.4s, v18.4s, #25
+	  ror		a7, a7, #25
 	sri		v4.4s, v19.4s, #25
+	  ror		a4, a4, #25
 
 	subs		w3, w3, #2
 	b.ne		.Ldoubleround4
@@ -386,9 +521,17 @@ ENTRY(chacha_4block_xor_neon)
 	// x2[0-3] += s0[2]
 	// x3[0-3] += s0[3]
 	add		v0.4s, v0.4s, v16.4s
+	  mov		w6, v16.s[0]
+	  mov		w7, v17.s[0]
 	add		v1.4s, v1.4s, v17.4s
+	  mov		w8, v18.s[0]
+	  mov		w9, v19.s[0]
 	add		v2.4s, v2.4s, v18.4s
+	  add		a0, a0, w6
+	  add		a1, a1, w7
 	add		v3.4s, v3.4s, v19.4s
+	  add		a2, a2, w8
+	  add		a3, a3, w9
 
 	ld4r		{v24.4s-v27.4s}, [x0], #16
 	ld4r		{v28.4s-v31.4s}, [x0]
@@ -398,48 +541,96 @@ ENTRY(chacha_4block_xor_neon)
 	// x6[0-3] += s1[2]
 	// x7[0-3] += s1[3]
 	add		v4.4s, v4.4s, v20.4s
+	  mov		w6, v20.s[0]
+	  mov		w7, v21.s[0]
 	add		v5.4s, v5.4s, v21.4s
+	  mov		w8, v22.s[0]
+	  mov		w9, v23.s[0]
 	add		v6.4s, v6.4s, v22.4s
+	  add		a4, a4, w6
+	  add		a5, a5, w7
 	add		v7.4s, v7.4s, v23.4s
+	  add		a6, a6, w8
+	  add		a7, a7, w9
 
 	// x8[0-3] += s2[0]
 	// x9[0-3] += s2[1]
 	// x10[0-3] += s2[2]
 	// x11[0-3] += s2[3]
 	add		v8.4s, v8.4s, v24.4s
+	  mov		w6, v24.s[0]
+	  mov		w7, v25.s[0]
 	add		v9.4s, v9.4s, v25.4s
+	  mov		w8, v26.s[0]
+	  mov		w9, v27.s[0]
 	add		v10.4s, v10.4s, v26.4s
+	  add		a8, a8, w6
+	  add		a9, a9, w7
 	add		v11.4s, v11.4s, v27.4s
+	  add		a10, a10, w8
+	  add		a11, a11, w9
 
 	// x12[0-3] += s3[0]
 	// x13[0-3] += s3[1]
 	// x14[0-3] += s3[2]
 	// x15[0-3] += s3[3]
 	add		v12.4s, v12.4s, v28.4s
+	  mov		w6, v28.s[0]
+	  mov		w7, v29.s[0]
 	add		v13.4s, v13.4s, v29.4s
+	  mov		w8, v30.s[0]
+	  mov		w9, v31.s[0]
 	add		v14.4s, v14.4s, v30.4s
+	  add		a12, a12, w6
+	  add		a13, a13, w7
 	add		v15.4s, v15.4s, v31.4s
+	  add		a14, a14, w8
+	  add		a15, a15, w9
 
 	// interleave 32-bit words in state n, n+1
+	  ldp		w6, w7, [x2], #64
 	zip1		v16.4s, v0.4s, v1.4s
+	  ldp		w8, w9, [x2, #-56]
+	  eor		a0, a0, w6
 	zip2		v17.4s, v0.4s, v1.4s
+	  eor		a1, a1, w7
 	zip1		v18.4s, v2.4s, v3.4s
+	  eor		a2, a2, w8
 	zip2		v19.4s, v2.4s, v3.4s
+	  eor		a3, a3, w9
+	  ldp		w6, w7, [x2, #-48]
 	zip1		v20.4s, v4.4s, v5.4s
+	  ldp		w8, w9, [x2, #-40]
+	  eor		a4, a4, w6
 	zip2		v21.4s, v4.4s, v5.4s
+	  eor		a5, a5, w7
 	zip1		v22.4s, v6.4s, v7.4s
+	  eor		a6, a6, w8
 	zip2		v23.4s, v6.4s, v7.4s
+	  eor		a7, a7, w9
+	  ldp		w6, w7, [x2, #-32]
 	zip1		v24.4s, v8.4s, v9.4s
+	  ldp		w8, w9, [x2, #-24]
+	  eor		a8, a8, w6
 	zip2		v25.4s, v8.4s, v9.4s
+	  eor		a9, a9, w7
 	zip1		v26.4s, v10.4s, v11.4s
+	  eor		a10, a10, w8
 	zip2		v27.4s, v10.4s, v11.4s
+	  eor		a11, a11, w9
+	  ldp		w6, w7, [x2, #-16]
 	zip1		v28.4s, v12.4s, v13.4s
+	  ldp		w8, w9, [x2, #-8]
+	  eor		a12, a12, w6
 	zip2		v29.4s, v12.4s, v13.4s
+	  eor		a13, a13, w7
 	zip1		v30.4s, v14.4s, v15.4s
+	  eor		a14, a14, w8
 	zip2		v31.4s, v14.4s, v15.4s
+	  eor		a15, a15, w9
 
 	mov		x3, #64
-	subs		x5, x4, #64
+	subs		x5, x4, #128
 	add		x6, x5, x2
 	csel		x3, x3, xzr, ge
 	csel		x2, x2, x6, ge
@@ -447,11 +638,13 @@ ENTRY(chacha_4block_xor_neon)
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
 	zip2		v4.2d, v16.2d, v18.2d
+	  stp		a0, a1, [x1], #64
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
+	  stp		a2, a3, [x1, #-56]
 	ld1		{v16.16b-v19.16b}, [x2], x3
 
-	subs		x6, x4, #128
+	subs		x6, x4, #192
 	ccmp		x3, xzr, #4, lt
 	add		x7, x6, x2
 	csel		x3, x3, xzr, eq
@@ -459,11 +652,13 @@ ENTRY(chacha_4block_xor_neon)
 
 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
+	  stp		a4, a5, [x1, #-48]
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
+	  stp		a6, a7, [x1, #-40]
 	ld1		{v20.16b-v23.16b}, [x2], x3
 
-	subs		x7, x4, #192
+	subs		x7, x4, #256
 	ccmp		x3, xzr, #4, lt
 	add		x8, x7, x2
 	csel		x3, x3, xzr, eq
@@ -471,19 +666,23 @@ ENTRY(chacha_4block_xor_neon)
 
 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
+	  stp		a8, a9, [x1, #-32]
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
+	  stp		a10, a11, [x1, #-24]
 	ld1		{v24.16b-v27.16b}, [x2], x3
 
-	subs		x8, x4, #256
+	subs		x8, x4, #320
 	ccmp		x3, xzr, #4, lt
 	add		x9, x8, x2
 	csel		x2, x2, x9, eq
 
 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
+	  stp		a12, a13, [x1, #-16]
 	zip1		v11.2d, v29.2d, v31.2d
 	zip2		v15.2d, v29.2d, v31.2d
+	  stp		a14, a15, [x1, #-8]
 	ld1		{v28.16b-v31.16b}, [x2]
 
 	// xor with corresponding input, write to output
@@ -493,6 +692,7 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
 	st1		{v16.16b-v19.16b}, [x1], #64
+	cbz		x5, .Lout
 
 	tbnz		x6, #63, 1f
 	eor		v20.16b, v20.16b, v4.16b
@@ -500,6 +700,7 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
 	st1		{v20.16b-v23.16b}, [x1], #64
+	cbz		x6, .Lout
 
 	tbnz		x7, #63, 2f
 	eor		v24.16b, v24.16b, v8.16b
@@ -507,6 +708,7 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
 	st1		{v24.16b-v27.16b}, [x1], #64
+	cbz		x7, .Lout
 
 	tbnz		x8, #63, 3f
 	eor		v28.16b, v28.16b, v12.16b
@@ -515,9 +717,10 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v31.16b, v31.16b, v15.16b
 	st1		{v28.16b-v31.16b}, [x1]
 
+.Lout:	frame_pop
 	ret
 
-	// fewer than 64 bytes of in/output
+	// fewer than 128 bytes of in/output
 0:	ld1		{v8.16b}, [x10]
 	ld1		{v9.16b}, [x11]
 	movi		v10.16b, #16
@@ -544,9 +747,9 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
 	st1		{v20.16b-v23.16b}, [x1]
-	ret
+	b		.Lout
 
-	// fewer than 128 bytes of in/output
+	// fewer than 192 bytes of in/output
 1:	ld1		{v8.16b}, [x10]
 	ld1		{v9.16b}, [x11]
 	movi		v10.16b, #16
@@ -571,9 +774,9 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v22.16b, v22.16b, v2.16b
 	eor		v23.16b, v23.16b, v3.16b
 	st1		{v20.16b-v23.16b}, [x1]
-	ret
+	b		.Lout
 
-	// fewer than 192 bytes of in/output
+	// fewer than 256 bytes of in/output
 2:	ld1		{v4.16b}, [x10]
 	ld1		{v5.16b}, [x11]
 	movi		v6.16b, #16
@@ -598,9 +801,9 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v26.16b, v26.16b, v2.16b
 	eor		v27.16b, v27.16b, v3.16b
 	st1		{v24.16b-v27.16b}, [x1]
-	ret
+	b		.Lout
 
-	// fewer than 256 bytes of in/output
+	// fewer than 320 bytes of in/output
 3:	ld1		{v4.16b}, [x10]
 	ld1		{v5.16b}, [x11]
 	movi		v6.16b, #16
@@ -625,7 +828,7 @@ ENTRY(chacha_4block_xor_neon)
 	eor		v30.16b, v30.16b, v2.16b
 	eor		v31.16b, v31.16b, v3.16b
 	st1		{v28.16b-v31.16b}, [x1]
-	ret
+	b		.Lout
 ENDPROC(chacha_4block_xor_neon)
 
 	.section	".rodata", "a", %progbits
@@ -637,5 +840,5 @@ ENDPROC(chacha_4block_xor_neon)
 	.set		.Li, .Li + 1
 	.endr
 
-CTRINC:	.word		0, 1, 2, 3
+CTRINC:	.word		1, 2, 3, 4
 ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 67f8feb0c717..bece1d85bd81 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -38,22 +38,23 @@ asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
 			  int bytes, int nrounds)
 {
-	u8 buf[CHACHA_BLOCK_SIZE];
-
-	if (bytes < CHACHA_BLOCK_SIZE) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
-		return;
-	}
-
 	while (bytes > 0) {
-		chacha_4block_xor_neon(state, dst, src, nrounds,
-				       min(bytes, CHACHA_BLOCK_SIZE * 4));
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
+		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+		if (l <= CHACHA_BLOCK_SIZE) {
+			u8 buf[CHACHA_BLOCK_SIZE];
+
+			memcpy(buf, src, l);
+			chacha_block_xor_neon(state, buf, buf, nrounds);
+			memcpy(dst, buf, l);
+			state[12] += 1;
+			break;
+		}
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= CHACHA_BLOCK_SIZE * 5;
+		src += CHACHA_BLOCK_SIZE * 5;
+		dst += CHACHA_BLOCK_SIZE * 5;
+		state[12] += 5;
 	}
 }
 
@@ -72,7 +73,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
 		unsigned int nbytes = walk.nbytes;
 
 		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
+			nbytes = rounddown(nbytes, walk.stride);
 
 		kernel_neon_begin();
 		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
@@ -131,7 +132,7 @@ static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= CHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= chacha_neon,
 		.decrypt		= chacha_neon,
@@ -147,7 +148,7 @@ static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= XCHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= xchacha_neon,
 		.decrypt		= xchacha_neon,
@@ -163,7 +164,7 @@ static struct skcipher_alg algs[] = {
 		.max_keysize		= CHACHA_KEY_SIZE,
 		.ivsize			= XCHACHA_IV_SIZE,
 		.chunksize		= CHACHA_BLOCK_SIZE,
-		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha12_setkey,
 		.encrypt		= xchacha_neon,
 		.decrypt		= xchacha_neon,
-- 
cgit v1.2.3-59-g8ed1b


From b299362ee48db8eab34208302ee9730ff9d6091c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 16:46:54 -0800
Subject: crypto: adiantum - propagate CRYPTO_ALG_ASYNC flag to instance

If the stream cipher implementation is asynchronous, then the Adiantum
instance must be flagged as asynchronous as well.  Otherwise someone
asking for a synchronous algorithm can get an asynchronous algorithm.

There are no asynchronous xchacha12 or xchacha20 implementations yet
which makes this largely a theoretical issue, but it should be fixed.

Fixes: 059c2a4d8e16 ("crypto: adiantum - add Adiantum support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/adiantum.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crypto/adiantum.c b/crypto/adiantum.c
index 2dfcf12fd452..ca27e0dc2958 100644
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -590,6 +590,8 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
 		     hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
 		goto out_drop_hash;
 
+	inst->alg.base.cra_flags = streamcipher_alg->base.cra_flags &
+				   CRYPTO_ALG_ASYNC;
 	inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE;
 	inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx);
 	inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask |
-- 
cgit v1.2.3-59-g8ed1b


From 012c82388c032cd4a9821e11bae336cf4a014822 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 22:20:00 -0800
Subject: crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a 64-bit SSE2 implementation of NHPoly1305, an ε-almost-∆-universal
hash function used in the Adiantum encryption mode.  For now, only the
NH portion is actually SSE2-accelerated; the Poly1305 part is less
performance-critical so is just implemented in C.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile               |   4 ++
 arch/x86/crypto/nh-sse2-x86_64.S       | 123 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/nhpoly1305-sse2-glue.c |  76 ++++++++++++++++++++
 crypto/Kconfig                         |   8 +++
 4 files changed, 211 insertions(+)
 create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S
 create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ce4e43642984..2a6acb4de373 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -47,6 +47,8 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
 obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
 obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
 
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -85,6 +87,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
 morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
 morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
 
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+
 ifeq ($(avx_supported),yes)
 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
 					camellia_aesni_avx_glue.o
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 000000000000..51f52d4ab4bb
--- /dev/null
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define		PASS0_SUMS	%xmm0
+#define		PASS1_SUMS	%xmm1
+#define		PASS2_SUMS	%xmm2
+#define		PASS3_SUMS	%xmm3
+#define		K0		%xmm4
+#define		K1		%xmm5
+#define		K2		%xmm6
+#define		K3		%xmm7
+#define		T0		%xmm8
+#define		T1		%xmm9
+#define		T2		%xmm10
+#define		T3		%xmm11
+#define		T4		%xmm12
+#define		T5		%xmm13
+#define		T6		%xmm14
+#define		T7		%xmm15
+#define		KEY		%rdi
+#define		MESSAGE		%rsi
+#define		MESSAGE_LEN	%rdx
+#define		HASH		%rcx
+
+.macro _nh_stride	k0, k1, k2, k3, offset
+
+	// Load next message stride
+	movdqu		\offset(MESSAGE), T1
+
+	// Load next key stride
+	movdqu		\offset(KEY), \k3
+
+	// Add message words to key words
+	movdqa		T1, T2
+	movdqa		T1, T3
+	paddd		T1, \k0    // reuse k0 to avoid a move
+	paddd		\k1, T1
+	paddd		\k2, T2
+	paddd		\k3, T3
+
+	// Multiply 32x32 => 64 and accumulate
+	pshufd		$0x10, \k0, T4
+	pshufd		$0x32, \k0, \k0
+	pshufd		$0x10, T1, T5
+	pshufd		$0x32, T1, T1
+	pshufd		$0x10, T2, T6
+	pshufd		$0x32, T2, T2
+	pshufd		$0x10, T3, T7
+	pshufd		$0x32, T3, T3
+	pmuludq		T4, \k0
+	pmuludq		T5, T1
+	pmuludq		T6, T2
+	pmuludq		T7, T3
+	paddq		\k0, PASS0_SUMS
+	paddq		T1, PASS1_SUMS
+	paddq		T2, PASS2_SUMS
+	paddq		T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_sse2)
+
+	movdqu		0x00(KEY), K0
+	movdqu		0x10(KEY), K1
+	movdqu		0x20(KEY), K2
+	add		$0x30, KEY
+	pxor		PASS0_SUMS, PASS0_SUMS
+	pxor		PASS1_SUMS, PASS1_SUMS
+	pxor		PASS2_SUMS, PASS2_SUMS
+	pxor		PASS3_SUMS, PASS3_SUMS
+
+	sub		$0x40, MESSAGE_LEN
+	jl		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3, 0x00
+	_nh_stride	K1, K2, K3, K0, 0x10
+	_nh_stride	K2, K3, K0, K1, 0x20
+	_nh_stride	K3, K0, K1, K2, 0x30
+	add		$0x40, KEY
+	add		$0x40, MESSAGE
+	sub		$0x40, MESSAGE_LEN
+	jge		.Lloop4
+
+.Lloop4_done:
+	and		$0x3f, MESSAGE_LEN
+	jz		.Ldone
+	_nh_stride	K0, K1, K2, K3, 0x00
+
+	sub		$0x10, MESSAGE_LEN
+	jz		.Ldone
+	_nh_stride	K1, K2, K3, K0, 0x10
+
+	sub		$0x10, MESSAGE_LEN
+	jz		.Ldone
+	_nh_stride	K2, K3, K0, K1, 0x20
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	movdqa		PASS0_SUMS, T0
+	movdqa		PASS2_SUMS, T1
+	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
+	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
+	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
+	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
+	paddq		PASS0_SUMS, T0
+	paddq		PASS2_SUMS, T1
+	movdqu		T0, 0x00(HASH)
+	movdqu		T1, 0x10(HASH)
+	ret
+ENDPROC(nh_sse2)
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
new file mode 100644
index 000000000000..ed68d164ce14
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (SSE2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+
+asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_sse2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_sse2_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !irq_fpu_usable())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_fpu_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+		kernel_fpu_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-sse2",
+	.base.cra_priority	= 200,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_sse2_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_XMM2))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 5994d0fa7a78..cd3d9cb3482c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -501,6 +501,14 @@ config CRYPTO_NHPOLY1305
 	select CRYPTO_HASH
 	select CRYPTO_POLY1305
 
+config CRYPTO_NHPOLY1305_SSE2
+	tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_NHPOLY1305
+	help
+	  SSE2 optimized implementation of the hash function used by the
+	  Adiantum encryption mode.
+
 config CRYPTO_ADIANTUM
 	tristate "Adiantum support"
 	select CRYPTO_CHACHA20
-- 
cgit v1.2.3-59-g8ed1b


From 0f961f9f670e7c07690bfde2f533b93c653569cc Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 22:20:01 -0800
Subject: crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a 64-bit AVX2 implementation of NHPoly1305, an ε-almost-∆-universal
hash function used in the Adiantum encryption mode.  For now, only the
NH portion is actually AVX2-accelerated; the Poly1305 part is less
performance-critical so is just implemented in C.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile               |   3 +
 arch/x86/crypto/nh-avx2-x86_64.S       | 157 +++++++++++++++++++++++++++++++++
 arch/x86/crypto/nhpoly1305-avx2-glue.c |  77 ++++++++++++++++
 crypto/Kconfig                         |   8 ++
 4 files changed, 245 insertions(+)
 create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S
 create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2a6acb4de373..0b31b16f49d8 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
 obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
 
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
 
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
@@ -106,6 +107,8 @@ ifeq ($(avx2_supported),yes)
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 
 	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
+
+	nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
 endif
 
 ifeq ($(avx512_supported),yes)
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 000000000000..f7946ea1b704
--- /dev/null
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define		PASS0_SUMS	%ymm0
+#define		PASS1_SUMS	%ymm1
+#define		PASS2_SUMS	%ymm2
+#define		PASS3_SUMS	%ymm3
+#define		K0		%ymm4
+#define		K0_XMM		%xmm4
+#define		K1		%ymm5
+#define		K1_XMM		%xmm5
+#define		K2		%ymm6
+#define		K2_XMM		%xmm6
+#define		K3		%ymm7
+#define		K3_XMM		%xmm7
+#define		T0		%ymm8
+#define		T1		%ymm9
+#define		T2		%ymm10
+#define		T2_XMM		%xmm10
+#define		T3		%ymm11
+#define		T3_XMM		%xmm11
+#define		T4		%ymm12
+#define		T5		%ymm13
+#define		T6		%ymm14
+#define		T7		%ymm15
+#define		KEY		%rdi
+#define		MESSAGE		%rsi
+#define		MESSAGE_LEN	%rdx
+#define		HASH		%rcx
+
+.macro _nh_2xstride	k0, k1, k2, k3
+
+	// Add message words to key words
+	vpaddd		\k0, T3, T0
+	vpaddd		\k1, T3, T1
+	vpaddd		\k2, T3, T2
+	vpaddd		\k3, T3, T3
+
+	// Multiply 32x32 => 64 and accumulate
+	vpshufd		$0x10, T0, T4
+	vpshufd		$0x32, T0, T0
+	vpshufd		$0x10, T1, T5
+	vpshufd		$0x32, T1, T1
+	vpshufd		$0x10, T2, T6
+	vpshufd		$0x32, T2, T2
+	vpshufd		$0x10, T3, T7
+	vpshufd		$0x32, T3, T3
+	vpmuludq	T4, T0, T0
+	vpmuludq	T5, T1, T1
+	vpmuludq	T6, T2, T2
+	vpmuludq	T7, T3, T3
+	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
+	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
+	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
+	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+	vmovdqu		0x00(KEY), K0
+	vmovdqu		0x10(KEY), K1
+	add		$0x20, KEY
+	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+	sub		$0x40, MESSAGE_LEN
+	jl		.Lloop4_done
+.Lloop4:
+	vmovdqu		(MESSAGE), T3
+	vmovdqu		0x00(KEY), K2
+	vmovdqu		0x10(KEY), K3
+	_nh_2xstride	K0, K1, K2, K3
+
+	vmovdqu		0x20(MESSAGE), T3
+	vmovdqu		0x20(KEY), K0
+	vmovdqu		0x30(KEY), K1
+	_nh_2xstride	K2, K3, K0, K1
+
+	add		$0x40, MESSAGE
+	add		$0x40, KEY
+	sub		$0x40, MESSAGE_LEN
+	jge		.Lloop4
+
+.Lloop4_done:
+	and		$0x3f, MESSAGE_LEN
+	jz		.Ldone
+
+	cmp		$0x20, MESSAGE_LEN
+	jl		.Llast
+
+	// 2 or 3 strides remain; do 2 more.
+	vmovdqu		(MESSAGE), T3
+	vmovdqu		0x00(KEY), K2
+	vmovdqu		0x10(KEY), K3
+	_nh_2xstride	K0, K1, K2, K3
+	add		$0x20, MESSAGE
+	add		$0x20, KEY
+	sub		$0x20, MESSAGE_LEN
+	jz		.Ldone
+	vmovdqa		K2, K0
+	vmovdqa		K3, K1
+.Llast:
+	// Last stride.  Zero the high 128 bits of the message and keys so they
+	// don't affect the result when processing them like 2 strides.
+	vmovdqu		(MESSAGE), T3_XMM
+	vmovdqa		K0_XMM, K0_XMM
+	vmovdqa		K1_XMM, K1_XMM
+	vmovdqu		0x00(KEY), K2_XMM
+	vmovdqu		0x10(KEY), K3_XMM
+	_nh_2xstride	K0, K1, K2, K3
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+
+	// PASS0_SUMS is (0A 0B 0C 0D)
+	// PASS1_SUMS is (1A 1B 1C 1D)
+	// PASS2_SUMS is (2A 2B 2C 2D)
+	// PASS3_SUMS is (3A 3B 3C 3D)
+	// We need the horizontal sums:
+	//     (0A + 0B + 0C + 0D,
+	//	1A + 1B + 1C + 1D,
+	//	2A + 2B + 2C + 2D,
+	//	3A + 3B + 3C + 3D)
+	//
+
+	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
+	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
+	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
+	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)
+
+	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
+	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
+	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
+	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)
+
+	vpaddq		T5, T4, T4
+	vpaddq		T1, T0, T0
+	vpaddq		T4, T0, T0
+	vmovdqu		T0, (HASH)
+	ret
+ENDPROC(nh_avx2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
new file mode 100644
index 000000000000..20d815ea4b6a
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (AVX2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+
+asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_avx2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_avx2_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !irq_fpu_usable())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_fpu_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+		kernel_fpu_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-avx2",
+	.base.cra_priority	= 300,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_avx2_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_OSXSAVE))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index cd3d9cb3482c..d0bff6ea6b10 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -509,6 +509,14 @@ config CRYPTO_NHPOLY1305_SSE2
 	  SSE2 optimized implementation of the hash function used by the
 	  Adiantum encryption mode.
 
+config CRYPTO_NHPOLY1305_AVX2
+	tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_NHPOLY1305
+	help
+	  AVX2 optimized implementation of the hash function used by the
+	  Adiantum encryption mode.
+
 config CRYPTO_ADIANTUM
 	tristate "Adiantum support"
 	select CRYPTO_CHACHA20
-- 
cgit v1.2.3-59-g8ed1b


From 4af78261870a7d36dd222af8dad9688b705e365e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 22:20:02 -0800
Subject: crypto: x86/chacha20 - add XChaCha20 support

Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD
implementations of ChaCha20.  This can be used by Adiantum.

An SSSE3 implementation of single-block HChaCha20 is also added so that
XChaCha20 can use it rather than the generic implementation.  This
required refactoring the ChaCha permutation into its own function.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha20-ssse3-x86_64.S |  81 ++++++++++++++++--------
 arch/x86/crypto/chacha20_glue.c         | 108 ++++++++++++++++++++++++--------
 crypto/Kconfig                          |  12 +---
 3 files changed, 141 insertions(+), 60 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index d8ac75bb448f..f6792789f875 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/frame.h>
 
 .section	.rodata.cst16.ROT8, "aM", @progbits, 16
 .align 16
@@ -23,37 +24,24 @@ CTRINC:	.octa 0x00000003000000020000000100000000
 
 .text
 
-ENTRY(chacha20_block_xor_ssse3)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 1 data block output, o
-	# %rdx: up to 1 data block input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts one ChaCha20 block by loading the state matrix
-	# in four SSE registers. It performs matrix operation on four words in
-	# parallel, but requires shuffling to rearrange the words after each
-	# round. 8/16-bit word rotation is done with the slightly better
-	# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
-	# traditional shift+OR.
-
-	# x0..3 = s0..3
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
-	movdqa		%xmm0,%xmm8
-	movdqa		%xmm1,%xmm9
-	movdqa		%xmm2,%xmm10
-	movdqa		%xmm3,%xmm11
+/*
+ * chacha20_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round.  8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * Clobbers: %ecx, %xmm4-%xmm7
+ */
+chacha20_permute:
 
 	movdqa		ROT8(%rip),%xmm4
 	movdqa		ROT16(%rip),%xmm5
-
-	mov		%rcx,%rax
 	mov		$10,%ecx
 
 .Ldoubleround:
-
 	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 	paddd		%xmm1,%xmm0
 	pxor		%xmm0,%xmm3
@@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3)
 	dec		%ecx
 	jnz		.Ldoubleround
 
+	ret
+ENDPROC(chacha20_permute)
+
+ENTRY(chacha20_block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 1 data block output, o
+	# %rdx: up to 1 data block input, i
+	# %rcx: input/output length in bytes
+	FRAME_BEGIN
+
+	# x0..3 = s0..3
+	movdqa		0x00(%rdi),%xmm0
+	movdqa		0x10(%rdi),%xmm1
+	movdqa		0x20(%rdi),%xmm2
+	movdqa		0x30(%rdi),%xmm3
+	movdqa		%xmm0,%xmm8
+	movdqa		%xmm1,%xmm9
+	movdqa		%xmm2,%xmm10
+	movdqa		%xmm3,%xmm11
+
+	mov		%rcx,%rax
+	call		chacha20_permute
+
 	# o0 = i0 ^ (x0 + s0)
 	paddd		%xmm8,%xmm0
 	cmp		$0x10,%rax
@@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3)
 	movdqu		%xmm0,0x30(%rsi)
 
 .Ldone:
+	FRAME_END
 	ret
 
 .Lxorpart:
@@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3)
 
 ENDPROC(chacha20_block_xor_ssse3)
 
+ENTRY(hchacha20_block_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: output (8 32-bit words)
+	FRAME_BEGIN
+
+	movdqa		0x00(%rdi),%xmm0
+	movdqa		0x10(%rdi),%xmm1
+	movdqa		0x20(%rdi),%xmm2
+	movdqa		0x30(%rdi),%xmm3
+
+	call		chacha20_permute
+
+	movdqu		%xmm0,0x00(%rsi)
+	movdqu		%xmm3,0x10(%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(hchacha20_block_ssse3)
+
 ENTRY(chacha20_4block_xor_ssse3)
 	# %rdi: Input state matrix, s
 	# %rsi: up to 4 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 773d075a1483..70d388e4a3a2 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					  unsigned int len);
+asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out);
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
@@ -121,10 +122,9 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
-static int chacha20_simd(struct skcipher_request *req)
+static int chacha20_simd_stream_xor(struct skcipher_request *req,
+				    struct chacha_ctx *ctx, u8 *iv)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
 	u32 *state, state_buf[16 + 2] __aligned(8);
 	struct skcipher_walk walk;
 	int err;
@@ -132,14 +132,9 @@ static int chacha20_simd(struct skcipher_request *req)
 	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
 	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
 
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha_crypt(req);
-
 	err = skcipher_walk_virt(&walk, req, true);
 
-	crypto_chacha_init(state, ctx, walk.iv);
-
-	kernel_fpu_begin();
+	crypto_chacha_init(state, ctx, iv);
 
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
@@ -153,26 +148,85 @@ static int chacha20_simd(struct skcipher_request *req)
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
+	return err;
+}
+
+static int chacha20_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_chacha_crypt(req);
+
+	kernel_fpu_begin();
+	err = chacha20_simd_stream_xor(req, ctx, req->iv);
+	kernel_fpu_end();
+	return err;
+}
+
+static int xchacha20_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	u8 real_iv[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_xchacha_crypt(req);
+
+	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_fpu_begin();
+
+	hchacha20_block_ssse3(state, subctx.key);
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	err = chacha20_simd_stream_xor(req, &subctx, real_iv);
+
 	kernel_fpu_end();
 
 	return err;
 }
 
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-simd",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA_KEY_SIZE,
-	.max_keysize		= CHACHA_KEY_SIZE,
-	.ivsize			= CHACHA_IV_SIZE,
-	.chunksize		= CHACHA_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_simd,
-	.decrypt		= chacha20_simd,
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha20_simd,
+		.decrypt		= chacha20_simd,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha20_simd,
+		.decrypt		= xchacha20_simd,
+	},
 };
 
 static int __init chacha20_simd_mod_init(void)
@@ -190,12 +244,12 @@ static int __init chacha20_simd_mod_init(void)
 				boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
 #endif
 #endif
-	return crypto_register_skcipher(&alg);
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 static void __exit chacha20_simd_mod_fini(void)
 {
-	crypto_unregister_skcipher(&alg);
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
 module_init(chacha20_simd_mod_init);
@@ -206,3 +260,5 @@ MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
 MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
 MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index d0bff6ea6b10..dc3a0e3ea2ac 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1468,19 +1468,13 @@ config CRYPTO_CHACHA20
 	  in some performance-sensitive scenarios.
 
 config CRYPTO_CHACHA20_X86_64
-	tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
+	tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 	help
-	  ChaCha20 cipher algorithm, RFC7539.
-
-	  ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
-	  Bernstein and further specified in RFC7539 for use in IETF protocols.
-	  This is the x86_64 assembler implementation using SIMD instructions.
-
-	  See also:
-	  <http://cr.yp.to/chacha/chacha-20080128.pdf>
+	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20
+	  and XChaCha20 stream ciphers.
 
 config CRYPTO_SEED
 	tristate "SEED cipher algorithm"
-- 
cgit v1.2.3-59-g8ed1b


From 8b65f34c5821e7361488dc668d21195ea4c9f14d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 22:20:03 -0800
Subject: crypto: x86/chacha20 - refactor to allow varying number of rounds

In preparation for adding XChaCha12 support, rename/refactor the x86_64
SIMD implementations of ChaCha20 to support different numbers of rounds.

Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                   |    8 +-
 arch/x86/crypto/chacha-avx2-x86_64.S       | 1025 +++++++++++++++++++++++++++
 arch/x86/crypto/chacha-avx512vl-x86_64.S   |  836 +++++++++++++++++++++++
 arch/x86/crypto/chacha-ssse3-x86_64.S      |  795 +++++++++++++++++++++
 arch/x86/crypto/chacha20-avx2-x86_64.S     | 1026 ----------------------------
 arch/x86/crypto/chacha20-avx512vl-x86_64.S |  839 -----------------------
 arch/x86/crypto/chacha20-ssse3-x86_64.S    |  792 ---------------------
 arch/x86/crypto/chacha20_glue.c            |  264 -------
 arch/x86/crypto/chacha_glue.c              |  270 ++++++++
 9 files changed, 2930 insertions(+), 2925 deletions(-)
 create mode 100644 arch/x86/crypto/chacha-avx2-x86_64.S
 create mode 100644 arch/x86/crypto/chacha-avx512vl-x86_64.S
 create mode 100644 arch/x86/crypto/chacha-ssse3-x86_64.S
 delete mode 100644 arch/x86/crypto/chacha20-avx2-x86_64.S
 delete mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S
 delete mode 100644 arch/x86/crypto/chacha20-ssse3-x86_64.S
 delete mode 100644 arch/x86/crypto/chacha20_glue.c
 create mode 100644 arch/x86/crypto/chacha_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 0b31b16f49d8..45734e1cf967 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -78,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -103,7 +103,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
-	chacha20-x86_64-y += chacha20-avx2-x86_64.o
+	chacha-x86_64-y += chacha-avx2-x86_64.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 
 	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
@@ -112,7 +112,7 @@ ifeq ($(avx2_supported),yes)
 endif
 
 ifeq ($(avx512_supported),yes)
-	chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+	chacha-x86_64-y += chacha-avx512vl-x86_64.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
new file mode 100644
index 000000000000..32903fd450af
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -0,0 +1,1025 @@
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+	.octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section	.rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+	.octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
+.text
+
+ENTRY(chacha_2block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts two ChaCha blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-2] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+	vmovdqa		ROT8(%rip),%ymm4
+	vmovdqa		ROT16(%rip),%ymm5
+
+	mov		%rcx,%rax
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rax
+	jl		.Lxorpart2
+	vpxor		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rax
+	jl		.Lxorpart2
+	vpxor		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rax
+	jl		.Lxorpart2
+	vpxor		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rax
+	jl		.Lxorpart2
+	vpxor		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rax
+	jl		.Lxorpart2
+	vpxor		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rax
+	jl		.Lxorpart2
+	vpxor		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rax
+	jl		.Lxorpart2
+	vpxor		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rax
+	jl		.Lxorpart2
+	vpxor		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	ret
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone2
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm7,%xmm7
+	vmovdqa		%xmm7,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone2
+
+ENDPROC(chacha_2block_xor_avx2)
+
+ENTRY(chacha_4block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, sequentially
+	# to the operations on the four words of the other two matrices. The
+	# required word shuffling has a rather high latency, we can do the
+	# arithmetic on two matrix-pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-4] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+	vmovdqa		ROT8(%rip),%ymm8
+	vmovdqa		ROT16(%rip),%ymm9
+
+	mov		%rcx,%rax
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
+	vpxor		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	vpxor		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	vpxor		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	vpxor		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
+	vpxor		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	vpxor		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	vpxor		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	vpxor		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	vpxor		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	vpxor		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	vpxor		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	vpxor		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
+	vpxor		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
+	vpxor		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
+	vpxor		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
+	vpxor		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm10,%xmm10
+	vmovdqa		%xmm10,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone4
+
+ENDPROC(chacha_4block_xor_avx2)
+
+ENTRY(chacha_8block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix in AVX registers eight times. As we need some
+	# scratch registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For final XORing step
+	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
+	# words, which allows us to do XOR in AVX registers. 8/16-bit word
+	# rotation is done with the slightly better performing byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	vzeroupper
+	# 4 * 32 byte stack, 32-byte aligned
+	lea		8(%rsp),%r10
+	and		$~31, %rsp
+	sub		$0x80, %rsp
+	mov		%rcx,%rax
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+	# x0..3 on stack
+	vmovdqa		%ymm0,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm3,0x60(%rsp)
+
+	vmovdqa		CTRINC(%rip),%ymm1
+	vmovdqa		ROT8(%rip),%ymm2
+	vmovdqa		ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-3
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround8
+
+	# x0..15[0-3] += s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpaddd		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpbroadcastd	0x04(%rdi),%ymm0
+	vpaddd		0x20(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpbroadcastd	0x08(%rdi),%ymm0
+	vpaddd		0x40(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpbroadcastd	0x0c(%rdi),%ymm0
+	vpaddd		0x60(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpbroadcastd	0x10(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm4,%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm5,%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm6,%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm7,%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm8,%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm9,%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm10,%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm11,%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm12,%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm13,%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm14,%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-3
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+	# interleave 32-bit words in state n, n+1
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x20(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		0x40(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm1,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpckldq	%ymm5,%ymm0,%ymm4
+	vpunpckhdq	%ymm5,%ymm0,%ymm5
+	vmovdqa		%ymm6,%ymm0
+	vpunpckldq	%ymm7,%ymm0,%ymm6
+	vpunpckhdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpckldq	%ymm9,%ymm0,%ymm8
+	vpunpckhdq	%ymm9,%ymm0,%ymm9
+	vmovdqa		%ymm10,%ymm0
+	vpunpckldq	%ymm11,%ymm0,%ymm10
+	vpunpckhdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpckldq	%ymm13,%ymm0,%ymm12
+	vpunpckhdq	%ymm13,%ymm0,%ymm13
+	vmovdqa		%ymm14,%ymm0
+	vpunpckldq	%ymm15,%ymm0,%ymm14
+	vpunpckhdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 64-bit words in state n, n+2
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x40(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x00(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		0x20(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpcklqdq	%ymm6,%ymm0,%ymm4
+	vpunpckhqdq	%ymm6,%ymm0,%ymm6
+	vmovdqa		%ymm5,%ymm0
+	vpunpcklqdq	%ymm7,%ymm0,%ymm5
+	vpunpckhqdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpcklqdq	%ymm10,%ymm0,%ymm8
+	vpunpckhqdq	%ymm10,%ymm0,%ymm10
+	vmovdqa		%ymm9,%ymm0
+	vpunpcklqdq	%ymm11,%ymm0,%ymm9
+	vpunpckhqdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpcklqdq	%ymm14,%ymm0,%ymm12
+	vpunpckhqdq	%ymm14,%ymm0,%ymm14
+	vmovdqa		%ymm13,%ymm0
+	vpunpcklqdq	%ymm15,%ymm0,%ymm13
+	vpunpckhqdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa		0x00(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
+	cmp		$0x0020,%rax
+	jl		.Lxorpart8
+	vpxor		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0000(%rsi)
+	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rax
+	jl		.Lxorpart8
+	vpxor		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vmovdqa		0x40(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
+	cmp		$0x0060,%rax
+	jl		.Lxorpart8
+	vpxor		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rax
+	jl		.Lxorpart8
+	vpxor		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vmovdqa		0x20(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vmovdqa		0x60(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
+	cmp		$0x00e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rax
+	jl		.Lxorpart8
+	vpxor		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa		%ymm4,%ymm0
+	cmp		$0x0120,%rax
+	jl		.Lxorpart8
+	vpxor		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0100(%rsi)
+
+	vmovdqa		%ymm12,%ymm0
+	cmp		$0x0140,%rax
+	jl		.Lxorpart8
+	vpxor		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0120(%rsi)
+
+	vmovdqa		%ymm6,%ymm0
+	cmp		$0x0160,%rax
+	jl		.Lxorpart8
+	vpxor		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0140(%rsi)
+
+	vmovdqa		%ymm14,%ymm0
+	cmp		$0x0180,%rax
+	jl		.Lxorpart8
+	vpxor		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0160(%rsi)
+
+	vmovdqa		%ymm5,%ymm0
+	cmp		$0x01a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0180(%rsi)
+
+	vmovdqa		%ymm13,%ymm0
+	cmp		$0x01c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01a0(%rsi)
+
+	vmovdqa		%ymm7,%ymm0
+	cmp		$0x01e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01c0(%rsi)
+
+	vmovdqa		%ymm15,%ymm0
+	cmp		$0x0200,%rax
+	jl		.Lxorpart8
+	vpxor		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	lea		-8(%r10),%rsp
+	ret
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x1f,%r9
+	jz		.Ldone8
+	and		$~0x1f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone8
+
+ENDPROC(chacha_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
new file mode 100644
index 000000000000..848f9c75fd4f
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
+.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha_2block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts two ChaCha blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-2] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rcx
+	jl		.Lxorpart2
+	vpxord		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rcx
+	jl		.Lxorpart2
+	vpxord		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rcx
+	jl		.Lxorpart2
+	vpxord		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rcx
+	jl		.Lxorpart2
+	vpxord		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rcx
+	jl		.Lxorpart2
+	vpxord		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rcx
+	jl		.Lxorpart2
+	vpxord		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rcx
+	jl		.Lxorpart2
+	vpxord		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rcx
+	jl		.Lxorpart2
+	vpxord		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	ret
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm7,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone2
+
+ENDPROC(chacha_2block_xor_avx512vl)
+
+ENTRY(chacha_4block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, sequentially
+	# to the operations on the four words of the other two matrices. The
+	# required word shuffling has a rather high latency, we can do the
+	# arithmetic on two matrix-pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-4] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rcx
+	jl		.Lxorpart4
+	vpxord		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rcx
+	jl		.Lxorpart4
+	vpxord		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rcx
+	jl		.Lxorpart4
+	vpxord		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rcx
+	jl		.Lxorpart4
+	vpxord		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rcx
+	jl		.Lxorpart4
+	vpxord		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rcx
+	jl		.Lxorpart4
+	vpxord		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rcx
+	jl		.Lxorpart4
+	vpxord		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rcx
+	jl		.Lxorpart4
+	vpxord		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rcx
+	jl		.Lxorpart4
+	vpxord		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rcx
+	jl		.Lxorpart4
+	vpxord		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rcx
+	jl		.Lxorpart4
+	vpxord		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm10,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone4
+
+ENDPROC(chacha_4block_xor_avx512vl)
+
+ENTRY(chacha_8block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix in AVX registers eight times. Compared to AVX2, this
+	# mostly benefits from the new rotate instructions in VL and the
+	# additional registers.
+
+	vzeroupper
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+
+	# x12 += counter values 0-3
+	vpaddd		CTR8BL(%rip),%ymm12,%ymm12
+
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+	vmovdqa64	%ymm4,%ymm20
+	vmovdqa64	%ymm5,%ymm21
+	vmovdqa64	%ymm6,%ymm22
+	vmovdqa64	%ymm7,%ymm23
+	vmovdqa64	%ymm8,%ymm24
+	vmovdqa64	%ymm9,%ymm25
+	vmovdqa64	%ymm10,%ymm26
+	vmovdqa64	%ymm11,%ymm27
+	vmovdqa64	%ymm12,%ymm28
+	vmovdqa64	%ymm13,%ymm29
+	vmovdqa64	%ymm14,%ymm30
+	vmovdqa64	%ymm15,%ymm31
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround8
+
+	# x0..15[0-3] += s[0..15]
+	vpaddd		%ymm16,%ymm0,%ymm0
+	vpaddd		%ymm17,%ymm1,%ymm1
+	vpaddd		%ymm18,%ymm2,%ymm2
+	vpaddd		%ymm19,%ymm3,%ymm3
+	vpaddd		%ymm20,%ymm4,%ymm4
+	vpaddd		%ymm21,%ymm5,%ymm5
+	vpaddd		%ymm22,%ymm6,%ymm6
+	vpaddd		%ymm23,%ymm7,%ymm7
+	vpaddd		%ymm24,%ymm8,%ymm8
+	vpaddd		%ymm25,%ymm9,%ymm9
+	vpaddd		%ymm26,%ymm10,%ymm10
+	vpaddd		%ymm27,%ymm11,%ymm11
+	vpaddd		%ymm28,%ymm12,%ymm12
+	vpaddd		%ymm29,%ymm13,%ymm13
+	vpaddd		%ymm30,%ymm14,%ymm14
+	vpaddd		%ymm31,%ymm15,%ymm15
+
+	# interleave 32-bit words in state n, n+1
+	vpunpckldq	%ymm1,%ymm0,%ymm16
+	vpunpckhdq	%ymm1,%ymm0,%ymm17
+	vpunpckldq	%ymm3,%ymm2,%ymm18
+	vpunpckhdq	%ymm3,%ymm2,%ymm19
+	vpunpckldq	%ymm5,%ymm4,%ymm20
+	vpunpckhdq	%ymm5,%ymm4,%ymm21
+	vpunpckldq	%ymm7,%ymm6,%ymm22
+	vpunpckhdq	%ymm7,%ymm6,%ymm23
+	vpunpckldq	%ymm9,%ymm8,%ymm24
+	vpunpckhdq	%ymm9,%ymm8,%ymm25
+	vpunpckldq	%ymm11,%ymm10,%ymm26
+	vpunpckhdq	%ymm11,%ymm10,%ymm27
+	vpunpckldq	%ymm13,%ymm12,%ymm28
+	vpunpckhdq	%ymm13,%ymm12,%ymm29
+	vpunpckldq	%ymm15,%ymm14,%ymm30
+	vpunpckhdq	%ymm15,%ymm14,%ymm31
+
+	# interleave 64-bit words in state n, n+2
+	vpunpcklqdq	%ymm18,%ymm16,%ymm0
+	vpunpcklqdq	%ymm19,%ymm17,%ymm1
+	vpunpckhqdq	%ymm18,%ymm16,%ymm2
+	vpunpckhqdq	%ymm19,%ymm17,%ymm3
+	vpunpcklqdq	%ymm22,%ymm20,%ymm4
+	vpunpcklqdq	%ymm23,%ymm21,%ymm5
+	vpunpckhqdq	%ymm22,%ymm20,%ymm6
+	vpunpckhqdq	%ymm23,%ymm21,%ymm7
+	vpunpcklqdq	%ymm26,%ymm24,%ymm8
+	vpunpcklqdq	%ymm27,%ymm25,%ymm9
+	vpunpckhqdq	%ymm26,%ymm24,%ymm10
+	vpunpckhqdq	%ymm27,%ymm25,%ymm11
+	vpunpcklqdq	%ymm30,%ymm28,%ymm12
+	vpunpcklqdq	%ymm31,%ymm29,%ymm13
+	vpunpckhqdq	%ymm30,%ymm28,%ymm14
+	vpunpckhqdq	%ymm31,%ymm29,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa64	%ymm0,%ymm16
+	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
+	cmp		$0x0020,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0000(%rsi)
+	vmovdqa64	%ymm16,%ymm0
+	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
+	cmp		$0x0060,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
+	cmp		$0x00e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa64	%ymm4,%ymm0
+	cmp		$0x0120,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0100(%rsi)
+
+	vmovdqa64	%ymm12,%ymm0
+	cmp		$0x0140,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0120(%rsi)
+
+	vmovdqa64	%ymm6,%ymm0
+	cmp		$0x0160,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0140(%rsi)
+
+	vmovdqa64	%ymm14,%ymm0
+	cmp		$0x0180,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0160(%rsi)
+
+	vmovdqa64	%ymm5,%ymm0
+	cmp		$0x01a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0180(%rsi)
+
+	vmovdqa64	%ymm13,%ymm0
+	cmp		$0x01c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01a0(%rsi)
+
+	vmovdqa64	%ymm7,%ymm0
+	cmp		$0x01e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01c0(%rsi)
+
+	vmovdqa64	%ymm15,%ymm0
+	cmp		$0x0200,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	ret
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0x1f,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0x1f,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
+	vpxord		%ymm0,%ymm1,%ymm1
+	vmovdqu8	%ymm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone8
+
+ENDPROC(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
new file mode 100644
index 000000000000..c05a7a963dc3
--- /dev/null
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -0,0 +1,795 @@
+/*
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section	.rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+.section	.rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC:	.octa 0x00000003000000020000000100000000
+
+.text
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round.  8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+chacha_permute:
+
+	movdqa		ROT8(%rip),%xmm4
+	movdqa		ROT16(%rip),%xmm5
+
+.Ldoubleround:
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm5,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm6
+	pslld		$12,%xmm6
+	psrld		$20,%xmm1
+	por		%xmm6,%xmm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm4,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm7
+	pslld		$7,%xmm7
+	psrld		$25,%xmm1
+	por		%xmm7,%xmm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	pshufd		$0x39,%xmm1,%xmm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	pshufd		$0x4e,%xmm2,%xmm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	pshufd		$0x93,%xmm3,%xmm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm5,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm6
+	pslld		$12,%xmm6
+	psrld		$20,%xmm1
+	por		%xmm6,%xmm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm4,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm7
+	pslld		$7,%xmm7
+	psrld		$25,%xmm1
+	por		%xmm7,%xmm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	pshufd		$0x93,%xmm1,%xmm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	pshufd		$0x4e,%xmm2,%xmm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	pshufd		$0x39,%xmm3,%xmm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 1 data block output, o
+	# %rdx: up to 1 data block input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+	FRAME_BEGIN
+
+	# x0..3 = s0..3
+	movdqa		0x00(%rdi),%xmm0
+	movdqa		0x10(%rdi),%xmm1
+	movdqa		0x20(%rdi),%xmm2
+	movdqa		0x30(%rdi),%xmm3
+	movdqa		%xmm0,%xmm8
+	movdqa		%xmm1,%xmm9
+	movdqa		%xmm2,%xmm10
+	movdqa		%xmm3,%xmm11
+
+	mov		%rcx,%rax
+	call		chacha_permute
+
+	# o0 = i0 ^ (x0 + s0)
+	paddd		%xmm8,%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart
+	movdqu		0x00(%rdx),%xmm4
+	pxor		%xmm4,%xmm0
+	movdqu		%xmm0,0x00(%rsi)
+	# o1 = i1 ^ (x1 + s1)
+	paddd		%xmm9,%xmm1
+	movdqa		%xmm1,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart
+	movdqu		0x10(%rdx),%xmm0
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x10(%rsi)
+	# o2 = i2 ^ (x2 + s2)
+	paddd		%xmm10,%xmm2
+	movdqa		%xmm2,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart
+	movdqu		0x20(%rdx),%xmm0
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
+	# o3 = i3 ^ (x3 + s3)
+	paddd		%xmm11,%xmm3
+	movdqa		%xmm3,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart
+	movdqu		0x30(%rdx),%xmm0
+	pxor		%xmm3,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
+.Ldone:
+	FRAME_END
+	ret
+
+.Lxorpart:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone
+
+ENDPROC(chacha_block_xor_ssse3)
+
+ENTRY(hchacha_block_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: output (8 32-bit words)
+	# %edx: nrounds
+	FRAME_BEGIN
+
+	movdqa		0x00(%rdi),%xmm0
+	movdqa		0x10(%rdi),%xmm1
+	movdqa		0x20(%rdi),%xmm2
+	movdqa		0x30(%rdi),%xmm3
+
+	mov		%edx,%r8d
+	call		chacha_permute
+
+	movdqu		%xmm0,0x00(%rsi)
+	movdqu		%xmm3,0x10(%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(hchacha_block_ssse3)
+
+ENTRY(chacha_4block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four consecutive ChaCha blocks by loading the
+	# the state matrix in SSE registers four times. As we need some scratch
+	# registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For final XORing step
+	# we transpose the matrix by interleaving 32- and then 64-bit words,
+	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+	# done with the slightly better performing SSSE3 byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	lea		8(%rsp),%r10
+	sub		$0x80,%rsp
+	and		$~63,%rsp
+	mov		%rcx,%rax
+
+	# x0..15[0-3] = s0..3[0..3]
+	movq		0x00(%rdi),%xmm1
+	pshufd		$0x00,%xmm1,%xmm0
+	pshufd		$0x55,%xmm1,%xmm1
+	movq		0x08(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	movq		0x10(%rdi),%xmm5
+	pshufd		$0x00,%xmm5,%xmm4
+	pshufd		$0x55,%xmm5,%xmm5
+	movq		0x18(%rdi),%xmm7
+	pshufd		$0x00,%xmm7,%xmm6
+	pshufd		$0x55,%xmm7,%xmm7
+	movq		0x20(%rdi),%xmm9
+	pshufd		$0x00,%xmm9,%xmm8
+	pshufd		$0x55,%xmm9,%xmm9
+	movq		0x28(%rdi),%xmm11
+	pshufd		$0x00,%xmm11,%xmm10
+	pshufd		$0x55,%xmm11,%xmm11
+	movq		0x30(%rdi),%xmm13
+	pshufd		$0x00,%xmm13,%xmm12
+	pshufd		$0x55,%xmm13,%xmm13
+	movq		0x38(%rdi),%xmm15
+	pshufd		$0x00,%xmm15,%xmm14
+	pshufd		$0x55,%xmm15,%xmm15
+	# x0..3 on stack
+	movdqa		%xmm0,0x00(%rsp)
+	movdqa		%xmm1,0x10(%rsp)
+	movdqa		%xmm2,0x20(%rsp)
+	movdqa		%xmm3,0x30(%rsp)
+
+	movdqa		CTRINC(%rip),%xmm1
+	movdqa		ROT8(%rip),%xmm2
+	movdqa		ROT16(%rip),%xmm3
+
+	# x12 += counter values 0-3
+	paddd		%xmm1,%xmm12
+
+.Ldoubleround4:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm3,%xmm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm3,%xmm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm3,%xmm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm3,%xmm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	paddd		%xmm12,%xmm8
+	pxor		%xmm8,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm4
+	por		%xmm0,%xmm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	paddd		%xmm13,%xmm9
+	pxor		%xmm9,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm5
+	por		%xmm0,%xmm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	paddd		%xmm14,%xmm10
+	pxor		%xmm10,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm6
+	por		%xmm0,%xmm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	paddd		%xmm15,%xmm11
+	pxor		%xmm11,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm7
+	por		%xmm0,%xmm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm2,%xmm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm2,%xmm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm2,%xmm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm2,%xmm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	paddd		%xmm12,%xmm8
+	pxor		%xmm8,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm4
+	por		%xmm0,%xmm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	paddd		%xmm13,%xmm9
+	pxor		%xmm9,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm5
+	por		%xmm0,%xmm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	paddd		%xmm14,%xmm10
+	pxor		%xmm10,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm6
+	por		%xmm0,%xmm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	paddd		%xmm15,%xmm11
+	pxor		%xmm11,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm7
+	por		%xmm0,%xmm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm3,%xmm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm3,%xmm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm3,%xmm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm3,%xmm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	paddd		%xmm15,%xmm10
+	pxor		%xmm10,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm5
+	por		%xmm0,%xmm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	paddd		%xmm12,%xmm11
+	pxor		%xmm11,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm6
+	por		%xmm0,%xmm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	paddd		%xmm13,%xmm8
+	pxor		%xmm8,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm7
+	por		%xmm0,%xmm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	paddd		%xmm14,%xmm9
+	pxor		%xmm9,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm4
+	por		%xmm0,%xmm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm2,%xmm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm2,%xmm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm2,%xmm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm2,%xmm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	paddd		%xmm15,%xmm10
+	pxor		%xmm10,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm5
+	por		%xmm0,%xmm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	paddd		%xmm12,%xmm11
+	pxor		%xmm11,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm6
+	por		%xmm0,%xmm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	paddd		%xmm13,%xmm8
+	pxor		%xmm8,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm7
+	por		%xmm0,%xmm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	paddd		%xmm14,%xmm9
+	pxor		%xmm9,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm4
+	por		%xmm0,%xmm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# x0[0-3] += s0[0]
+	# x1[0-3] += s0[1]
+	movq		0x00(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		0x00(%rsp),%xmm2
+	movdqa		%xmm2,0x00(%rsp)
+	paddd		0x10(%rsp),%xmm3
+	movdqa		%xmm3,0x10(%rsp)
+	# x2[0-3] += s0[2]
+	# x3[0-3] += s0[3]
+	movq		0x08(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		0x20(%rsp),%xmm2
+	movdqa		%xmm2,0x20(%rsp)
+	paddd		0x30(%rsp),%xmm3
+	movdqa		%xmm3,0x30(%rsp)
+
+	# x4[0-3] += s1[0]
+	# x5[0-3] += s1[1]
+	movq		0x10(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm4
+	paddd		%xmm3,%xmm5
+	# x6[0-3] += s1[2]
+	# x7[0-3] += s1[3]
+	movq		0x18(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm6
+	paddd		%xmm3,%xmm7
+
+	# x8[0-3] += s2[0]
+	# x9[0-3] += s2[1]
+	movq		0x20(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm8
+	paddd		%xmm3,%xmm9
+	# x10[0-3] += s2[2]
+	# x11[0-3] += s2[3]
+	movq		0x28(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm10
+	paddd		%xmm3,%xmm11
+
+	# x12[0-3] += s3[0]
+	# x13[0-3] += s3[1]
+	movq		0x30(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm12
+	paddd		%xmm3,%xmm13
+	# x14[0-3] += s3[2]
+	# x15[0-3] += s3[3]
+	movq		0x38(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm14
+	paddd		%xmm3,%xmm15
+
+	# x12 += counter values 0-3
+	paddd		%xmm1,%xmm12
+
+	# interleave 32-bit words in state n, n+1
+	movdqa		0x00(%rsp),%xmm0
+	movdqa		0x10(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpckldq	%xmm1,%xmm2
+	punpckhdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x00(%rsp)
+	movdqa		%xmm0,0x10(%rsp)
+	movdqa		0x20(%rsp),%xmm0
+	movdqa		0x30(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpckldq	%xmm1,%xmm2
+	punpckhdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x20(%rsp)
+	movdqa		%xmm0,0x30(%rsp)
+	movdqa		%xmm4,%xmm0
+	punpckldq	%xmm5,%xmm4
+	punpckhdq	%xmm5,%xmm0
+	movdqa		%xmm0,%xmm5
+	movdqa		%xmm6,%xmm0
+	punpckldq	%xmm7,%xmm6
+	punpckhdq	%xmm7,%xmm0
+	movdqa		%xmm0,%xmm7
+	movdqa		%xmm8,%xmm0
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm9,%xmm0
+	movdqa		%xmm0,%xmm9
+	movdqa		%xmm10,%xmm0
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm11,%xmm0
+	movdqa		%xmm0,%xmm11
+	movdqa		%xmm12,%xmm0
+	punpckldq	%xmm13,%xmm12
+	punpckhdq	%xmm13,%xmm0
+	movdqa		%xmm0,%xmm13
+	movdqa		%xmm14,%xmm0
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm15,%xmm0
+	movdqa		%xmm0,%xmm15
+
+	# interleave 64-bit words in state n, n+2
+	movdqa		0x00(%rsp),%xmm0
+	movdqa		0x20(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpcklqdq	%xmm1,%xmm2
+	punpckhqdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x00(%rsp)
+	movdqa		%xmm0,0x20(%rsp)
+	movdqa		0x10(%rsp),%xmm0
+	movdqa		0x30(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpcklqdq	%xmm1,%xmm2
+	punpckhqdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x10(%rsp)
+	movdqa		%xmm0,0x30(%rsp)
+	movdqa		%xmm4,%xmm0
+	punpcklqdq	%xmm6,%xmm4
+	punpckhqdq	%xmm6,%xmm0
+	movdqa		%xmm0,%xmm6
+	movdqa		%xmm5,%xmm0
+	punpcklqdq	%xmm7,%xmm5
+	punpckhqdq	%xmm7,%xmm0
+	movdqa		%xmm0,%xmm7
+	movdqa		%xmm8,%xmm0
+	punpcklqdq	%xmm10,%xmm8
+	punpckhqdq	%xmm10,%xmm0
+	movdqa		%xmm0,%xmm10
+	movdqa		%xmm9,%xmm0
+	punpcklqdq	%xmm11,%xmm9
+	punpckhqdq	%xmm11,%xmm0
+	movdqa		%xmm0,%xmm11
+	movdqa		%xmm12,%xmm0
+	punpcklqdq	%xmm14,%xmm12
+	punpckhqdq	%xmm14,%xmm0
+	movdqa		%xmm0,%xmm14
+	movdqa		%xmm13,%xmm0
+	punpcklqdq	%xmm15,%xmm13
+	punpckhqdq	%xmm15,%xmm0
+	movdqa		%xmm0,%xmm15
+
+	# xor with corresponding input, write to output
+	movdqa		0x00(%rsp),%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
+	movdqu		0x00(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x00(%rsi)
+
+	movdqu		%xmm4,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	movdqu		0x10(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x10(%rsi)
+
+	movdqu		%xmm8,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	movdqu		0x20(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
+
+	movdqu		%xmm12,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	movdqu		0x30(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
+	movdqa		0x20(%rsp),%xmm0
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
+	movdqu		0x40(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x40(%rsi)
+
+	movdqu		%xmm6,%xmm0
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	movdqu		0x50(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x50(%rsi)
+
+	movdqu		%xmm10,%xmm0
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	movdqu		0x60(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x60(%rsi)
+
+	movdqu		%xmm14,%xmm0
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	movdqu		0x70(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x70(%rsi)
+
+	movdqa		0x10(%rsp),%xmm0
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	movdqu		0x80(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x80(%rsi)
+
+	movdqu		%xmm5,%xmm0
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	movdqu		0x90(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x90(%rsi)
+
+	movdqu		%xmm9,%xmm0
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	movdqu		0xa0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xa0(%rsi)
+
+	movdqu		%xmm13,%xmm0
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	movdqu		0xb0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xb0(%rsi)
+
+	movdqa		0x30(%rsp),%xmm0
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
+	movdqu		0xc0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xc0(%rsi)
+
+	movdqu		%xmm7,%xmm0
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
+	movdqu		0xd0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xd0(%rsi)
+
+	movdqu		%xmm11,%xmm0
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
+	movdqu		0xe0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xe0(%rsi)
+
+	movdqu		%xmm15,%xmm0
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
+	movdqu		0xf0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xf0(%rsi)
+
+.Ldone4:
+	lea		-8(%r10),%rsp
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone4
+
+ENDPROC(chacha_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
deleted file mode 100644
index b6ab082be657..000000000000
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ /dev/null
@@ -1,1026 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
-.section	.rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
-	.octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section	.rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
-	.octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC:	.octa 0x00000003000000020000000100000000
-	.octa 0x00000007000000060000000500000004
-
-.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL:	.octa 0x00000000000000000000000000000000
-	.octa 0x00000000000000000000000000000001
-
-.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL:	.octa 0x00000000000000000000000000000002
-	.octa 0x00000000000000000000000000000003
-
-.text
-
-ENTRY(chacha20_2block_xor_avx2)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 2 data blocks output, o
-	# %rdx: up to 2 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts two ChaCha20 blocks by loading the state
-	# matrix twice across four AVX registers. It performs matrix operations
-	# on four words in each matrix in parallel, but requires shuffling to
-	# rearrange the words after each round.
-
-	vzeroupper
-
-	# x0..3[0-2] = s0..3
-	vbroadcasti128	0x00(%rdi),%ymm0
-	vbroadcasti128	0x10(%rdi),%ymm1
-	vbroadcasti128	0x20(%rdi),%ymm2
-	vbroadcasti128	0x30(%rdi),%ymm3
-
-	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
-
-	vmovdqa		%ymm0,%ymm8
-	vmovdqa		%ymm1,%ymm9
-	vmovdqa		%ymm2,%ymm10
-	vmovdqa		%ymm3,%ymm11
-
-	vmovdqa		ROT8(%rip),%ymm4
-	vmovdqa		ROT16(%rip),%ymm5
-
-	mov		%rcx,%rax
-	mov		$10,%ecx
-
-.Ldoubleround:
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm5,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm6
-	vpslld		$12,%ymm6,%ymm6
-	vpsrld		$20,%ymm1,%ymm1
-	vpor		%ymm6,%ymm1,%ymm1
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm4,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm7
-	vpslld		$7,%ymm7,%ymm7
-	vpsrld		$25,%ymm1,%ymm1
-	vpor		%ymm7,%ymm1,%ymm1
-
-	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm1,%ymm1
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm3,%ymm3
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm5,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm6
-	vpslld		$12,%ymm6,%ymm6
-	vpsrld		$20,%ymm1,%ymm1
-	vpor		%ymm6,%ymm1,%ymm1
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm4,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm7
-	vpslld		$7,%ymm7,%ymm7
-	vpsrld		$25,%ymm1,%ymm1
-	vpor		%ymm7,%ymm1,%ymm1
-
-	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm1,%ymm1
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm3,%ymm3
-
-	dec		%ecx
-	jnz		.Ldoubleround
-
-	# o0 = i0 ^ (x0 + s0)
-	vpaddd		%ymm8,%ymm0,%ymm7
-	cmp		$0x10,%rax
-	jl		.Lxorpart2
-	vpxor		0x00(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x00(%rsi)
-	vextracti128	$1,%ymm7,%xmm0
-	# o1 = i1 ^ (x1 + s1)
-	vpaddd		%ymm9,%ymm1,%ymm7
-	cmp		$0x20,%rax
-	jl		.Lxorpart2
-	vpxor		0x10(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x10(%rsi)
-	vextracti128	$1,%ymm7,%xmm1
-	# o2 = i2 ^ (x2 + s2)
-	vpaddd		%ymm10,%ymm2,%ymm7
-	cmp		$0x30,%rax
-	jl		.Lxorpart2
-	vpxor		0x20(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x20(%rsi)
-	vextracti128	$1,%ymm7,%xmm2
-	# o3 = i3 ^ (x3 + s3)
-	vpaddd		%ymm11,%ymm3,%ymm7
-	cmp		$0x40,%rax
-	jl		.Lxorpart2
-	vpxor		0x30(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x30(%rsi)
-	vextracti128	$1,%ymm7,%xmm3
-
-	# xor and write second block
-	vmovdqa		%xmm0,%xmm7
-	cmp		$0x50,%rax
-	jl		.Lxorpart2
-	vpxor		0x40(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x40(%rsi)
-
-	vmovdqa		%xmm1,%xmm7
-	cmp		$0x60,%rax
-	jl		.Lxorpart2
-	vpxor		0x50(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x50(%rsi)
-
-	vmovdqa		%xmm2,%xmm7
-	cmp		$0x70,%rax
-	jl		.Lxorpart2
-	vpxor		0x60(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x60(%rsi)
-
-	vmovdqa		%xmm3,%xmm7
-	cmp		$0x80,%rax
-	jl		.Lxorpart2
-	vpxor		0x70(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x70(%rsi)
-
-.Ldone2:
-	vzeroupper
-	ret
-
-.Lxorpart2:
-	# xor remaining bytes from partial register into output
-	mov		%rax,%r9
-	and		$0x0f,%r9
-	jz		.Ldone2
-	and		$~0x0f,%rax
-
-	mov		%rsi,%r11
-
-	lea		8(%rsp),%r10
-	sub		$0x10,%rsp
-	and		$~31,%rsp
-
-	lea		(%rdx,%rax),%rsi
-	mov		%rsp,%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	vpxor		0x00(%rsp),%xmm7,%xmm7
-	vmovdqa		%xmm7,0x00(%rsp)
-
-	mov		%rsp,%rsi
-	lea		(%r11,%rax),%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	lea		-8(%r10),%rsp
-	jmp		.Ldone2
-
-ENDPROC(chacha20_2block_xor_avx2)
-
-ENTRY(chacha20_4block_xor_avx2)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 4 data blocks output, o
-	# %rdx: up to 4 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts four ChaCha20 block by loading the state
-	# matrix four times across eight AVX registers. It performs matrix
-	# operations on four words in two matrices in parallel, sequentially
-	# to the operations on the four words of the other two matrices. The
-	# required word shuffling has a rather high latency, we can do the
-	# arithmetic on two matrix-pairs without much slowdown.
-
-	vzeroupper
-
-	# x0..3[0-4] = s0..3
-	vbroadcasti128	0x00(%rdi),%ymm0
-	vbroadcasti128	0x10(%rdi),%ymm1
-	vbroadcasti128	0x20(%rdi),%ymm2
-	vbroadcasti128	0x30(%rdi),%ymm3
-
-	vmovdqa		%ymm0,%ymm4
-	vmovdqa		%ymm1,%ymm5
-	vmovdqa		%ymm2,%ymm6
-	vmovdqa		%ymm3,%ymm7
-
-	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
-	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
-
-	vmovdqa		%ymm0,%ymm11
-	vmovdqa		%ymm1,%ymm12
-	vmovdqa		%ymm2,%ymm13
-	vmovdqa		%ymm3,%ymm14
-	vmovdqa		%ymm7,%ymm15
-
-	vmovdqa		ROT8(%rip),%ymm8
-	vmovdqa		ROT16(%rip),%ymm9
-
-	mov		%rcx,%rax
-	mov		$10,%ecx
-
-.Ldoubleround4:
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm9,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxor		%ymm4,%ymm7,%ymm7
-	vpshufb		%ymm9,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm10
-	vpslld		$12,%ymm10,%ymm10
-	vpsrld		$20,%ymm1,%ymm1
-	vpor		%ymm10,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxor		%ymm6,%ymm5,%ymm5
-	vmovdqa		%ymm5,%ymm10
-	vpslld		$12,%ymm10,%ymm10
-	vpsrld		$20,%ymm5,%ymm5
-	vpor		%ymm10,%ymm5,%ymm5
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm8,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxor		%ymm4,%ymm7,%ymm7
-	vpshufb		%ymm8,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm10
-	vpslld		$7,%ymm10,%ymm10
-	vpsrld		$25,%ymm1,%ymm1
-	vpor		%ymm10,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxor		%ymm6,%ymm5,%ymm5
-	vmovdqa		%ymm5,%ymm10
-	vpslld		$7,%ymm10,%ymm10
-	vpsrld		$25,%ymm5,%ymm5
-	vpor		%ymm10,%ymm5,%ymm5
-
-	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm1,%ymm1
-	vpshufd		$0x39,%ymm5,%ymm5
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	vpshufd		$0x4e,%ymm6,%ymm6
-	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm3,%ymm3
-	vpshufd		$0x93,%ymm7,%ymm7
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm9,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxor		%ymm4,%ymm7,%ymm7
-	vpshufb		%ymm9,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm10
-	vpslld		$12,%ymm10,%ymm10
-	vpsrld		$20,%ymm1,%ymm1
-	vpor		%ymm10,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxor		%ymm6,%ymm5,%ymm5
-	vmovdqa		%ymm5,%ymm10
-	vpslld		$12,%ymm10,%ymm10
-	vpsrld		$20,%ymm5,%ymm5
-	vpor		%ymm10,%ymm5,%ymm5
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxor		%ymm0,%ymm3,%ymm3
-	vpshufb		%ymm8,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxor		%ymm4,%ymm7,%ymm7
-	vpshufb		%ymm8,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxor		%ymm2,%ymm1,%ymm1
-	vmovdqa		%ymm1,%ymm10
-	vpslld		$7,%ymm10,%ymm10
-	vpsrld		$25,%ymm1,%ymm1
-	vpor		%ymm10,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxor		%ymm6,%ymm5,%ymm5
-	vmovdqa		%ymm5,%ymm10
-	vpslld		$7,%ymm10,%ymm10
-	vpsrld		$25,%ymm5,%ymm5
-	vpor		%ymm10,%ymm5,%ymm5
-
-	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm1,%ymm1
-	vpshufd		$0x93,%ymm5,%ymm5
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	vpshufd		$0x4e,%ymm6,%ymm6
-	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm3,%ymm3
-	vpshufd		$0x39,%ymm7,%ymm7
-
-	dec		%ecx
-	jnz		.Ldoubleround4
-
-	# o0 = i0 ^ (x0 + s0), first block
-	vpaddd		%ymm11,%ymm0,%ymm10
-	cmp		$0x10,%rax
-	jl		.Lxorpart4
-	vpxor		0x00(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x00(%rsi)
-	vextracti128	$1,%ymm10,%xmm0
-	# o1 = i1 ^ (x1 + s1), first block
-	vpaddd		%ymm12,%ymm1,%ymm10
-	cmp		$0x20,%rax
-	jl		.Lxorpart4
-	vpxor		0x10(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x10(%rsi)
-	vextracti128	$1,%ymm10,%xmm1
-	# o2 = i2 ^ (x2 + s2), first block
-	vpaddd		%ymm13,%ymm2,%ymm10
-	cmp		$0x30,%rax
-	jl		.Lxorpart4
-	vpxor		0x20(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x20(%rsi)
-	vextracti128	$1,%ymm10,%xmm2
-	# o3 = i3 ^ (x3 + s3), first block
-	vpaddd		%ymm14,%ymm3,%ymm10
-	cmp		$0x40,%rax
-	jl		.Lxorpart4
-	vpxor		0x30(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x30(%rsi)
-	vextracti128	$1,%ymm10,%xmm3
-
-	# xor and write second block
-	vmovdqa		%xmm0,%xmm10
-	cmp		$0x50,%rax
-	jl		.Lxorpart4
-	vpxor		0x40(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x40(%rsi)
-
-	vmovdqa		%xmm1,%xmm10
-	cmp		$0x60,%rax
-	jl		.Lxorpart4
-	vpxor		0x50(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x50(%rsi)
-
-	vmovdqa		%xmm2,%xmm10
-	cmp		$0x70,%rax
-	jl		.Lxorpart4
-	vpxor		0x60(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x60(%rsi)
-
-	vmovdqa		%xmm3,%xmm10
-	cmp		$0x80,%rax
-	jl		.Lxorpart4
-	vpxor		0x70(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x70(%rsi)
-
-	# o0 = i0 ^ (x0 + s0), third block
-	vpaddd		%ymm11,%ymm4,%ymm10
-	cmp		$0x90,%rax
-	jl		.Lxorpart4
-	vpxor		0x80(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x80(%rsi)
-	vextracti128	$1,%ymm10,%xmm4
-	# o1 = i1 ^ (x1 + s1), third block
-	vpaddd		%ymm12,%ymm5,%ymm10
-	cmp		$0xa0,%rax
-	jl		.Lxorpart4
-	vpxor		0x90(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x90(%rsi)
-	vextracti128	$1,%ymm10,%xmm5
-	# o2 = i2 ^ (x2 + s2), third block
-	vpaddd		%ymm13,%ymm6,%ymm10
-	cmp		$0xb0,%rax
-	jl		.Lxorpart4
-	vpxor		0xa0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xa0(%rsi)
-	vextracti128	$1,%ymm10,%xmm6
-	# o3 = i3 ^ (x3 + s3), third block
-	vpaddd		%ymm15,%ymm7,%ymm10
-	cmp		$0xc0,%rax
-	jl		.Lxorpart4
-	vpxor		0xb0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xb0(%rsi)
-	vextracti128	$1,%ymm10,%xmm7
-
-	# xor and write fourth block
-	vmovdqa		%xmm4,%xmm10
-	cmp		$0xd0,%rax
-	jl		.Lxorpart4
-	vpxor		0xc0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xc0(%rsi)
-
-	vmovdqa		%xmm5,%xmm10
-	cmp		$0xe0,%rax
-	jl		.Lxorpart4
-	vpxor		0xd0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xd0(%rsi)
-
-	vmovdqa		%xmm6,%xmm10
-	cmp		$0xf0,%rax
-	jl		.Lxorpart4
-	vpxor		0xe0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xe0(%rsi)
-
-	vmovdqa		%xmm7,%xmm10
-	cmp		$0x100,%rax
-	jl		.Lxorpart4
-	vpxor		0xf0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xf0(%rsi)
-
-.Ldone4:
-	vzeroupper
-	ret
-
-.Lxorpart4:
-	# xor remaining bytes from partial register into output
-	mov		%rax,%r9
-	and		$0x0f,%r9
-	jz		.Ldone4
-	and		$~0x0f,%rax
-
-	mov		%rsi,%r11
-
-	lea		8(%rsp),%r10
-	sub		$0x10,%rsp
-	and		$~31,%rsp
-
-	lea		(%rdx,%rax),%rsi
-	mov		%rsp,%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	vpxor		0x00(%rsp),%xmm10,%xmm10
-	vmovdqa		%xmm10,0x00(%rsp)
-
-	mov		%rsp,%rsi
-	lea		(%r11,%rax),%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	lea		-8(%r10),%rsp
-	jmp		.Ldone4
-
-ENDPROC(chacha20_4block_xor_avx2)
-
-ENTRY(chacha20_8block_xor_avx2)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 8 data blocks output, o
-	# %rdx: up to 8 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts eight consecutive ChaCha20 blocks by loading
-	# the state matrix in AVX registers eight times. As we need some
-	# scratch registers, we save the first four registers on the stack. The
-	# algorithm performs each operation on the corresponding word of each
-	# state matrix, hence requires no word shuffling. For final XORing step
-	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
-	# words, which allows us to do XOR in AVX registers. 8/16-bit word
-	# rotation is done with the slightly better performing byte shuffling,
-	# 7/12-bit word rotation uses traditional shift+OR.
-
-	vzeroupper
-	# 4 * 32 byte stack, 32-byte aligned
-	lea		8(%rsp),%r10
-	and		$~31, %rsp
-	sub		$0x80, %rsp
-	mov		%rcx,%rax
-
-	# x0..15[0-7] = s[0..15]
-	vpbroadcastd	0x00(%rdi),%ymm0
-	vpbroadcastd	0x04(%rdi),%ymm1
-	vpbroadcastd	0x08(%rdi),%ymm2
-	vpbroadcastd	0x0c(%rdi),%ymm3
-	vpbroadcastd	0x10(%rdi),%ymm4
-	vpbroadcastd	0x14(%rdi),%ymm5
-	vpbroadcastd	0x18(%rdi),%ymm6
-	vpbroadcastd	0x1c(%rdi),%ymm7
-	vpbroadcastd	0x20(%rdi),%ymm8
-	vpbroadcastd	0x24(%rdi),%ymm9
-	vpbroadcastd	0x28(%rdi),%ymm10
-	vpbroadcastd	0x2c(%rdi),%ymm11
-	vpbroadcastd	0x30(%rdi),%ymm12
-	vpbroadcastd	0x34(%rdi),%ymm13
-	vpbroadcastd	0x38(%rdi),%ymm14
-	vpbroadcastd	0x3c(%rdi),%ymm15
-	# x0..3 on stack
-	vmovdqa		%ymm0,0x00(%rsp)
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		%ymm2,0x40(%rsp)
-	vmovdqa		%ymm3,0x60(%rsp)
-
-	vmovdqa		CTRINC(%rip),%ymm1
-	vmovdqa		ROT8(%rip),%ymm2
-	vmovdqa		ROT16(%rip),%ymm3
-
-	# x12 += counter values 0-3
-	vpaddd		%ymm1,%ymm12,%ymm12
-
-	mov		$10,%ecx
-
-.Ldoubleround8:
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	vpaddd		0x00(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm3,%ymm12,%ymm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	vpaddd		0x20(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm3,%ymm13,%ymm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	vpaddd		0x40(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm3,%ymm14,%ymm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	vpaddd		0x60(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm3,%ymm15,%ymm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	vpaddd		%ymm12,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm4,%ymm4
-	vpslld		$12,%ymm4,%ymm0
-	vpsrld		$20,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	vpaddd		%ymm13,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm5,%ymm5
-	vpslld		$12,%ymm5,%ymm0
-	vpsrld		$20,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	vpaddd		%ymm14,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm6,%ymm6
-	vpslld		$12,%ymm6,%ymm0
-	vpsrld		$20,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	vpaddd		%ymm15,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm7,%ymm7
-	vpslld		$12,%ymm7,%ymm0
-	vpsrld		$20,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	vpaddd		0x00(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm2,%ymm12,%ymm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	vpaddd		0x20(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm2,%ymm13,%ymm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	vpaddd		0x40(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm2,%ymm14,%ymm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	vpaddd		0x60(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm2,%ymm15,%ymm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	vpaddd		%ymm12,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm4,%ymm4
-	vpslld		$7,%ymm4,%ymm0
-	vpsrld		$25,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	vpaddd		%ymm13,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm5,%ymm5
-	vpslld		$7,%ymm5,%ymm0
-	vpsrld		$25,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	vpaddd		%ymm14,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm6,%ymm6
-	vpslld		$7,%ymm6,%ymm0
-	vpsrld		$25,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	vpaddd		%ymm15,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm7,%ymm7
-	vpslld		$7,%ymm7,%ymm0
-	vpsrld		$25,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	vpaddd		0x00(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm3,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
-	vpaddd		0x20(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm3,%ymm12,%ymm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	vpaddd		0x40(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm3,%ymm13,%ymm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	vpaddd		0x60(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm3,%ymm14,%ymm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	vpaddd		%ymm15,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm5,%ymm5
-	vpslld		$12,%ymm5,%ymm0
-	vpsrld		$20,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	vpaddd		%ymm12,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm6,%ymm6
-	vpslld		$12,%ymm6,%ymm0
-	vpsrld		$20,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	vpaddd		%ymm13,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm7,%ymm7
-	vpslld		$12,%ymm7,%ymm0
-	vpsrld		$20,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	vpaddd		%ymm14,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm4,%ymm4
-	vpslld		$12,%ymm4,%ymm0
-	vpsrld		$20,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	vpaddd		0x00(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm2,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	vpaddd		0x20(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm2,%ymm12,%ymm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	vpaddd		0x40(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm2,%ymm13,%ymm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	vpaddd		0x60(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm2,%ymm14,%ymm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	vpaddd		%ymm15,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm5,%ymm5
-	vpslld		$7,%ymm5,%ymm0
-	vpsrld		$25,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	vpaddd		%ymm12,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm6,%ymm6
-	vpslld		$7,%ymm6,%ymm0
-	vpsrld		$25,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	vpaddd		%ymm13,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm7,%ymm7
-	vpslld		$7,%ymm7,%ymm0
-	vpsrld		$25,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	vpaddd		%ymm14,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm4,%ymm4
-	vpslld		$7,%ymm4,%ymm0
-	vpsrld		$25,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-
-	dec		%ecx
-	jnz		.Ldoubleround8
-
-	# x0..15[0-3] += s[0..15]
-	vpbroadcastd	0x00(%rdi),%ymm0
-	vpaddd		0x00(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpbroadcastd	0x04(%rdi),%ymm0
-	vpaddd		0x20(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpbroadcastd	0x08(%rdi),%ymm0
-	vpaddd		0x40(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpbroadcastd	0x0c(%rdi),%ymm0
-	vpaddd		0x60(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpbroadcastd	0x10(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm4,%ymm4
-	vpbroadcastd	0x14(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm5,%ymm5
-	vpbroadcastd	0x18(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm6,%ymm6
-	vpbroadcastd	0x1c(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm7,%ymm7
-	vpbroadcastd	0x20(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm8,%ymm8
-	vpbroadcastd	0x24(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm9,%ymm9
-	vpbroadcastd	0x28(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm10,%ymm10
-	vpbroadcastd	0x2c(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm11,%ymm11
-	vpbroadcastd	0x30(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm12,%ymm12
-	vpbroadcastd	0x34(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm13,%ymm13
-	vpbroadcastd	0x38(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm14,%ymm14
-	vpbroadcastd	0x3c(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm15,%ymm15
-
-	# x12 += counter values 0-3
-	vpaddd		%ymm1,%ymm12,%ymm12
-
-	# interleave 32-bit words in state n, n+1
-	vmovdqa		0x00(%rsp),%ymm0
-	vmovdqa		0x20(%rsp),%ymm1
-	vpunpckldq	%ymm1,%ymm0,%ymm2
-	vpunpckhdq	%ymm1,%ymm0,%ymm1
-	vmovdqa		%ymm2,0x00(%rsp)
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		0x40(%rsp),%ymm0
-	vmovdqa		0x60(%rsp),%ymm1
-	vpunpckldq	%ymm1,%ymm0,%ymm2
-	vpunpckhdq	%ymm1,%ymm0,%ymm1
-	vmovdqa		%ymm2,0x40(%rsp)
-	vmovdqa		%ymm1,0x60(%rsp)
-	vmovdqa		%ymm4,%ymm0
-	vpunpckldq	%ymm5,%ymm0,%ymm4
-	vpunpckhdq	%ymm5,%ymm0,%ymm5
-	vmovdqa		%ymm6,%ymm0
-	vpunpckldq	%ymm7,%ymm0,%ymm6
-	vpunpckhdq	%ymm7,%ymm0,%ymm7
-	vmovdqa		%ymm8,%ymm0
-	vpunpckldq	%ymm9,%ymm0,%ymm8
-	vpunpckhdq	%ymm9,%ymm0,%ymm9
-	vmovdqa		%ymm10,%ymm0
-	vpunpckldq	%ymm11,%ymm0,%ymm10
-	vpunpckhdq	%ymm11,%ymm0,%ymm11
-	vmovdqa		%ymm12,%ymm0
-	vpunpckldq	%ymm13,%ymm0,%ymm12
-	vpunpckhdq	%ymm13,%ymm0,%ymm13
-	vmovdqa		%ymm14,%ymm0
-	vpunpckldq	%ymm15,%ymm0,%ymm14
-	vpunpckhdq	%ymm15,%ymm0,%ymm15
-
-	# interleave 64-bit words in state n, n+2
-	vmovdqa		0x00(%rsp),%ymm0
-	vmovdqa		0x40(%rsp),%ymm2
-	vpunpcklqdq	%ymm2,%ymm0,%ymm1
-	vpunpckhqdq	%ymm2,%ymm0,%ymm2
-	vmovdqa		%ymm1,0x00(%rsp)
-	vmovdqa		%ymm2,0x40(%rsp)
-	vmovdqa		0x20(%rsp),%ymm0
-	vmovdqa		0x60(%rsp),%ymm2
-	vpunpcklqdq	%ymm2,%ymm0,%ymm1
-	vpunpckhqdq	%ymm2,%ymm0,%ymm2
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		%ymm2,0x60(%rsp)
-	vmovdqa		%ymm4,%ymm0
-	vpunpcklqdq	%ymm6,%ymm0,%ymm4
-	vpunpckhqdq	%ymm6,%ymm0,%ymm6
-	vmovdqa		%ymm5,%ymm0
-	vpunpcklqdq	%ymm7,%ymm0,%ymm5
-	vpunpckhqdq	%ymm7,%ymm0,%ymm7
-	vmovdqa		%ymm8,%ymm0
-	vpunpcklqdq	%ymm10,%ymm0,%ymm8
-	vpunpckhqdq	%ymm10,%ymm0,%ymm10
-	vmovdqa		%ymm9,%ymm0
-	vpunpcklqdq	%ymm11,%ymm0,%ymm9
-	vpunpckhqdq	%ymm11,%ymm0,%ymm11
-	vmovdqa		%ymm12,%ymm0
-	vpunpcklqdq	%ymm14,%ymm0,%ymm12
-	vpunpckhqdq	%ymm14,%ymm0,%ymm14
-	vmovdqa		%ymm13,%ymm0
-	vpunpcklqdq	%ymm15,%ymm0,%ymm13
-	vpunpckhqdq	%ymm15,%ymm0,%ymm15
-
-	# interleave 128-bit words in state n, n+4
-	# xor/write first four blocks
-	vmovdqa		0x00(%rsp),%ymm1
-	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
-	cmp		$0x0020,%rax
-	jl		.Lxorpart8
-	vpxor		0x0000(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0000(%rsi)
-	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
-
-	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
-	cmp		$0x0040,%rax
-	jl		.Lxorpart8
-	vpxor		0x0020(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0020(%rsi)
-	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
-
-	vmovdqa		0x40(%rsp),%ymm1
-	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
-	cmp		$0x0060,%rax
-	jl		.Lxorpart8
-	vpxor		0x0040(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0040(%rsi)
-	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
-
-	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
-	cmp		$0x0080,%rax
-	jl		.Lxorpart8
-	vpxor		0x0060(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0060(%rsi)
-	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
-
-	vmovdqa		0x20(%rsp),%ymm1
-	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
-	cmp		$0x00a0,%rax
-	jl		.Lxorpart8
-	vpxor		0x0080(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0080(%rsi)
-	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
-
-	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
-	cmp		$0x00c0,%rax
-	jl		.Lxorpart8
-	vpxor		0x00a0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x00a0(%rsi)
-	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
-
-	vmovdqa		0x60(%rsp),%ymm1
-	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
-	cmp		$0x00e0,%rax
-	jl		.Lxorpart8
-	vpxor		0x00c0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x00c0(%rsi)
-	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
-
-	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
-	cmp		$0x0100,%rax
-	jl		.Lxorpart8
-	vpxor		0x00e0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x00e0(%rsi)
-	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
-
-	# xor remaining blocks, write to output
-	vmovdqa		%ymm4,%ymm0
-	cmp		$0x0120,%rax
-	jl		.Lxorpart8
-	vpxor		0x0100(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0100(%rsi)
-
-	vmovdqa		%ymm12,%ymm0
-	cmp		$0x0140,%rax
-	jl		.Lxorpart8
-	vpxor		0x0120(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0120(%rsi)
-
-	vmovdqa		%ymm6,%ymm0
-	cmp		$0x0160,%rax
-	jl		.Lxorpart8
-	vpxor		0x0140(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0140(%rsi)
-
-	vmovdqa		%ymm14,%ymm0
-	cmp		$0x0180,%rax
-	jl		.Lxorpart8
-	vpxor		0x0160(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0160(%rsi)
-
-	vmovdqa		%ymm5,%ymm0
-	cmp		$0x01a0,%rax
-	jl		.Lxorpart8
-	vpxor		0x0180(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0180(%rsi)
-
-	vmovdqa		%ymm13,%ymm0
-	cmp		$0x01c0,%rax
-	jl		.Lxorpart8
-	vpxor		0x01a0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x01a0(%rsi)
-
-	vmovdqa		%ymm7,%ymm0
-	cmp		$0x01e0,%rax
-	jl		.Lxorpart8
-	vpxor		0x01c0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x01c0(%rsi)
-
-	vmovdqa		%ymm15,%ymm0
-	cmp		$0x0200,%rax
-	jl		.Lxorpart8
-	vpxor		0x01e0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x01e0(%rsi)
-
-.Ldone8:
-	vzeroupper
-	lea		-8(%r10),%rsp
-	ret
-
-.Lxorpart8:
-	# xor remaining bytes from partial register into output
-	mov		%rax,%r9
-	and		$0x1f,%r9
-	jz		.Ldone8
-	and		$~0x1f,%rax
-
-	mov		%rsi,%r11
-
-	lea		(%rdx,%rax),%rsi
-	mov		%rsp,%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	vpxor		0x00(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-
-	mov		%rsp,%rsi
-	lea		(%r11,%rax),%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	jmp		.Ldone8
-
-ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
deleted file mode 100644
index 55d34de29e3e..000000000000
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ /dev/null
@@ -1,839 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
- *
- * Copyright (C) 2018 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL:	.octa 0x00000000000000000000000000000000
-	.octa 0x00000000000000000000000000000001
-
-.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL:	.octa 0x00000000000000000000000000000002
-	.octa 0x00000000000000000000000000000003
-
-.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
-.align 32
-CTR8BL:	.octa 0x00000003000000020000000100000000
-	.octa 0x00000007000000060000000500000004
-
-.text
-
-ENTRY(chacha20_2block_xor_avx512vl)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 2 data blocks output, o
-	# %rdx: up to 2 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts two ChaCha20 blocks by loading the state
-	# matrix twice across four AVX registers. It performs matrix operations
-	# on four words in each matrix in parallel, but requires shuffling to
-	# rearrange the words after each round.
-
-	vzeroupper
-
-	# x0..3[0-2] = s0..3
-	vbroadcasti128	0x00(%rdi),%ymm0
-	vbroadcasti128	0x10(%rdi),%ymm1
-	vbroadcasti128	0x20(%rdi),%ymm2
-	vbroadcasti128	0x30(%rdi),%ymm3
-
-	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
-
-	vmovdqa		%ymm0,%ymm8
-	vmovdqa		%ymm1,%ymm9
-	vmovdqa		%ymm2,%ymm10
-	vmovdqa		%ymm3,%ymm11
-
-	mov		$10,%rax
-
-.Ldoubleround:
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$16,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$12,%ymm1,%ymm1
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$8,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$7,%ymm1,%ymm1
-
-	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm1,%ymm1
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm3,%ymm3
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$16,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$12,%ymm1,%ymm1
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$8,%ymm3,%ymm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$7,%ymm1,%ymm1
-
-	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm1,%ymm1
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm3,%ymm3
-
-	dec		%rax
-	jnz		.Ldoubleround
-
-	# o0 = i0 ^ (x0 + s0)
-	vpaddd		%ymm8,%ymm0,%ymm7
-	cmp		$0x10,%rcx
-	jl		.Lxorpart2
-	vpxord		0x00(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x00(%rsi)
-	vextracti128	$1,%ymm7,%xmm0
-	# o1 = i1 ^ (x1 + s1)
-	vpaddd		%ymm9,%ymm1,%ymm7
-	cmp		$0x20,%rcx
-	jl		.Lxorpart2
-	vpxord		0x10(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x10(%rsi)
-	vextracti128	$1,%ymm7,%xmm1
-	# o2 = i2 ^ (x2 + s2)
-	vpaddd		%ymm10,%ymm2,%ymm7
-	cmp		$0x30,%rcx
-	jl		.Lxorpart2
-	vpxord		0x20(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x20(%rsi)
-	vextracti128	$1,%ymm7,%xmm2
-	# o3 = i3 ^ (x3 + s3)
-	vpaddd		%ymm11,%ymm3,%ymm7
-	cmp		$0x40,%rcx
-	jl		.Lxorpart2
-	vpxord		0x30(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x30(%rsi)
-	vextracti128	$1,%ymm7,%xmm3
-
-	# xor and write second block
-	vmovdqa		%xmm0,%xmm7
-	cmp		$0x50,%rcx
-	jl		.Lxorpart2
-	vpxord		0x40(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x40(%rsi)
-
-	vmovdqa		%xmm1,%xmm7
-	cmp		$0x60,%rcx
-	jl		.Lxorpart2
-	vpxord		0x50(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x50(%rsi)
-
-	vmovdqa		%xmm2,%xmm7
-	cmp		$0x70,%rcx
-	jl		.Lxorpart2
-	vpxord		0x60(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x60(%rsi)
-
-	vmovdqa		%xmm3,%xmm7
-	cmp		$0x80,%rcx
-	jl		.Lxorpart2
-	vpxord		0x70(%rdx),%xmm7,%xmm6
-	vmovdqu		%xmm6,0x70(%rsi)
-
-.Ldone2:
-	vzeroupper
-	ret
-
-.Lxorpart2:
-	# xor remaining bytes from partial register into output
-	mov		%rcx,%rax
-	and		$0xf,%rcx
-	jz		.Ldone8
-	mov		%rax,%r9
-	and		$~0xf,%r9
-
-	mov		$1,%rax
-	shld		%cl,%rax,%rax
-	sub		$1,%rax
-	kmovq		%rax,%k1
-
-	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
-	vpxord		%xmm7,%xmm1,%xmm1
-	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
-
-	jmp		.Ldone2
-
-ENDPROC(chacha20_2block_xor_avx512vl)
-
-ENTRY(chacha20_4block_xor_avx512vl)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 4 data blocks output, o
-	# %rdx: up to 4 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts four ChaCha20 block by loading the state
-	# matrix four times across eight AVX registers. It performs matrix
-	# operations on four words in two matrices in parallel, sequentially
-	# to the operations on the four words of the other two matrices. The
-	# required word shuffling has a rather high latency, we can do the
-	# arithmetic on two matrix-pairs without much slowdown.
-
-	vzeroupper
-
-	# x0..3[0-4] = s0..3
-	vbroadcasti128	0x00(%rdi),%ymm0
-	vbroadcasti128	0x10(%rdi),%ymm1
-	vbroadcasti128	0x20(%rdi),%ymm2
-	vbroadcasti128	0x30(%rdi),%ymm3
-
-	vmovdqa		%ymm0,%ymm4
-	vmovdqa		%ymm1,%ymm5
-	vmovdqa		%ymm2,%ymm6
-	vmovdqa		%ymm3,%ymm7
-
-	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
-	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
-
-	vmovdqa		%ymm0,%ymm11
-	vmovdqa		%ymm1,%ymm12
-	vmovdqa		%ymm2,%ymm13
-	vmovdqa		%ymm3,%ymm14
-	vmovdqa		%ymm7,%ymm15
-
-	mov		$10,%rax
-
-.Ldoubleround4:
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$16,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxord		%ymm4,%ymm7,%ymm7
-	vprold		$16,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$12,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxord		%ymm6,%ymm5,%ymm5
-	vprold		$12,%ymm5,%ymm5
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$8,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxord		%ymm4,%ymm7,%ymm7
-	vprold		$8,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$7,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxord		%ymm6,%ymm5,%ymm5
-	vprold		$7,%ymm5,%ymm5
-
-	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm1,%ymm1
-	vpshufd		$0x39,%ymm5,%ymm5
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	vpshufd		$0x4e,%ymm6,%ymm6
-	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm3,%ymm3
-	vpshufd		$0x93,%ymm7,%ymm7
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$16,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxord		%ymm4,%ymm7,%ymm7
-	vprold		$16,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$12,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxord		%ymm6,%ymm5,%ymm5
-	vprold		$12,%ymm5,%ymm5
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vpaddd		%ymm1,%ymm0,%ymm0
-	vpxord		%ymm0,%ymm3,%ymm3
-	vprold		$8,%ymm3,%ymm3
-
-	vpaddd		%ymm5,%ymm4,%ymm4
-	vpxord		%ymm4,%ymm7,%ymm7
-	vprold		$8,%ymm7,%ymm7
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vpaddd		%ymm3,%ymm2,%ymm2
-	vpxord		%ymm2,%ymm1,%ymm1
-	vprold		$7,%ymm1,%ymm1
-
-	vpaddd		%ymm7,%ymm6,%ymm6
-	vpxord		%ymm6,%ymm5,%ymm5
-	vprold		$7,%ymm5,%ymm5
-
-	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	vpshufd		$0x93,%ymm1,%ymm1
-	vpshufd		$0x93,%ymm5,%ymm5
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vpshufd		$0x4e,%ymm2,%ymm2
-	vpshufd		$0x4e,%ymm6,%ymm6
-	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	vpshufd		$0x39,%ymm3,%ymm3
-	vpshufd		$0x39,%ymm7,%ymm7
-
-	dec		%rax
-	jnz		.Ldoubleround4
-
-	# o0 = i0 ^ (x0 + s0), first block
-	vpaddd		%ymm11,%ymm0,%ymm10
-	cmp		$0x10,%rcx
-	jl		.Lxorpart4
-	vpxord		0x00(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x00(%rsi)
-	vextracti128	$1,%ymm10,%xmm0
-	# o1 = i1 ^ (x1 + s1), first block
-	vpaddd		%ymm12,%ymm1,%ymm10
-	cmp		$0x20,%rcx
-	jl		.Lxorpart4
-	vpxord		0x10(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x10(%rsi)
-	vextracti128	$1,%ymm10,%xmm1
-	# o2 = i2 ^ (x2 + s2), first block
-	vpaddd		%ymm13,%ymm2,%ymm10
-	cmp		$0x30,%rcx
-	jl		.Lxorpart4
-	vpxord		0x20(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x20(%rsi)
-	vextracti128	$1,%ymm10,%xmm2
-	# o3 = i3 ^ (x3 + s3), first block
-	vpaddd		%ymm14,%ymm3,%ymm10
-	cmp		$0x40,%rcx
-	jl		.Lxorpart4
-	vpxord		0x30(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x30(%rsi)
-	vextracti128	$1,%ymm10,%xmm3
-
-	# xor and write second block
-	vmovdqa		%xmm0,%xmm10
-	cmp		$0x50,%rcx
-	jl		.Lxorpart4
-	vpxord		0x40(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x40(%rsi)
-
-	vmovdqa		%xmm1,%xmm10
-	cmp		$0x60,%rcx
-	jl		.Lxorpart4
-	vpxord		0x50(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x50(%rsi)
-
-	vmovdqa		%xmm2,%xmm10
-	cmp		$0x70,%rcx
-	jl		.Lxorpart4
-	vpxord		0x60(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x60(%rsi)
-
-	vmovdqa		%xmm3,%xmm10
-	cmp		$0x80,%rcx
-	jl		.Lxorpart4
-	vpxord		0x70(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x70(%rsi)
-
-	# o0 = i0 ^ (x0 + s0), third block
-	vpaddd		%ymm11,%ymm4,%ymm10
-	cmp		$0x90,%rcx
-	jl		.Lxorpart4
-	vpxord		0x80(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x80(%rsi)
-	vextracti128	$1,%ymm10,%xmm4
-	# o1 = i1 ^ (x1 + s1), third block
-	vpaddd		%ymm12,%ymm5,%ymm10
-	cmp		$0xa0,%rcx
-	jl		.Lxorpart4
-	vpxord		0x90(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0x90(%rsi)
-	vextracti128	$1,%ymm10,%xmm5
-	# o2 = i2 ^ (x2 + s2), third block
-	vpaddd		%ymm13,%ymm6,%ymm10
-	cmp		$0xb0,%rcx
-	jl		.Lxorpart4
-	vpxord		0xa0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xa0(%rsi)
-	vextracti128	$1,%ymm10,%xmm6
-	# o3 = i3 ^ (x3 + s3), third block
-	vpaddd		%ymm15,%ymm7,%ymm10
-	cmp		$0xc0,%rcx
-	jl		.Lxorpart4
-	vpxord		0xb0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xb0(%rsi)
-	vextracti128	$1,%ymm10,%xmm7
-
-	# xor and write fourth block
-	vmovdqa		%xmm4,%xmm10
-	cmp		$0xd0,%rcx
-	jl		.Lxorpart4
-	vpxord		0xc0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xc0(%rsi)
-
-	vmovdqa		%xmm5,%xmm10
-	cmp		$0xe0,%rcx
-	jl		.Lxorpart4
-	vpxord		0xd0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xd0(%rsi)
-
-	vmovdqa		%xmm6,%xmm10
-	cmp		$0xf0,%rcx
-	jl		.Lxorpart4
-	vpxord		0xe0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xe0(%rsi)
-
-	vmovdqa		%xmm7,%xmm10
-	cmp		$0x100,%rcx
-	jl		.Lxorpart4
-	vpxord		0xf0(%rdx),%xmm10,%xmm9
-	vmovdqu		%xmm9,0xf0(%rsi)
-
-.Ldone4:
-	vzeroupper
-	ret
-
-.Lxorpart4:
-	# xor remaining bytes from partial register into output
-	mov		%rcx,%rax
-	and		$0xf,%rcx
-	jz		.Ldone8
-	mov		%rax,%r9
-	and		$~0xf,%r9
-
-	mov		$1,%rax
-	shld		%cl,%rax,%rax
-	sub		$1,%rax
-	kmovq		%rax,%k1
-
-	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
-	vpxord		%xmm10,%xmm1,%xmm1
-	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
-
-	jmp		.Ldone4
-
-ENDPROC(chacha20_4block_xor_avx512vl)
-
-ENTRY(chacha20_8block_xor_avx512vl)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 8 data blocks output, o
-	# %rdx: up to 8 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts eight consecutive ChaCha20 blocks by loading
-	# the state matrix in AVX registers eight times. Compared to AVX2, this
-	# mostly benefits from the new rotate instructions in VL and the
-	# additional registers.
-
-	vzeroupper
-
-	# x0..15[0-7] = s[0..15]
-	vpbroadcastd	0x00(%rdi),%ymm0
-	vpbroadcastd	0x04(%rdi),%ymm1
-	vpbroadcastd	0x08(%rdi),%ymm2
-	vpbroadcastd	0x0c(%rdi),%ymm3
-	vpbroadcastd	0x10(%rdi),%ymm4
-	vpbroadcastd	0x14(%rdi),%ymm5
-	vpbroadcastd	0x18(%rdi),%ymm6
-	vpbroadcastd	0x1c(%rdi),%ymm7
-	vpbroadcastd	0x20(%rdi),%ymm8
-	vpbroadcastd	0x24(%rdi),%ymm9
-	vpbroadcastd	0x28(%rdi),%ymm10
-	vpbroadcastd	0x2c(%rdi),%ymm11
-	vpbroadcastd	0x30(%rdi),%ymm12
-	vpbroadcastd	0x34(%rdi),%ymm13
-	vpbroadcastd	0x38(%rdi),%ymm14
-	vpbroadcastd	0x3c(%rdi),%ymm15
-
-	# x12 += counter values 0-3
-	vpaddd		CTR8BL(%rip),%ymm12,%ymm12
-
-	vmovdqa64	%ymm0,%ymm16
-	vmovdqa64	%ymm1,%ymm17
-	vmovdqa64	%ymm2,%ymm18
-	vmovdqa64	%ymm3,%ymm19
-	vmovdqa64	%ymm4,%ymm20
-	vmovdqa64	%ymm5,%ymm21
-	vmovdqa64	%ymm6,%ymm22
-	vmovdqa64	%ymm7,%ymm23
-	vmovdqa64	%ymm8,%ymm24
-	vmovdqa64	%ymm9,%ymm25
-	vmovdqa64	%ymm10,%ymm26
-	vmovdqa64	%ymm11,%ymm27
-	vmovdqa64	%ymm12,%ymm28
-	vmovdqa64	%ymm13,%ymm29
-	vmovdqa64	%ymm14,%ymm30
-	vmovdqa64	%ymm15,%ymm31
-
-	mov		$10,%eax
-
-.Ldoubleround8:
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	vpaddd		%ymm0,%ymm4,%ymm0
-	vpxord		%ymm0,%ymm12,%ymm12
-	vprold		$16,%ymm12,%ymm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	vpaddd		%ymm1,%ymm5,%ymm1
-	vpxord		%ymm1,%ymm13,%ymm13
-	vprold		$16,%ymm13,%ymm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	vpaddd		%ymm2,%ymm6,%ymm2
-	vpxord		%ymm2,%ymm14,%ymm14
-	vprold		$16,%ymm14,%ymm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	vpaddd		%ymm3,%ymm7,%ymm3
-	vpxord		%ymm3,%ymm15,%ymm15
-	vprold		$16,%ymm15,%ymm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	vpaddd		%ymm12,%ymm8,%ymm8
-	vpxord		%ymm8,%ymm4,%ymm4
-	vprold		$12,%ymm4,%ymm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	vpaddd		%ymm13,%ymm9,%ymm9
-	vpxord		%ymm9,%ymm5,%ymm5
-	vprold		$12,%ymm5,%ymm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	vpaddd		%ymm14,%ymm10,%ymm10
-	vpxord		%ymm10,%ymm6,%ymm6
-	vprold		$12,%ymm6,%ymm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	vpaddd		%ymm15,%ymm11,%ymm11
-	vpxord		%ymm11,%ymm7,%ymm7
-	vprold		$12,%ymm7,%ymm7
-
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	vpaddd		%ymm0,%ymm4,%ymm0
-	vpxord		%ymm0,%ymm12,%ymm12
-	vprold		$8,%ymm12,%ymm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	vpaddd		%ymm1,%ymm5,%ymm1
-	vpxord		%ymm1,%ymm13,%ymm13
-	vprold		$8,%ymm13,%ymm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	vpaddd		%ymm2,%ymm6,%ymm2
-	vpxord		%ymm2,%ymm14,%ymm14
-	vprold		$8,%ymm14,%ymm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	vpaddd		%ymm3,%ymm7,%ymm3
-	vpxord		%ymm3,%ymm15,%ymm15
-	vprold		$8,%ymm15,%ymm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	vpaddd		%ymm12,%ymm8,%ymm8
-	vpxord		%ymm8,%ymm4,%ymm4
-	vprold		$7,%ymm4,%ymm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	vpaddd		%ymm13,%ymm9,%ymm9
-	vpxord		%ymm9,%ymm5,%ymm5
-	vprold		$7,%ymm5,%ymm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	vpaddd		%ymm14,%ymm10,%ymm10
-	vpxord		%ymm10,%ymm6,%ymm6
-	vprold		$7,%ymm6,%ymm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	vpaddd		%ymm15,%ymm11,%ymm11
-	vpxord		%ymm11,%ymm7,%ymm7
-	vprold		$7,%ymm7,%ymm7
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	vpaddd		%ymm0,%ymm5,%ymm0
-	vpxord		%ymm0,%ymm15,%ymm15
-	vprold		$16,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-	vpaddd		%ymm1,%ymm6,%ymm1
-	vpxord		%ymm1,%ymm12,%ymm12
-	vprold		$16,%ymm12,%ymm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	vpaddd		%ymm2,%ymm7,%ymm2
-	vpxord		%ymm2,%ymm13,%ymm13
-	vprold		$16,%ymm13,%ymm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	vpaddd		%ymm3,%ymm4,%ymm3
-	vpxord		%ymm3,%ymm14,%ymm14
-	vprold		$16,%ymm14,%ymm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	vpaddd		%ymm15,%ymm10,%ymm10
-	vpxord		%ymm10,%ymm5,%ymm5
-	vprold		$12,%ymm5,%ymm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	vpaddd		%ymm12,%ymm11,%ymm11
-	vpxord		%ymm11,%ymm6,%ymm6
-	vprold		$12,%ymm6,%ymm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	vpaddd		%ymm13,%ymm8,%ymm8
-	vpxord		%ymm8,%ymm7,%ymm7
-	vprold		$12,%ymm7,%ymm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	vpaddd		%ymm14,%ymm9,%ymm9
-	vpxord		%ymm9,%ymm4,%ymm4
-	vprold		$12,%ymm4,%ymm4
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	vpaddd		%ymm0,%ymm5,%ymm0
-	vpxord		%ymm0,%ymm15,%ymm15
-	vprold		$8,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	vpaddd		%ymm1,%ymm6,%ymm1
-	vpxord		%ymm1,%ymm12,%ymm12
-	vprold		$8,%ymm12,%ymm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	vpaddd		%ymm2,%ymm7,%ymm2
-	vpxord		%ymm2,%ymm13,%ymm13
-	vprold		$8,%ymm13,%ymm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	vpaddd		%ymm3,%ymm4,%ymm3
-	vpxord		%ymm3,%ymm14,%ymm14
-	vprold		$8,%ymm14,%ymm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	vpaddd		%ymm15,%ymm10,%ymm10
-	vpxord		%ymm10,%ymm5,%ymm5
-	vprold		$7,%ymm5,%ymm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	vpaddd		%ymm12,%ymm11,%ymm11
-	vpxord		%ymm11,%ymm6,%ymm6
-	vprold		$7,%ymm6,%ymm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	vpaddd		%ymm13,%ymm8,%ymm8
-	vpxord		%ymm8,%ymm7,%ymm7
-	vprold		$7,%ymm7,%ymm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	vpaddd		%ymm14,%ymm9,%ymm9
-	vpxord		%ymm9,%ymm4,%ymm4
-	vprold		$7,%ymm4,%ymm4
-
-	dec		%eax
-	jnz		.Ldoubleround8
-
-	# x0..15[0-3] += s[0..15]
-	vpaddd		%ymm16,%ymm0,%ymm0
-	vpaddd		%ymm17,%ymm1,%ymm1
-	vpaddd		%ymm18,%ymm2,%ymm2
-	vpaddd		%ymm19,%ymm3,%ymm3
-	vpaddd		%ymm20,%ymm4,%ymm4
-	vpaddd		%ymm21,%ymm5,%ymm5
-	vpaddd		%ymm22,%ymm6,%ymm6
-	vpaddd		%ymm23,%ymm7,%ymm7
-	vpaddd		%ymm24,%ymm8,%ymm8
-	vpaddd		%ymm25,%ymm9,%ymm9
-	vpaddd		%ymm26,%ymm10,%ymm10
-	vpaddd		%ymm27,%ymm11,%ymm11
-	vpaddd		%ymm28,%ymm12,%ymm12
-	vpaddd		%ymm29,%ymm13,%ymm13
-	vpaddd		%ymm30,%ymm14,%ymm14
-	vpaddd		%ymm31,%ymm15,%ymm15
-
-	# interleave 32-bit words in state n, n+1
-	vpunpckldq	%ymm1,%ymm0,%ymm16
-	vpunpckhdq	%ymm1,%ymm0,%ymm17
-	vpunpckldq	%ymm3,%ymm2,%ymm18
-	vpunpckhdq	%ymm3,%ymm2,%ymm19
-	vpunpckldq	%ymm5,%ymm4,%ymm20
-	vpunpckhdq	%ymm5,%ymm4,%ymm21
-	vpunpckldq	%ymm7,%ymm6,%ymm22
-	vpunpckhdq	%ymm7,%ymm6,%ymm23
-	vpunpckldq	%ymm9,%ymm8,%ymm24
-	vpunpckhdq	%ymm9,%ymm8,%ymm25
-	vpunpckldq	%ymm11,%ymm10,%ymm26
-	vpunpckhdq	%ymm11,%ymm10,%ymm27
-	vpunpckldq	%ymm13,%ymm12,%ymm28
-	vpunpckhdq	%ymm13,%ymm12,%ymm29
-	vpunpckldq	%ymm15,%ymm14,%ymm30
-	vpunpckhdq	%ymm15,%ymm14,%ymm31
-
-	# interleave 64-bit words in state n, n+2
-	vpunpcklqdq	%ymm18,%ymm16,%ymm0
-	vpunpcklqdq	%ymm19,%ymm17,%ymm1
-	vpunpckhqdq	%ymm18,%ymm16,%ymm2
-	vpunpckhqdq	%ymm19,%ymm17,%ymm3
-	vpunpcklqdq	%ymm22,%ymm20,%ymm4
-	vpunpcklqdq	%ymm23,%ymm21,%ymm5
-	vpunpckhqdq	%ymm22,%ymm20,%ymm6
-	vpunpckhqdq	%ymm23,%ymm21,%ymm7
-	vpunpcklqdq	%ymm26,%ymm24,%ymm8
-	vpunpcklqdq	%ymm27,%ymm25,%ymm9
-	vpunpckhqdq	%ymm26,%ymm24,%ymm10
-	vpunpckhqdq	%ymm27,%ymm25,%ymm11
-	vpunpcklqdq	%ymm30,%ymm28,%ymm12
-	vpunpcklqdq	%ymm31,%ymm29,%ymm13
-	vpunpckhqdq	%ymm30,%ymm28,%ymm14
-	vpunpckhqdq	%ymm31,%ymm29,%ymm15
-
-	# interleave 128-bit words in state n, n+4
-	# xor/write first four blocks
-	vmovdqa64	%ymm0,%ymm16
-	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
-	cmp		$0x0020,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0000(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0000(%rsi)
-	vmovdqa64	%ymm16,%ymm0
-	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
-
-	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
-	cmp		$0x0040,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0020(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0020(%rsi)
-	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
-
-	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
-	cmp		$0x0060,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0040(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0040(%rsi)
-	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6
-
-	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
-	cmp		$0x0080,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0060(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0060(%rsi)
-	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
-
-	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
-	cmp		$0x00a0,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0080(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0080(%rsi)
-	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
-
-	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
-	cmp		$0x00c0,%rcx
-	jl		.Lxorpart8
-	vpxord		0x00a0(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x00a0(%rsi)
-	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
-
-	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
-	cmp		$0x00e0,%rcx
-	jl		.Lxorpart8
-	vpxord		0x00c0(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x00c0(%rsi)
-	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7
-
-	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
-	cmp		$0x0100,%rcx
-	jl		.Lxorpart8
-	vpxord		0x00e0(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x00e0(%rsi)
-	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
-
-	# xor remaining blocks, write to output
-	vmovdqa64	%ymm4,%ymm0
-	cmp		$0x0120,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0100(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0100(%rsi)
-
-	vmovdqa64	%ymm12,%ymm0
-	cmp		$0x0140,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0120(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0120(%rsi)
-
-	vmovdqa64	%ymm6,%ymm0
-	cmp		$0x0160,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0140(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0140(%rsi)
-
-	vmovdqa64	%ymm14,%ymm0
-	cmp		$0x0180,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0160(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0160(%rsi)
-
-	vmovdqa64	%ymm5,%ymm0
-	cmp		$0x01a0,%rcx
-	jl		.Lxorpart8
-	vpxord		0x0180(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x0180(%rsi)
-
-	vmovdqa64	%ymm13,%ymm0
-	cmp		$0x01c0,%rcx
-	jl		.Lxorpart8
-	vpxord		0x01a0(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x01a0(%rsi)
-
-	vmovdqa64	%ymm7,%ymm0
-	cmp		$0x01e0,%rcx
-	jl		.Lxorpart8
-	vpxord		0x01c0(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x01c0(%rsi)
-
-	vmovdqa64	%ymm15,%ymm0
-	cmp		$0x0200,%rcx
-	jl		.Lxorpart8
-	vpxord		0x01e0(%rdx),%ymm0,%ymm0
-	vmovdqu64	%ymm0,0x01e0(%rsi)
-
-.Ldone8:
-	vzeroupper
-	ret
-
-.Lxorpart8:
-	# xor remaining bytes from partial register into output
-	mov		%rcx,%rax
-	and		$0x1f,%rcx
-	jz		.Ldone8
-	mov		%rax,%r9
-	and		$~0x1f,%r9
-
-	mov		$1,%rax
-	shld		%cl,%rax,%rax
-	sub		$1,%rax
-	kmovq		%rax,%k1
-
-	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
-	vpxord		%ymm0,%ymm1,%ymm1
-	vmovdqu8	%ymm1,(%rsi,%r9){%k1}
-
-	jmp		.Ldone8
-
-ENDPROC(chacha20_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
deleted file mode 100644
index f6792789f875..000000000000
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-.section	.rodata.cst16.ROT8, "aM", @progbits, 16
-.align 16
-ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
-.section	.rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
-.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
-.align 16
-CTRINC:	.octa 0x00000003000000020000000100000000
-
-.text
-
-/*
- * chacha20_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
- * function performs matrix operations on four words in parallel, but requires
- * shuffling to rearrange the words after each round.  8/16-bit word rotation is
- * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
- * rotation uses traditional shift+OR.
- *
- * Clobbers: %ecx, %xmm4-%xmm7
- */
-chacha20_permute:
-
-	movdqa		ROT8(%rip),%xmm4
-	movdqa		ROT16(%rip),%xmm5
-	mov		$10,%ecx
-
-.Ldoubleround:
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	paddd		%xmm1,%xmm0
-	pxor		%xmm0,%xmm3
-	pshufb		%xmm5,%xmm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	paddd		%xmm3,%xmm2
-	pxor		%xmm2,%xmm1
-	movdqa		%xmm1,%xmm6
-	pslld		$12,%xmm6
-	psrld		$20,%xmm1
-	por		%xmm6,%xmm1
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	paddd		%xmm1,%xmm0
-	pxor		%xmm0,%xmm3
-	pshufb		%xmm4,%xmm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	paddd		%xmm3,%xmm2
-	pxor		%xmm2,%xmm1
-	movdqa		%xmm1,%xmm7
-	pslld		$7,%xmm7
-	psrld		$25,%xmm1
-	por		%xmm7,%xmm1
-
-	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	pshufd		$0x39,%xmm1,%xmm1
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	pshufd		$0x4e,%xmm2,%xmm2
-	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	pshufd		$0x93,%xmm3,%xmm3
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	paddd		%xmm1,%xmm0
-	pxor		%xmm0,%xmm3
-	pshufb		%xmm5,%xmm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	paddd		%xmm3,%xmm2
-	pxor		%xmm2,%xmm1
-	movdqa		%xmm1,%xmm6
-	pslld		$12,%xmm6
-	psrld		$20,%xmm1
-	por		%xmm6,%xmm1
-
-	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	paddd		%xmm1,%xmm0
-	pxor		%xmm0,%xmm3
-	pshufb		%xmm4,%xmm3
-
-	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	paddd		%xmm3,%xmm2
-	pxor		%xmm2,%xmm1
-	movdqa		%xmm1,%xmm7
-	pslld		$7,%xmm7
-	psrld		$25,%xmm1
-	por		%xmm7,%xmm1
-
-	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	pshufd		$0x93,%xmm1,%xmm1
-	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	pshufd		$0x4e,%xmm2,%xmm2
-	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	pshufd		$0x39,%xmm3,%xmm3
-
-	dec		%ecx
-	jnz		.Ldoubleround
-
-	ret
-ENDPROC(chacha20_permute)
-
-ENTRY(chacha20_block_xor_ssse3)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 1 data block output, o
-	# %rdx: up to 1 data block input, i
-	# %rcx: input/output length in bytes
-	FRAME_BEGIN
-
-	# x0..3 = s0..3
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
-	movdqa		%xmm0,%xmm8
-	movdqa		%xmm1,%xmm9
-	movdqa		%xmm2,%xmm10
-	movdqa		%xmm3,%xmm11
-
-	mov		%rcx,%rax
-	call		chacha20_permute
-
-	# o0 = i0 ^ (x0 + s0)
-	paddd		%xmm8,%xmm0
-	cmp		$0x10,%rax
-	jl		.Lxorpart
-	movdqu		0x00(%rdx),%xmm4
-	pxor		%xmm4,%xmm0
-	movdqu		%xmm0,0x00(%rsi)
-	# o1 = i1 ^ (x1 + s1)
-	paddd		%xmm9,%xmm1
-	movdqa		%xmm1,%xmm0
-	cmp		$0x20,%rax
-	jl		.Lxorpart
-	movdqu		0x10(%rdx),%xmm0
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x10(%rsi)
-	# o2 = i2 ^ (x2 + s2)
-	paddd		%xmm10,%xmm2
-	movdqa		%xmm2,%xmm0
-	cmp		$0x30,%rax
-	jl		.Lxorpart
-	movdqu		0x20(%rdx),%xmm0
-	pxor		%xmm2,%xmm0
-	movdqu		%xmm0,0x20(%rsi)
-	# o3 = i3 ^ (x3 + s3)
-	paddd		%xmm11,%xmm3
-	movdqa		%xmm3,%xmm0
-	cmp		$0x40,%rax
-	jl		.Lxorpart
-	movdqu		0x30(%rdx),%xmm0
-	pxor		%xmm3,%xmm0
-	movdqu		%xmm0,0x30(%rsi)
-
-.Ldone:
-	FRAME_END
-	ret
-
-.Lxorpart:
-	# xor remaining bytes from partial register into output
-	mov		%rax,%r9
-	and		$0x0f,%r9
-	jz		.Ldone
-	and		$~0x0f,%rax
-
-	mov		%rsi,%r11
-
-	lea		8(%rsp),%r10
-	sub		$0x10,%rsp
-	and		$~31,%rsp
-
-	lea		(%rdx,%rax),%rsi
-	mov		%rsp,%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	pxor		0x00(%rsp),%xmm0
-	movdqa		%xmm0,0x00(%rsp)
-
-	mov		%rsp,%rsi
-	lea		(%r11,%rax),%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	lea		-8(%r10),%rsp
-	jmp		.Ldone
-
-ENDPROC(chacha20_block_xor_ssse3)
-
-ENTRY(hchacha20_block_ssse3)
-	# %rdi: Input state matrix, s
-	# %rsi: output (8 32-bit words)
-	FRAME_BEGIN
-
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
-
-	call		chacha20_permute
-
-	movdqu		%xmm0,0x00(%rsi)
-	movdqu		%xmm3,0x10(%rsi)
-
-	FRAME_END
-	ret
-ENDPROC(hchacha20_block_ssse3)
-
-ENTRY(chacha20_4block_xor_ssse3)
-	# %rdi: Input state matrix, s
-	# %rsi: up to 4 data blocks output, o
-	# %rdx: up to 4 data blocks input, i
-	# %rcx: input/output length in bytes
-
-	# This function encrypts four consecutive ChaCha20 blocks by loading the
-	# the state matrix in SSE registers four times. As we need some scratch
-	# registers, we save the first four registers on the stack. The
-	# algorithm performs each operation on the corresponding word of each
-	# state matrix, hence requires no word shuffling. For final XORing step
-	# we transpose the matrix by interleaving 32- and then 64-bit words,
-	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
-	# done with the slightly better performing SSSE3 byte shuffling,
-	# 7/12-bit word rotation uses traditional shift+OR.
-
-	lea		8(%rsp),%r10
-	sub		$0x80,%rsp
-	and		$~63,%rsp
-	mov		%rcx,%rax
-
-	# x0..15[0-3] = s0..3[0..3]
-	movq		0x00(%rdi),%xmm1
-	pshufd		$0x00,%xmm1,%xmm0
-	pshufd		$0x55,%xmm1,%xmm1
-	movq		0x08(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	movq		0x10(%rdi),%xmm5
-	pshufd		$0x00,%xmm5,%xmm4
-	pshufd		$0x55,%xmm5,%xmm5
-	movq		0x18(%rdi),%xmm7
-	pshufd		$0x00,%xmm7,%xmm6
-	pshufd		$0x55,%xmm7,%xmm7
-	movq		0x20(%rdi),%xmm9
-	pshufd		$0x00,%xmm9,%xmm8
-	pshufd		$0x55,%xmm9,%xmm9
-	movq		0x28(%rdi),%xmm11
-	pshufd		$0x00,%xmm11,%xmm10
-	pshufd		$0x55,%xmm11,%xmm11
-	movq		0x30(%rdi),%xmm13
-	pshufd		$0x00,%xmm13,%xmm12
-	pshufd		$0x55,%xmm13,%xmm13
-	movq		0x38(%rdi),%xmm15
-	pshufd		$0x00,%xmm15,%xmm14
-	pshufd		$0x55,%xmm15,%xmm15
-	# x0..3 on stack
-	movdqa		%xmm0,0x00(%rsp)
-	movdqa		%xmm1,0x10(%rsp)
-	movdqa		%xmm2,0x20(%rsp)
-	movdqa		%xmm3,0x30(%rsp)
-
-	movdqa		CTRINC(%rip),%xmm1
-	movdqa		ROT8(%rip),%xmm2
-	movdqa		ROT16(%rip),%xmm3
-
-	# x12 += counter values 0-3
-	paddd		%xmm1,%xmm12
-
-	mov		$10,%ecx
-
-.Ldoubleround4:
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	movdqa		0x00(%rsp),%xmm0
-	paddd		%xmm4,%xmm0
-	movdqa		%xmm0,0x00(%rsp)
-	pxor		%xmm0,%xmm12
-	pshufb		%xmm3,%xmm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	movdqa		0x10(%rsp),%xmm0
-	paddd		%xmm5,%xmm0
-	movdqa		%xmm0,0x10(%rsp)
-	pxor		%xmm0,%xmm13
-	pshufb		%xmm3,%xmm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	movdqa		0x20(%rsp),%xmm0
-	paddd		%xmm6,%xmm0
-	movdqa		%xmm0,0x20(%rsp)
-	pxor		%xmm0,%xmm14
-	pshufb		%xmm3,%xmm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	movdqa		0x30(%rsp),%xmm0
-	paddd		%xmm7,%xmm0
-	movdqa		%xmm0,0x30(%rsp)
-	pxor		%xmm0,%xmm15
-	pshufb		%xmm3,%xmm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	paddd		%xmm12,%xmm8
-	pxor		%xmm8,%xmm4
-	movdqa		%xmm4,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm4
-	por		%xmm0,%xmm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	paddd		%xmm13,%xmm9
-	pxor		%xmm9,%xmm5
-	movdqa		%xmm5,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm5
-	por		%xmm0,%xmm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	paddd		%xmm14,%xmm10
-	pxor		%xmm10,%xmm6
-	movdqa		%xmm6,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm6
-	por		%xmm0,%xmm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	paddd		%xmm15,%xmm11
-	pxor		%xmm11,%xmm7
-	movdqa		%xmm7,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm7
-	por		%xmm0,%xmm7
-
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	movdqa		0x00(%rsp),%xmm0
-	paddd		%xmm4,%xmm0
-	movdqa		%xmm0,0x00(%rsp)
-	pxor		%xmm0,%xmm12
-	pshufb		%xmm2,%xmm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	movdqa		0x10(%rsp),%xmm0
-	paddd		%xmm5,%xmm0
-	movdqa		%xmm0,0x10(%rsp)
-	pxor		%xmm0,%xmm13
-	pshufb		%xmm2,%xmm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	movdqa		0x20(%rsp),%xmm0
-	paddd		%xmm6,%xmm0
-	movdqa		%xmm0,0x20(%rsp)
-	pxor		%xmm0,%xmm14
-	pshufb		%xmm2,%xmm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	movdqa		0x30(%rsp),%xmm0
-	paddd		%xmm7,%xmm0
-	movdqa		%xmm0,0x30(%rsp)
-	pxor		%xmm0,%xmm15
-	pshufb		%xmm2,%xmm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	paddd		%xmm12,%xmm8
-	pxor		%xmm8,%xmm4
-	movdqa		%xmm4,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm4
-	por		%xmm0,%xmm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	paddd		%xmm13,%xmm9
-	pxor		%xmm9,%xmm5
-	movdqa		%xmm5,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm5
-	por		%xmm0,%xmm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	paddd		%xmm14,%xmm10
-	pxor		%xmm10,%xmm6
-	movdqa		%xmm6,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm6
-	por		%xmm0,%xmm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	paddd		%xmm15,%xmm11
-	pxor		%xmm11,%xmm7
-	movdqa		%xmm7,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm7
-	por		%xmm0,%xmm7
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	movdqa		0x00(%rsp),%xmm0
-	paddd		%xmm5,%xmm0
-	movdqa		%xmm0,0x00(%rsp)
-	pxor		%xmm0,%xmm15
-	pshufb		%xmm3,%xmm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-	movdqa		0x10(%rsp),%xmm0
-	paddd		%xmm6,%xmm0
-	movdqa		%xmm0,0x10(%rsp)
-	pxor		%xmm0,%xmm12
-	pshufb		%xmm3,%xmm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	movdqa		0x20(%rsp),%xmm0
-	paddd		%xmm7,%xmm0
-	movdqa		%xmm0,0x20(%rsp)
-	pxor		%xmm0,%xmm13
-	pshufb		%xmm3,%xmm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	movdqa		0x30(%rsp),%xmm0
-	paddd		%xmm4,%xmm0
-	movdqa		%xmm0,0x30(%rsp)
-	pxor		%xmm0,%xmm14
-	pshufb		%xmm3,%xmm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	paddd		%xmm15,%xmm10
-	pxor		%xmm10,%xmm5
-	movdqa		%xmm5,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm5
-	por		%xmm0,%xmm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	paddd		%xmm12,%xmm11
-	pxor		%xmm11,%xmm6
-	movdqa		%xmm6,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm6
-	por		%xmm0,%xmm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	paddd		%xmm13,%xmm8
-	pxor		%xmm8,%xmm7
-	movdqa		%xmm7,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm7
-	por		%xmm0,%xmm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	paddd		%xmm14,%xmm9
-	pxor		%xmm9,%xmm4
-	movdqa		%xmm4,%xmm0
-	pslld		$12,%xmm0
-	psrld		$20,%xmm4
-	por		%xmm0,%xmm4
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	movdqa		0x00(%rsp),%xmm0
-	paddd		%xmm5,%xmm0
-	movdqa		%xmm0,0x00(%rsp)
-	pxor		%xmm0,%xmm15
-	pshufb		%xmm2,%xmm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	movdqa		0x10(%rsp),%xmm0
-	paddd		%xmm6,%xmm0
-	movdqa		%xmm0,0x10(%rsp)
-	pxor		%xmm0,%xmm12
-	pshufb		%xmm2,%xmm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	movdqa		0x20(%rsp),%xmm0
-	paddd		%xmm7,%xmm0
-	movdqa		%xmm0,0x20(%rsp)
-	pxor		%xmm0,%xmm13
-	pshufb		%xmm2,%xmm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	movdqa		0x30(%rsp),%xmm0
-	paddd		%xmm4,%xmm0
-	movdqa		%xmm0,0x30(%rsp)
-	pxor		%xmm0,%xmm14
-	pshufb		%xmm2,%xmm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	paddd		%xmm15,%xmm10
-	pxor		%xmm10,%xmm5
-	movdqa		%xmm5,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm5
-	por		%xmm0,%xmm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	paddd		%xmm12,%xmm11
-	pxor		%xmm11,%xmm6
-	movdqa		%xmm6,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm6
-	por		%xmm0,%xmm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	paddd		%xmm13,%xmm8
-	pxor		%xmm8,%xmm7
-	movdqa		%xmm7,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm7
-	por		%xmm0,%xmm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	paddd		%xmm14,%xmm9
-	pxor		%xmm9,%xmm4
-	movdqa		%xmm4,%xmm0
-	pslld		$7,%xmm0
-	psrld		$25,%xmm4
-	por		%xmm0,%xmm4
-
-	dec		%ecx
-	jnz		.Ldoubleround4
-
-	# x0[0-3] += s0[0]
-	# x1[0-3] += s0[1]
-	movq		0x00(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		0x00(%rsp),%xmm2
-	movdqa		%xmm2,0x00(%rsp)
-	paddd		0x10(%rsp),%xmm3
-	movdqa		%xmm3,0x10(%rsp)
-	# x2[0-3] += s0[2]
-	# x3[0-3] += s0[3]
-	movq		0x08(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		0x20(%rsp),%xmm2
-	movdqa		%xmm2,0x20(%rsp)
-	paddd		0x30(%rsp),%xmm3
-	movdqa		%xmm3,0x30(%rsp)
-
-	# x4[0-3] += s1[0]
-	# x5[0-3] += s1[1]
-	movq		0x10(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		%xmm2,%xmm4
-	paddd		%xmm3,%xmm5
-	# x6[0-3] += s1[2]
-	# x7[0-3] += s1[3]
-	movq		0x18(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		%xmm2,%xmm6
-	paddd		%xmm3,%xmm7
-
-	# x8[0-3] += s2[0]
-	# x9[0-3] += s2[1]
-	movq		0x20(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		%xmm2,%xmm8
-	paddd		%xmm3,%xmm9
-	# x10[0-3] += s2[2]
-	# x11[0-3] += s2[3]
-	movq		0x28(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		%xmm2,%xmm10
-	paddd		%xmm3,%xmm11
-
-	# x12[0-3] += s3[0]
-	# x13[0-3] += s3[1]
-	movq		0x30(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		%xmm2,%xmm12
-	paddd		%xmm3,%xmm13
-	# x14[0-3] += s3[2]
-	# x15[0-3] += s3[3]
-	movq		0x38(%rdi),%xmm3
-	pshufd		$0x00,%xmm3,%xmm2
-	pshufd		$0x55,%xmm3,%xmm3
-	paddd		%xmm2,%xmm14
-	paddd		%xmm3,%xmm15
-
-	# x12 += counter values 0-3
-	paddd		%xmm1,%xmm12
-
-	# interleave 32-bit words in state n, n+1
-	movdqa		0x00(%rsp),%xmm0
-	movdqa		0x10(%rsp),%xmm1
-	movdqa		%xmm0,%xmm2
-	punpckldq	%xmm1,%xmm2
-	punpckhdq	%xmm1,%xmm0
-	movdqa		%xmm2,0x00(%rsp)
-	movdqa		%xmm0,0x10(%rsp)
-	movdqa		0x20(%rsp),%xmm0
-	movdqa		0x30(%rsp),%xmm1
-	movdqa		%xmm0,%xmm2
-	punpckldq	%xmm1,%xmm2
-	punpckhdq	%xmm1,%xmm0
-	movdqa		%xmm2,0x20(%rsp)
-	movdqa		%xmm0,0x30(%rsp)
-	movdqa		%xmm4,%xmm0
-	punpckldq	%xmm5,%xmm4
-	punpckhdq	%xmm5,%xmm0
-	movdqa		%xmm0,%xmm5
-	movdqa		%xmm6,%xmm0
-	punpckldq	%xmm7,%xmm6
-	punpckhdq	%xmm7,%xmm0
-	movdqa		%xmm0,%xmm7
-	movdqa		%xmm8,%xmm0
-	punpckldq	%xmm9,%xmm8
-	punpckhdq	%xmm9,%xmm0
-	movdqa		%xmm0,%xmm9
-	movdqa		%xmm10,%xmm0
-	punpckldq	%xmm11,%xmm10
-	punpckhdq	%xmm11,%xmm0
-	movdqa		%xmm0,%xmm11
-	movdqa		%xmm12,%xmm0
-	punpckldq	%xmm13,%xmm12
-	punpckhdq	%xmm13,%xmm0
-	movdqa		%xmm0,%xmm13
-	movdqa		%xmm14,%xmm0
-	punpckldq	%xmm15,%xmm14
-	punpckhdq	%xmm15,%xmm0
-	movdqa		%xmm0,%xmm15
-
-	# interleave 64-bit words in state n, n+2
-	movdqa		0x00(%rsp),%xmm0
-	movdqa		0x20(%rsp),%xmm1
-	movdqa		%xmm0,%xmm2
-	punpcklqdq	%xmm1,%xmm2
-	punpckhqdq	%xmm1,%xmm0
-	movdqa		%xmm2,0x00(%rsp)
-	movdqa		%xmm0,0x20(%rsp)
-	movdqa		0x10(%rsp),%xmm0
-	movdqa		0x30(%rsp),%xmm1
-	movdqa		%xmm0,%xmm2
-	punpcklqdq	%xmm1,%xmm2
-	punpckhqdq	%xmm1,%xmm0
-	movdqa		%xmm2,0x10(%rsp)
-	movdqa		%xmm0,0x30(%rsp)
-	movdqa		%xmm4,%xmm0
-	punpcklqdq	%xmm6,%xmm4
-	punpckhqdq	%xmm6,%xmm0
-	movdqa		%xmm0,%xmm6
-	movdqa		%xmm5,%xmm0
-	punpcklqdq	%xmm7,%xmm5
-	punpckhqdq	%xmm7,%xmm0
-	movdqa		%xmm0,%xmm7
-	movdqa		%xmm8,%xmm0
-	punpcklqdq	%xmm10,%xmm8
-	punpckhqdq	%xmm10,%xmm0
-	movdqa		%xmm0,%xmm10
-	movdqa		%xmm9,%xmm0
-	punpcklqdq	%xmm11,%xmm9
-	punpckhqdq	%xmm11,%xmm0
-	movdqa		%xmm0,%xmm11
-	movdqa		%xmm12,%xmm0
-	punpcklqdq	%xmm14,%xmm12
-	punpckhqdq	%xmm14,%xmm0
-	movdqa		%xmm0,%xmm14
-	movdqa		%xmm13,%xmm0
-	punpcklqdq	%xmm15,%xmm13
-	punpckhqdq	%xmm15,%xmm0
-	movdqa		%xmm0,%xmm15
-
-	# xor with corresponding input, write to output
-	movdqa		0x00(%rsp),%xmm0
-	cmp		$0x10,%rax
-	jl		.Lxorpart4
-	movdqu		0x00(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x00(%rsi)
-
-	movdqu		%xmm4,%xmm0
-	cmp		$0x20,%rax
-	jl		.Lxorpart4
-	movdqu		0x10(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x10(%rsi)
-
-	movdqu		%xmm8,%xmm0
-	cmp		$0x30,%rax
-	jl		.Lxorpart4
-	movdqu		0x20(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x20(%rsi)
-
-	movdqu		%xmm12,%xmm0
-	cmp		$0x40,%rax
-	jl		.Lxorpart4
-	movdqu		0x30(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x30(%rsi)
-
-	movdqa		0x20(%rsp),%xmm0
-	cmp		$0x50,%rax
-	jl		.Lxorpart4
-	movdqu		0x40(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x40(%rsi)
-
-	movdqu		%xmm6,%xmm0
-	cmp		$0x60,%rax
-	jl		.Lxorpart4
-	movdqu		0x50(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x50(%rsi)
-
-	movdqu		%xmm10,%xmm0
-	cmp		$0x70,%rax
-	jl		.Lxorpart4
-	movdqu		0x60(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x60(%rsi)
-
-	movdqu		%xmm14,%xmm0
-	cmp		$0x80,%rax
-	jl		.Lxorpart4
-	movdqu		0x70(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x70(%rsi)
-
-	movdqa		0x10(%rsp),%xmm0
-	cmp		$0x90,%rax
-	jl		.Lxorpart4
-	movdqu		0x80(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x80(%rsi)
-
-	movdqu		%xmm5,%xmm0
-	cmp		$0xa0,%rax
-	jl		.Lxorpart4
-	movdqu		0x90(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x90(%rsi)
-
-	movdqu		%xmm9,%xmm0
-	cmp		$0xb0,%rax
-	jl		.Lxorpart4
-	movdqu		0xa0(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0xa0(%rsi)
-
-	movdqu		%xmm13,%xmm0
-	cmp		$0xc0,%rax
-	jl		.Lxorpart4
-	movdqu		0xb0(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0xb0(%rsi)
-
-	movdqa		0x30(%rsp),%xmm0
-	cmp		$0xd0,%rax
-	jl		.Lxorpart4
-	movdqu		0xc0(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0xc0(%rsi)
-
-	movdqu		%xmm7,%xmm0
-	cmp		$0xe0,%rax
-	jl		.Lxorpart4
-	movdqu		0xd0(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0xd0(%rsi)
-
-	movdqu		%xmm11,%xmm0
-	cmp		$0xf0,%rax
-	jl		.Lxorpart4
-	movdqu		0xe0(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0xe0(%rsi)
-
-	movdqu		%xmm15,%xmm0
-	cmp		$0x100,%rax
-	jl		.Lxorpart4
-	movdqu		0xf0(%rdx),%xmm1
-	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0xf0(%rsi)
-
-.Ldone4:
-	lea		-8(%r10),%rsp
-	ret
-
-.Lxorpart4:
-	# xor remaining bytes from partial register into output
-	mov		%rax,%r9
-	and		$0x0f,%r9
-	jz		.Ldone4
-	and		$~0x0f,%rax
-
-	mov		%rsi,%r11
-
-	lea		(%rdx,%rax),%rsi
-	mov		%rsp,%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	pxor		0x00(%rsp),%xmm0
-	movdqa		%xmm0,0x00(%rsp)
-
-	mov		%rsp,%rsi
-	lea		(%r11,%rax),%rdi
-	mov		%r9,%rcx
-	rep movsb
-
-	jmp		.Ldone4
-
-ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
deleted file mode 100644
index 70d388e4a3a2..000000000000
--- a/arch/x86/crypto/chacha20_glue.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-
-#define CHACHA20_STATE_ALIGN 16
-
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
-					 unsigned int len);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
-					  unsigned int len);
-asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-					 unsigned int len);
-asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-					 unsigned int len);
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
-					 unsigned int len);
-static bool chacha20_use_avx2;
-#ifdef CONFIG_AS_AVX512
-asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-					     unsigned int len);
-asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-					     unsigned int len);
-asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
-					     unsigned int len);
-static bool chacha20_use_avx512vl;
-#endif
-#endif
-
-static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
-{
-	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
-	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
-	if (chacha20_use_avx512vl) {
-		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
-			chacha20_8block_xor_avx512vl(state, dst, src, bytes);
-			bytes -= CHACHA_BLOCK_SIZE * 8;
-			src += CHACHA_BLOCK_SIZE * 8;
-			dst += CHACHA_BLOCK_SIZE * 8;
-			state[12] += 8;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 4) {
-			chacha20_8block_xor_avx512vl(state, dst, src, bytes);
-			state[12] += chacha20_advance(bytes, 8);
-			return;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 2) {
-			chacha20_4block_xor_avx512vl(state, dst, src, bytes);
-			state[12] += chacha20_advance(bytes, 4);
-			return;
-		}
-		if (bytes) {
-			chacha20_2block_xor_avx512vl(state, dst, src, bytes);
-			state[12] += chacha20_advance(bytes, 2);
-			return;
-		}
-	}
-#endif
-	if (chacha20_use_avx2) {
-		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
-			chacha20_8block_xor_avx2(state, dst, src, bytes);
-			bytes -= CHACHA_BLOCK_SIZE * 8;
-			src += CHACHA_BLOCK_SIZE * 8;
-			dst += CHACHA_BLOCK_SIZE * 8;
-			state[12] += 8;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 4) {
-			chacha20_8block_xor_avx2(state, dst, src, bytes);
-			state[12] += chacha20_advance(bytes, 8);
-			return;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE * 2) {
-			chacha20_4block_xor_avx2(state, dst, src, bytes);
-			state[12] += chacha20_advance(bytes, 4);
-			return;
-		}
-		if (bytes > CHACHA_BLOCK_SIZE) {
-			chacha20_2block_xor_avx2(state, dst, src, bytes);
-			state[12] += chacha20_advance(bytes, 2);
-			return;
-		}
-	}
-#endif
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_ssse3(state, dst, src, bytes);
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	if (bytes > CHACHA_BLOCK_SIZE) {
-		chacha20_4block_xor_ssse3(state, dst, src, bytes);
-		state[12] += chacha20_advance(bytes, 4);
-		return;
-	}
-	if (bytes) {
-		chacha20_block_xor_ssse3(state, dst, src, bytes);
-		state[12]++;
-	}
-}
-
-static int chacha20_simd_stream_xor(struct skcipher_request *req,
-				    struct chacha_ctx *ctx, u8 *iv)
-{
-	u32 *state, state_buf[16 + 2] __aligned(8);
-	struct skcipher_walk walk;
-	int err;
-
-	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_chacha_init(state, ctx, iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
-
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static int chacha20_simd(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
-		return crypto_chacha_crypt(req);
-
-	kernel_fpu_begin();
-	err = chacha20_simd_stream_xor(req, ctx, req->iv);
-	kernel_fpu_end();
-	return err;
-}
-
-static int xchacha20_simd(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct chacha_ctx subctx;
-	u32 *state, state_buf[16 + 2] __aligned(8);
-	u8 real_iv[16];
-	int err;
-
-	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
-		return crypto_xchacha_crypt(req);
-
-	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-	crypto_chacha_init(state, ctx, req->iv);
-
-	kernel_fpu_begin();
-
-	hchacha20_block_ssse3(state, subctx.key);
-
-	memcpy(&real_iv[0], req->iv + 24, 8);
-	memcpy(&real_iv[8], req->iv + 16, 8);
-	err = chacha20_simd_stream_xor(req, &subctx, real_iv);
-
-	kernel_fpu_end();
-
-	return err;
-}
-
-static struct skcipher_alg algs[] = {
-	{
-		.base.cra_name		= "chacha20",
-		.base.cra_driver_name	= "chacha20-simd",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= CHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= chacha20_simd,
-		.decrypt		= chacha20_simd,
-	}, {
-		.base.cra_name		= "xchacha20",
-		.base.cra_driver_name	= "xchacha20-simd",
-		.base.cra_priority	= 300,
-		.base.cra_blocksize	= 1,
-		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
-		.base.cra_module	= THIS_MODULE,
-
-		.min_keysize		= CHACHA_KEY_SIZE,
-		.max_keysize		= CHACHA_KEY_SIZE,
-		.ivsize			= XCHACHA_IV_SIZE,
-		.chunksize		= CHACHA_BLOCK_SIZE,
-		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= xchacha20_simd,
-		.decrypt		= xchacha20_simd,
-	},
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
-	chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-			    boot_cpu_has(X86_FEATURE_AVX2) &&
-			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
-	chacha20_use_avx512vl = chacha20_use_avx2 &&
-				boot_cpu_has(X86_FEATURE_AVX512VL) &&
-				boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
-	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-simd");
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
new file mode 100644
index 000000000000..35fd02b50d27
--- /dev/null
+++ b/arch/x86/crypto/chacha_glue.c
@@ -0,0 +1,270 @@
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+	if (chacha_use_avx512vl) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes) {
+			chacha_2block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+#endif
+	if (chacha_use_avx2) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE) {
+			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+#endif
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	if (bytes > CHACHA_BLOCK_SIZE) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state[12] += chacha_advance(bytes, 4);
+		return;
+	}
+	if (bytes) {
+		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state[12]++;
+	}
+}
+
+static int chacha_simd_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	struct skcipher_walk walk;
+	int err;
+
+	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
+
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_chacha_crypt(req);
+
+	kernel_fpu_begin();
+	err = chacha_simd_stream_xor(req, ctx, req->iv);
+	kernel_fpu_end();
+	return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	u8 real_iv[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_xchacha_crypt(req);
+
+	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_fpu_begin();
+
+	hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+	subctx.nrounds = ctx->nrounds;
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	err = chacha_simd_stream_xor(req, &subctx, real_iv);
+
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_simd,
+		.decrypt		= chacha_simd,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_simd,
+		.decrypt		= xchacha_simd,
+	},
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_SSSE3))
+		return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+			  boot_cpu_has(X86_FEATURE_AVX2) &&
+			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+	chacha_use_avx512vl = chacha_use_avx2 &&
+			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
+			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
-- 
cgit v1.2.3-59-g8ed1b


From 7a507d62258afd514583fadf1482451079fa0e4d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 22:20:04 -0800
Subject: crypto: x86/chacha - add XChaCha12 support

Now that the x86_64 SIMD implementations of ChaCha20 and XChaCha20 have
been refactored to support varying the number of rounds, add support for
XChaCha12.  This is identical to XChaCha20 except for the number of
rounds, which is 12 instead of 20.  This can be used by Adiantum.

Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha_glue.c | 17 +++++++++++++++++
 crypto/Kconfig                |  4 ++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 35fd02b50d27..d19c2908be90 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -232,6 +232,21 @@ static struct skcipher_alg algs[] = {
 		.setkey			= crypto_chacha20_setkey,
 		.encrypt		= xchacha_simd,
 		.decrypt		= xchacha_simd,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_simd,
+		.decrypt		= xchacha_simd,
 	},
 };
 
@@ -268,3 +283,5 @@ MODULE_ALIAS_CRYPTO("chacha20");
 MODULE_ALIAS_CRYPTO("chacha20-simd");
 MODULE_ALIAS_CRYPTO("xchacha20");
 MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index dc3a0e3ea2ac..045af6eeb7e2 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1473,8 +1473,8 @@ config CRYPTO_CHACHA20_X86_64
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 	help
-	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20
-	  and XChaCha20 stream ciphers.
+	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
+	  XChaCha20, and XChaCha12 stream ciphers.
 
 config CRYPTO_SEED
 	tristate "SEED cipher algorithm"
-- 
cgit v1.2.3-59-g8ed1b


From a033aed5a84eb93a32929b6862602cb283d39e82 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 4 Dec 2018 22:20:05 -0800
Subject: crypto: x86/chacha - yield the FPU occasionally

To improve responsiveness, yield the FPU (temporarily re-enabling
preemption) every 4 KiB encrypted/decrypted, rather than keeping
preemption disabled during the entire encryption/decryption operation.

Alternatively we could do this for every skcipher_walk step, but steps
may be small in some cases, and yielding the FPU is expensive on x86.

Suggested-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha_glue.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index d19c2908be90..9b1d3fac4943 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -132,6 +132,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req,
 {
 	u32 *state, state_buf[16 + 2] __aligned(8);
 	struct skcipher_walk walk;
+	int next_yield = 4096; /* bytes until next FPU yield */
 	int err;
 
 	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
@@ -144,12 +145,21 @@ static int chacha_simd_stream_xor(struct skcipher_request *req,
 	while (walk.nbytes > 0) {
 		unsigned int nbytes = walk.nbytes;
 
-		if (nbytes < walk.total)
+		if (nbytes < walk.total) {
 			nbytes = round_down(nbytes, walk.stride);
+			next_yield -= nbytes;
+		}
 
 		chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 			      nbytes, ctx->nrounds);
 
+		if (next_yield <= 0) {
+			/* temporarily allow preemption */
+			kernel_fpu_end();
+			kernel_fpu_begin();
+			next_yield = 4096;
+		}
+
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 5569e8c07447344cdc3771378ba4e0da0b94c2a4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 6 Dec 2018 12:31:54 -0800
Subject: crypto: xchacha - add test vector from XChaCha20 draft RFC

There is a draft specification for XChaCha20 being worked on.  Add the
XChaCha20 test vector from the appendix so that we can be extra sure the
kernel's implementation is compatible.

I also recomputed the ciphertext with XChaCha12 and added it there too,
to keep the tests for XChaCha20 and XChaCha12 in sync.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.h | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 176 insertions(+), 2 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index e7e56a8febbc..357cf4cbcbb1 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -32800,7 +32800,94 @@ static const struct cipher_testvec xchacha20_tv_template[] = {
 		.also_non_np = 1,
 		.np	= 3,
 		.tap	= { 1200, 1, 80 },
-	},
+	}, { /* test vector from https://tools.ietf.org/html/draft-arciszewski-xchacha-02#appendix-A.3.2 */
+		.key	= "\x80\x81\x82\x83\x84\x85\x86\x87"
+			  "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+			  "\x90\x91\x92\x93\x94\x95\x96\x97"
+			  "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f",
+		.klen	= 32,
+		.iv	= "\x40\x41\x42\x43\x44\x45\x46\x47"
+			  "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+			  "\x50\x51\x52\x53\x54\x55\x56\x58"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x54\x68\x65\x20\x64\x68\x6f\x6c"
+			  "\x65\x20\x28\x70\x72\x6f\x6e\x6f"
+			  "\x75\x6e\x63\x65\x64\x20\x22\x64"
+			  "\x6f\x6c\x65\x22\x29\x20\x69\x73"
+			  "\x20\x61\x6c\x73\x6f\x20\x6b\x6e"
+			  "\x6f\x77\x6e\x20\x61\x73\x20\x74"
+			  "\x68\x65\x20\x41\x73\x69\x61\x74"
+			  "\x69\x63\x20\x77\x69\x6c\x64\x20"
+			  "\x64\x6f\x67\x2c\x20\x72\x65\x64"
+			  "\x20\x64\x6f\x67\x2c\x20\x61\x6e"
+			  "\x64\x20\x77\x68\x69\x73\x74\x6c"
+			  "\x69\x6e\x67\x20\x64\x6f\x67\x2e"
+			  "\x20\x49\x74\x20\x69\x73\x20\x61"
+			  "\x62\x6f\x75\x74\x20\x74\x68\x65"
+			  "\x20\x73\x69\x7a\x65\x20\x6f\x66"
+			  "\x20\x61\x20\x47\x65\x72\x6d\x61"
+			  "\x6e\x20\x73\x68\x65\x70\x68\x65"
+			  "\x72\x64\x20\x62\x75\x74\x20\x6c"
+			  "\x6f\x6f\x6b\x73\x20\x6d\x6f\x72"
+			  "\x65\x20\x6c\x69\x6b\x65\x20\x61"
+			  "\x20\x6c\x6f\x6e\x67\x2d\x6c\x65"
+			  "\x67\x67\x65\x64\x20\x66\x6f\x78"
+			  "\x2e\x20\x54\x68\x69\x73\x20\x68"
+			  "\x69\x67\x68\x6c\x79\x20\x65\x6c"
+			  "\x75\x73\x69\x76\x65\x20\x61\x6e"
+			  "\x64\x20\x73\x6b\x69\x6c\x6c\x65"
+			  "\x64\x20\x6a\x75\x6d\x70\x65\x72"
+			  "\x20\x69\x73\x20\x63\x6c\x61\x73"
+			  "\x73\x69\x66\x69\x65\x64\x20\x77"
+			  "\x69\x74\x68\x20\x77\x6f\x6c\x76"
+			  "\x65\x73\x2c\x20\x63\x6f\x79\x6f"
+			  "\x74\x65\x73\x2c\x20\x6a\x61\x63"
+			  "\x6b\x61\x6c\x73\x2c\x20\x61\x6e"
+			  "\x64\x20\x66\x6f\x78\x65\x73\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x74"
+			  "\x61\x78\x6f\x6e\x6f\x6d\x69\x63"
+			  "\x20\x66\x61\x6d\x69\x6c\x79\x20"
+			  "\x43\x61\x6e\x69\x64\x61\x65\x2e",
+		.ctext	= "\x45\x59\xab\xba\x4e\x48\xc1\x61"
+			  "\x02\xe8\xbb\x2c\x05\xe6\x94\x7f"
+			  "\x50\xa7\x86\xde\x16\x2f\x9b\x0b"
+			  "\x7e\x59\x2a\x9b\x53\xd0\xd4\xe9"
+			  "\x8d\x8d\x64\x10\xd5\x40\xa1\xa6"
+			  "\x37\x5b\x26\xd8\x0d\xac\xe4\xfa"
+			  "\xb5\x23\x84\xc7\x31\xac\xbf\x16"
+			  "\xa5\x92\x3c\x0c\x48\xd3\x57\x5d"
+			  "\x4d\x0d\x2c\x67\x3b\x66\x6f\xaa"
+			  "\x73\x10\x61\x27\x77\x01\x09\x3a"
+			  "\x6b\xf7\xa1\x58\xa8\x86\x42\x92"
+			  "\xa4\x1c\x48\xe3\xa9\xb4\xc0\xda"
+			  "\xec\xe0\xf8\xd9\x8d\x0d\x7e\x05"
+			  "\xb3\x7a\x30\x7b\xbb\x66\x33\x31"
+			  "\x64\xec\x9e\x1b\x24\xea\x0d\x6c"
+			  "\x3f\xfd\xdc\xec\x4f\x68\xe7\x44"
+			  "\x30\x56\x19\x3a\x03\xc8\x10\xe1"
+			  "\x13\x44\xca\x06\xd8\xed\x8a\x2b"
+			  "\xfb\x1e\x8d\x48\xcf\xa6\xbc\x0e"
+			  "\xb4\xe2\x46\x4b\x74\x81\x42\x40"
+			  "\x7c\x9f\x43\x1a\xee\x76\x99\x60"
+			  "\xe1\x5b\xa8\xb9\x68\x90\x46\x6e"
+			  "\xf2\x45\x75\x99\x85\x23\x85\xc6"
+			  "\x61\xf7\x52\xce\x20\xf9\xda\x0c"
+			  "\x09\xab\x6b\x19\xdf\x74\xe7\x6a"
+			  "\x95\x96\x74\x46\xf8\xd0\xfd\x41"
+			  "\x5e\x7b\xee\x2a\x12\xa1\x14\xc2"
+			  "\x0e\xb5\x29\x2a\xe7\xa3\x49\xae"
+			  "\x57\x78\x20\xd5\x52\x0a\x1f\x3f"
+			  "\xb6\x2a\x17\xce\x6a\x7e\x68\xfa"
+			  "\x7c\x79\x11\x1d\x88\x60\x92\x0b"
+			  "\xc0\x48\xef\x43\xfe\x84\x48\x6c"
+			  "\xcb\x87\xc2\x5f\x0a\xe0\x45\xf0"
+			  "\xcc\xe1\xe7\x98\x9a\x9a\xa2\x20"
+			  "\xa2\x8b\xdd\x48\x27\xe7\x51\xa2"
+			  "\x4a\x6d\x5c\x62\xd7\x90\xa6\x63"
+			  "\x93\xb9\x31\x11\xc1\xa5\x5d\xd7"
+			  "\x42\x1a\x10\x18\x49\x74\xc7\xc5",
+		.len	= 304,
+	}
 };
 
 /*
@@ -33378,7 +33465,94 @@ static const struct cipher_testvec xchacha12_tv_template[] = {
 		.also_non_np = 1,
 		.np	= 3,
 		.tap	= { 1200, 1, 80 },
-	},
+	}, {
+		.key	= "\x80\x81\x82\x83\x84\x85\x86\x87"
+			  "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+			  "\x90\x91\x92\x93\x94\x95\x96\x97"
+			  "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f",
+		.klen	= 32,
+		.iv	= "\x40\x41\x42\x43\x44\x45\x46\x47"
+			  "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+			  "\x50\x51\x52\x53\x54\x55\x56\x58"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x54\x68\x65\x20\x64\x68\x6f\x6c"
+			  "\x65\x20\x28\x70\x72\x6f\x6e\x6f"
+			  "\x75\x6e\x63\x65\x64\x20\x22\x64"
+			  "\x6f\x6c\x65\x22\x29\x20\x69\x73"
+			  "\x20\x61\x6c\x73\x6f\x20\x6b\x6e"
+			  "\x6f\x77\x6e\x20\x61\x73\x20\x74"
+			  "\x68\x65\x20\x41\x73\x69\x61\x74"
+			  "\x69\x63\x20\x77\x69\x6c\x64\x20"
+			  "\x64\x6f\x67\x2c\x20\x72\x65\x64"
+			  "\x20\x64\x6f\x67\x2c\x20\x61\x6e"
+			  "\x64\x20\x77\x68\x69\x73\x74\x6c"
+			  "\x69\x6e\x67\x20\x64\x6f\x67\x2e"
+			  "\x20\x49\x74\x20\x69\x73\x20\x61"
+			  "\x62\x6f\x75\x74\x20\x74\x68\x65"
+			  "\x20\x73\x69\x7a\x65\x20\x6f\x66"
+			  "\x20\x61\x20\x47\x65\x72\x6d\x61"
+			  "\x6e\x20\x73\x68\x65\x70\x68\x65"
+			  "\x72\x64\x20\x62\x75\x74\x20\x6c"
+			  "\x6f\x6f\x6b\x73\x20\x6d\x6f\x72"
+			  "\x65\x20\x6c\x69\x6b\x65\x20\x61"
+			  "\x20\x6c\x6f\x6e\x67\x2d\x6c\x65"
+			  "\x67\x67\x65\x64\x20\x66\x6f\x78"
+			  "\x2e\x20\x54\x68\x69\x73\x20\x68"
+			  "\x69\x67\x68\x6c\x79\x20\x65\x6c"
+			  "\x75\x73\x69\x76\x65\x20\x61\x6e"
+			  "\x64\x20\x73\x6b\x69\x6c\x6c\x65"
+			  "\x64\x20\x6a\x75\x6d\x70\x65\x72"
+			  "\x20\x69\x73\x20\x63\x6c\x61\x73"
+			  "\x73\x69\x66\x69\x65\x64\x20\x77"
+			  "\x69\x74\x68\x20\x77\x6f\x6c\x76"
+			  "\x65\x73\x2c\x20\x63\x6f\x79\x6f"
+			  "\x74\x65\x73\x2c\x20\x6a\x61\x63"
+			  "\x6b\x61\x6c\x73\x2c\x20\x61\x6e"
+			  "\x64\x20\x66\x6f\x78\x65\x73\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x74"
+			  "\x61\x78\x6f\x6e\x6f\x6d\x69\x63"
+			  "\x20\x66\x61\x6d\x69\x6c\x79\x20"
+			  "\x43\x61\x6e\x69\x64\x61\x65\x2e",
+		.ctext	= "\x9f\x1a\xab\x8a\x95\xf4\x7e\xcd"
+			  "\xee\x34\xc0\x39\xd6\x23\x43\x94"
+			  "\xf6\x01\xc1\x7f\x60\x91\xa5\x23"
+			  "\x4a\x8a\xe6\xb1\x14\x8b\xd7\x58"
+			  "\xee\x02\xad\xab\xce\x1e\x7d\xdf"
+			  "\xf9\x49\x27\x69\xd0\x8d\x0c\x20"
+			  "\x6e\x17\xc4\xae\x87\x7a\xc6\x61"
+			  "\x91\xe2\x8e\x0a\x1d\x61\xcc\x38"
+			  "\x02\x64\x43\x49\xc6\xb2\x59\x59"
+			  "\x42\xe7\x9d\x83\x00\x60\x90\xd2"
+			  "\xb9\xcd\x97\x6e\xc7\x95\x71\xbc"
+			  "\x23\x31\x58\x07\xb3\xb4\xac\x0b"
+			  "\x87\x64\x56\xe5\xe3\xec\x63\xa1"
+			  "\x71\x8c\x08\x48\x33\x20\x29\x81"
+			  "\xea\x01\x25\x20\xc3\xda\xe6\xee"
+			  "\x6a\x03\xf6\x68\x4d\x26\xa0\x91"
+			  "\x9e\x44\xb8\xc1\xc0\x8f\x5a\x6a"
+			  "\xc0\xcd\xbf\x24\x5e\x40\x66\xd2"
+			  "\x42\x24\xb5\xbf\xc1\xeb\x12\x60"
+			  "\x56\xbe\xb1\xa6\xc4\x0f\xfc\x49"
+			  "\x69\x9f\xcc\x06\x5c\xe3\x26\xd7"
+			  "\x52\xc0\x42\xe8\xb4\x76\xc3\xee"
+			  "\xb2\x97\xe3\x37\x61\x29\x5a\xb5"
+			  "\x8e\xe8\x8c\xc5\x38\xcc\xcb\xec"
+			  "\x64\x1a\xa9\x12\x5f\xf7\x79\xdf"
+			  "\x64\xca\x77\x4e\xbd\xf9\x83\xa0"
+			  "\x13\x27\x3f\x31\x03\x63\x30\x26"
+			  "\x27\x0b\x3e\xb3\x23\x13\x61\x0b"
+			  "\x70\x1d\xd4\xad\x85\x1e\xbf\xdf"
+			  "\xc6\x8e\x4d\x08\xcc\x7e\x77\xbd"
+			  "\x1e\x18\x77\x38\x3a\xfe\xc0\x5d"
+			  "\x16\xfc\xf0\xa9\x2f\xe9\x17\xc7"
+			  "\xd3\x23\x17\x18\xa3\xe6\x54\x77"
+			  "\x6f\x1b\xbe\x8a\x6e\x7e\xca\x97"
+			  "\x08\x05\x36\x76\xaf\x12\x7a\x42"
+			  "\xf7\x7a\xc2\x35\xc3\xb4\x93\x40"
+			  "\x54\x14\x90\xa0\x4d\x65\x1c\x37"
+			  "\x50\x70\x44\x29\x6d\x6e\x62\x68",
+		.len	= 304,
+	}
 };
 
 /* Adiantum test vectors from https://github.com/google/adiantum */
-- 
cgit v1.2.3-59-g8ed1b


From 282c14852d00d6d1b8fadf3e01e4180f02ddda84 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 6 Dec 2018 13:00:08 -0800
Subject: crypto: xchacha20 - fix comments for test vectors

The kernel's ChaCha20 uses the RFC7539 convention of the nonce being 12
bytes rather than 8, so actually I only appended 12 random bytes (not
16) to its test vectors to form 24-byte nonces for the XChaCha20 test
vectors.  The other 4 bytes were just from zero-padding the stream
position to 8 bytes.  Fix the comments above the test vectors.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/testmgr.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 357cf4cbcbb1..e8f47d7b92cd 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -32281,8 +32281,9 @@ static const struct cipher_testvec xchacha20_tv_template[] = {
 			  "\x57\x78\x8e\x6f\xae\x90\xfc\x31"
 			  "\x09\x7c\xfc",
 		.len	= 91,
-	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
-		to nonce, and recomputed the ciphertext with libsodium */
+	}, { /* Taken from the ChaCha20 test vectors, appended 12 random bytes
+		to the nonce, zero-padded the stream position from 4 to 8 bytes,
+		and recomputed the ciphertext using libsodium's XChaCha20 */
 		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -32309,8 +32310,7 @@ static const struct cipher_testvec xchacha20_tv_template[] = {
 			  "\x03\xdc\xf8\x2b\xc1\xe1\x75\x67"
 			  "\x23\x7b\xe6\xfc\xd4\x03\x86\x54",
 		.len	= 64,
-	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
-		to nonce, and recomputed the ciphertext with libsodium */
+	}, { /* Derived from a ChaCha20 test vector, via the process above */
 		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00"
 			  "\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -32419,8 +32419,7 @@ static const struct cipher_testvec xchacha20_tv_template[] = {
 		.np	= 3,
 		.tap	= { 375 - 20, 4, 16 },
 
-	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
-		to nonce, and recomputed the ciphertext with libsodium */
+	}, { /* Derived from a ChaCha20 test vector, via the process above */
 		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
 			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
 			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
@@ -32463,8 +32462,7 @@ static const struct cipher_testvec xchacha20_tv_template[] = {
 			  "\x65\x03\xfa\x45\xf7\x9e\x53\x7a"
 			  "\x99\xf1\x82\x25\x4f\x8d\x07",
 		.len	= 127,
-	}, { /* Taken from the ChaCha20 test vectors, appended 16 random bytes
-		to nonce, and recomputed the ciphertext with libsodium */
+	}, { /* Derived from a ChaCha20 test vector, via the process above */
 		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
 			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
 			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
-- 
cgit v1.2.3-59-g8ed1b


From c6018e1a00b5c70610cdfb3650cc5622c917ed17 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 6 Dec 2018 14:21:59 -0800
Subject: crypto: adiantum - adjust some comments to match latest paper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 2018-11-28 revision of the Adiantum paper has revised some notation:

- 'M' was replaced with 'L' (meaning "Left", for the left-hand part of
  the message) in the definition of Adiantum hashing, to avoid confusion
  with the full message
- ε-almost-∆-universal is now abbreviated as ε-∆U instead of εA∆U
- "block" is now used only to mean block cipher and Poly1305 blocks

Also, Adiantum hashing was moved from the appendix to the main paper.

To avoid confusion, update relevant comments in the code to match.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/adiantum.c   | 35 +++++++++++++++++++----------------
 crypto/nhpoly1305.c |  8 ++++----
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/crypto/adiantum.c b/crypto/adiantum.c
index ca27e0dc2958..e62e34f5e389 100644
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -9,7 +9,7 @@
  * Adiantum is a tweakable, length-preserving encryption mode designed for fast
  * and secure disk encryption, especially on CPUs without dedicated crypto
  * instructions.  Adiantum encrypts each sector using the XChaCha12 stream
- * cipher, two passes of an ε-almost-∆-universal (εA∆U) hash function based on
+ * cipher, two passes of an ε-almost-∆-universal (ε-∆U) hash function based on
  * NH and Poly1305, and an invocation of the AES-256 block cipher on a single
  * 16-byte block.  See the paper for details:
  *
@@ -21,12 +21,12 @@
  *	- Stream cipher: XChaCha12 or XChaCha20
  *	- Block cipher: any with a 128-bit block size and 256-bit key
  *
- * This implementation doesn't currently allow other εA∆U hash functions, i.e.
+ * This implementation doesn't currently allow other ε-∆U hash functions, i.e.
  * HPolyC is not supported.  This is because Adiantum is ~20% faster than HPolyC
- * but still provably as secure, and also the εA∆U hash function of HBSH is
+ * but still provably as secure, and also the ε-∆U hash function of HBSH is
  * formally defined to take two inputs (tweak, message) which makes it difficult
  * to wrap with the crypto_shash API.  Rather, some details need to be handled
- * here.  Nevertheless, if needed in the future, support for other εA∆U hash
+ * here.  Nevertheless, if needed in the future, support for other ε-∆U hash
  * functions could be added here.
  */
 
@@ -41,7 +41,7 @@
 #include "internal.h"
 
 /*
- * Size of right-hand block of input data, in bytes; also the size of the block
+ * Size of right-hand part of input data, in bytes; also the size of the block
  * cipher's block size and the hash function's output.
  */
 #define BLOCKCIPHER_BLOCK_SIZE		16
@@ -77,7 +77,7 @@ struct adiantum_tfm_ctx {
 struct adiantum_request_ctx {
 
 	/*
-	 * Buffer for right-hand block of data, i.e.
+	 * Buffer for right-hand part of data, i.e.
 	 *
 	 *    P_L => P_M => C_M => C_R when encrypting, or
 	 *    C_R => C_M => P_M => P_L when decrypting.
@@ -93,8 +93,8 @@ struct adiantum_request_ctx {
 	bool enc; /* true if encrypting, false if decrypting */
 
 	/*
-	 * The result of the Poly1305 εA∆U hash function applied to
-	 * (message length, tweak).
+	 * The result of the Poly1305 ε-∆U hash function applied to
+	 * (bulk length, tweak)
 	 */
 	le128 header_hash;
 
@@ -213,13 +213,16 @@ static inline void le128_sub(le128 *r, const le128 *v1, const le128 *v2)
 }
 
 /*
- * Apply the Poly1305 εA∆U hash function to (message length, tweak) and save the
- * result to rctx->header_hash.
+ * Apply the Poly1305 ε-∆U hash function to (bulk length, tweak) and save the
+ * result to rctx->header_hash.  This is the calculation
  *
- * This value is reused in both the first and second hash steps.  Specifically,
- * it's added to the result of an independently keyed εA∆U hash function (for
- * equal length inputs only) taken over the message.  This gives the overall
- * Adiantum hash of the (tweak, message) pair.
+ *	H_T ← Poly1305_{K_T}(bin_{128}(|L|) || T)
+ *
+ * from the procedure in section 6.4 of the Adiantum paper.  The resulting value
+ * is reused in both the first and second hash steps.  Specifically, it's added
+ * to the result of an independently keyed ε-∆U hash function (for equal length
+ * inputs only) taken over the left-hand part (the "bulk") of the message, to
+ * give the overall Adiantum hash of the (tweak, left-hand part) pair.
  */
 static void adiantum_hash_header(struct skcipher_request *req)
 {
@@ -248,7 +251,7 @@ static void adiantum_hash_header(struct skcipher_request *req)
 	poly1305_core_emit(&state, &rctx->header_hash);
 }
 
-/* Hash the left-hand block (the "bulk") of the message using NHPoly1305 */
+/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
 static int adiantum_hash_message(struct skcipher_request *req,
 				 struct scatterlist *sgl, le128 *digest)
 {
@@ -550,7 +553,7 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
 		goto out_drop_streamcipher;
 	blockcipher_alg = ictx->blockcipher_spawn.alg;
 
-	/* NHPoly1305 εA∆U hash function */
+	/* NHPoly1305 ε-∆U hash function */
 	_hash_alg = crypto_alg_mod_lookup(nhpoly1305_name,
 					  CRYPTO_ALG_TYPE_SHASH,
 					  CRYPTO_ALG_TYPE_MASK);
diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c
index c8385853f699..ec831a5594d8 100644
--- a/crypto/nhpoly1305.c
+++ b/crypto/nhpoly1305.c
@@ -9,15 +9,15 @@
  * "NHPoly1305" is the main component of Adiantum hashing.
  * Specifically, it is the calculation
  *
- *	H_M ← Poly1305_{K_M}(NH_{K_N}(pad_{128}(M)))
+ *	H_L ← Poly1305_{K_L}(NH_{K_N}(pad_{128}(L)))
  *
- * from the procedure in section A.5 of the Adiantum paper [1].  It is an
- * ε-almost-∆-universal (εA∆U) hash function for equal-length inputs over
+ * from the procedure in section 6.4 of the Adiantum paper [1].  It is an
+ * ε-almost-∆-universal (ε-∆U) hash function for equal-length inputs over
  * Z/(2^{128}Z), where the "∆" operation is addition.  It hashes 1024-byte
  * chunks of the input with the NH hash function [2], reducing the input length
  * by 32x.  The resulting NH digests are evaluated as a polynomial in
  * GF(2^{130}-5), like in the Poly1305 MAC [3].  Note that the polynomial
- * evaluation by itself would suffice to achieve the εA∆U property; NH is used
+ * evaluation by itself would suffice to achieve the ε-∆U property; NH is used
  * for performance since it's over twice as fast as Poly1305.
  *
  * This is *not* a cryptographic hash function; do not use it as such!
-- 
cgit v1.2.3-59-g8ed1b


From 0ac6b8fb23c724b015d9ca70a89126e8d1563166 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 6 Dec 2018 15:55:41 -0800
Subject: crypto: user - support incremental algorithm dumps

CRYPTO_MSG_GETALG in NLM_F_DUMP mode sometimes doesn't return all
registered crypto algorithms, because it doesn't support incremental
dumps.  crypto_dump_report() only permits itself to be called once, yet
the netlink subsystem allocates at most ~64 KiB for the skb being dumped
to.  Thus only the first recvmsg() returns data, and it may only include
a subset of the crypto algorithms even if the user buffer passed to
recvmsg() is large enough to hold all of them.

Fix this by using one of the arguments in the netlink_callback structure
to keep track of the current position in the algorithm list.  Then
userspace can do multiple recvmsg() on the socket after sending the dump
request.  This is the way netlink dumps work elsewhere in the kernel;
it's unclear why this was different (probably just an oversight).

Also fix an integer overflow when calculating the dump buffer size hint.

Fixes: a38f7907b926 ("crypto: Add userspace configuration API")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_base.c | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c
index 7021efbb35a1..5311fd7fae34 100644
--- a/crypto/crypto_user_base.c
+++ b/crypto/crypto_user_base.c
@@ -231,30 +231,33 @@ drop_alg:
 
 static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct crypto_alg *alg;
+	const size_t start_pos = cb->args[0];
+	size_t pos = 0;
 	struct crypto_dump_info info;
-	int err;
-
-	if (cb->args[0])
-		goto out;
-
-	cb->args[0] = 1;
+	struct crypto_alg *alg;
+	int res;
 
 	info.in_skb = cb->skb;
 	info.out_skb = skb;
 	info.nlmsg_seq = cb->nlh->nlmsg_seq;
 	info.nlmsg_flags = NLM_F_MULTI;
 
+	down_read(&crypto_alg_sem);
 	list_for_each_entry(alg, &crypto_alg_list, cra_list) {
-		err = crypto_report_alg(alg, &info);
-		if (err)
-			goto out_err;
+		if (pos >= start_pos) {
+			res = crypto_report_alg(alg, &info);
+			if (res == -EMSGSIZE)
+				break;
+			if (res)
+				goto out;
+		}
+		pos++;
 	}
-
+	cb->args[0] = pos;
+	res = skb->len;
 out:
-	return skb->len;
-out_err:
-	return err;
+	up_read(&crypto_alg_sem);
+	return res;
 }
 
 static int crypto_dump_report_done(struct netlink_callback *cb)
@@ -442,7 +445,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) &&
 	    (nlh->nlmsg_flags & NLM_F_DUMP))) {
 		struct crypto_alg *alg;
-		u16 dump_alloc = 0;
+		unsigned long dump_alloc = 0;
 
 		if (link->dump == NULL)
 			return -EINVAL;
@@ -450,16 +453,16 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		down_read(&crypto_alg_sem);
 		list_for_each_entry(alg, &crypto_alg_list, cra_list)
 			dump_alloc += CRYPTO_REPORT_MAXSIZE;
+		up_read(&crypto_alg_sem);
 
 		{
 			struct netlink_dump_control c = {
 				.dump = link->dump,
 				.done = link->done,
-				.min_dump_alloc = dump_alloc,
+				.min_dump_alloc = min(dump_alloc, 65535UL),
 			};
 			err = netlink_dump_start(crypto_nlsk, skb, nlh, &c);
 		}
-		up_read(&crypto_alg_sem);
 
 		return err;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 00c9fe37a7f27a306bcaa5737f0787fe139f8aba Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 10 Dec 2018 11:45:58 -0800
Subject: crypto: adiantum - fix leaking reference to hash algorithm

crypto_alg_mod_lookup() takes a reference to the hash algorithm but
crypto_init_shash_spawn() doesn't take ownership of it, hence the
reference needs to be dropped in adiantum_create().

Fixes: 059c2a4d8e16 ("crypto: adiantum - add Adiantum support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/adiantum.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/crypto/adiantum.c b/crypto/adiantum.c
index e62e34f5e389..6651e713c45d 100644
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -564,10 +564,8 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
 	hash_alg = __crypto_shash_alg(_hash_alg);
 	err = crypto_init_shash_spawn(&ictx->hash_spawn, hash_alg,
 				      skcipher_crypto_instance(inst));
-	if (err) {
-		crypto_mod_put(_hash_alg);
-		goto out_drop_blockcipher;
-	}
+	if (err)
+		goto out_put_hash;
 
 	/* Check the set of algorithms */
 	if (!adiantum_supported_algorithms(streamcipher_alg, blockcipher_alg,
@@ -624,10 +622,13 @@ static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
 	if (err)
 		goto out_drop_hash;
 
+	crypto_mod_put(_hash_alg);
 	return 0;
 
 out_drop_hash:
 	crypto_drop_shash(&ictx->hash_spawn);
+out_put_hash:
+	crypto_mod_put(_hash_alg);
 out_drop_blockcipher:
 	crypto_drop_spawn(&ictx->blockcipher_spawn);
 out_drop_streamcipher:
-- 
cgit v1.2.3-59-g8ed1b


From f9b1d64678607e132051d232c7a2127f32947d64 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:56:45 +0000
Subject: crypto: aesni - Merge GCM_ENC_DEC

The GCM_ENC_DEC routines for AVX and AVX2 are identical, except they
call separate sub-macros.  Pass the macros as arguments, and merge them.
This facilitates additional refactoring, by requiring changes in only
one place.

The GCM_ENC_DEC macro was moved above the CONFIG_AS_AVX* ifdefs,
since it will be used by both AVX and AVX2.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 2481 +++++++++++++-----------------
 1 file changed, 1083 insertions(+), 1398 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 1985ea0b551b..318135a77975 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -280,1252 +280,1250 @@ VARIABLE_OFFSET = 16*8
                 vaesenclast 16*10(arg1), \XMM0, \XMM0
 .endm
 
-#ifdef CONFIG_AS_AVX
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC
 
-        vpshufd         $0b01001110, \GH, \T2
-        vpshufd         $0b01001110, \HK, \T3
-        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
-        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
+        #the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
 
-        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
-        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
-        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
-        vpxor           \GH, \T2,\T2
-        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
+        mov     %rsp, %r14
 
-        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
-        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
-        vpxor           \T3, \GH, \GH
-        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 
-        #first phase of the reduction
-        vpslld  $31, \GH, \T2                   # packed right shifting << 31
-        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
-        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
 
-        vpxor   \T3, \T2, \T2                   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
 
-        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                  # align rsp to 64 bytes
 
-        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
-        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 
-        #second phase of the reduction
+        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
 
-        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
-        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
-        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2                   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
+        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
 
-        vpxor   \T5, \T2, \T2
-        vpxor   \T2, \GH, \GH
-        vpxor   \T1, \GH, \GH                   # the result is in GH
+        mov     %r13, %r12
+        shr     $4, %r12
+        and     $7, %r12
+        jz      _initial_num_blocks_is_0\@
 
+        cmp     $7, %r12
+        je      _initial_num_blocks_is_7\@
+        cmp     $6, %r12
+        je      _initial_num_blocks_is_6\@
+        cmp     $5, %r12
+        je      _initial_num_blocks_is_5\@
+        cmp     $4, %r12
+        je      _initial_num_blocks_is_4\@
+        cmp     $3, %r12
+        je      _initial_num_blocks_is_3\@
+        cmp     $2, %r12
+        je      _initial_num_blocks_is_2\@
 
-.endm
+        jmp     _initial_num_blocks_is_1\@
 
-.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+_initial_num_blocks_is_7\@:
+        \INITIAL_BLOCKS  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*7, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-        vmovdqa  \HK, \T5
+_initial_num_blocks_is_6\@:
+        \INITIAL_BLOCKS  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*6, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_k(arg1)
+_initial_num_blocks_is_5\@:
+        \INITIAL_BLOCKS  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*5, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
-        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_2_k(arg1)
+_initial_num_blocks_is_4\@:
+        \INITIAL_BLOCKS  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*4, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
-        vmovdqa  \T5, HashKey_3(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_3_k(arg1)
+_initial_num_blocks_is_3\@:
+        \INITIAL_BLOCKS  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*3, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
-        vmovdqa  \T5, HashKey_4(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_4_k(arg1)
+_initial_num_blocks_is_2\@:
+        \INITIAL_BLOCKS  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*2, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
-        vmovdqa  \T5, HashKey_5(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_5_k(arg1)
+_initial_num_blocks_is_1\@:
+        \INITIAL_BLOCKS  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*1, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
-        vmovdqa  \T5, HashKey_6(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_6_k(arg1)
+_initial_num_blocks_is_0\@:
+        \INITIAL_BLOCKS  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
-        vmovdqa  \T5, HashKey_7(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_7_k(arg1)
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
-        vmovdqa  \T5, HashKey_8(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_8_k(arg1)
+_initial_blocks_encrypted\@:
+        cmp     $0, %r13
+        je      _zero_cipher_left\@
 
-.endm
+        sub     $128, %r13
+        je      _eight_cipher_left\@
 
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
 
-.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
-	i = (8-\num_initial_blocks)
-	j = 0
-	setreg
 
-	mov     arg6, %r10                      # r10 = AAD
-	mov     arg7, %r12                      # r12 = aadLen
 
+        vmovd   %xmm9, %r15d
+        and     $255, %r15d
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
-	mov     %r12, %r11
 
-	vpxor   reg_j, reg_j, reg_j
-	vpxor   reg_i, reg_i, reg_i
-	cmp     $16, %r11
-	jl      _get_AAD_rest8\@
-_get_AAD_blocks\@:
-	vmovdqu (%r10), reg_i
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_i, reg_j, reg_j
-	GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
-	add     $16, %r10
-	sub     $16, %r12
-	sub     $16, %r11
-	cmp     $16, %r11
-	jge     _get_AAD_blocks\@
-	vmovdqu reg_j, reg_i
-	cmp     $0, %r11
-	je      _get_AAD_done\@
+_encrypt_by_8_new\@:
+        cmp     $(255-8), %r15d
+        jg      _encrypt_by_8\@
 
-	vpxor   reg_i, reg_i, reg_i
 
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
-	cmp     $4, %r11
-	jle     _get_AAD_rest4\@
-	movq    (%r10), \T1
-	add     $8, %r10
-	sub     $8, %r11
-	vpslldq $8, \T1, \T1
-	vpsrldq $8, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-	jmp     _get_AAD_rest8\@
-_get_AAD_rest4\@:
-	cmp     $0, %r11
-	jle      _get_AAD_rest0\@
-	mov     (%r10), %eax
-	movq    %rax, \T1
-	add     $4, %r10
-	sub     $4, %r11
-	vpslldq $12, \T1, \T1
-	vpsrldq $4, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-_get_AAD_rest0\@:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq    %r12, %r11
-	salq    $4, %r11
-	movdqu  aad_shift_arr(%r11), \T1
-	vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_j, reg_i, reg_i
-	GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
 
-_get_AAD_done\@:
-	# initialize the data pointer offset as zero
-	xor     %r11d, %r11d
+        add     $8, %r15b
+        \GHASH_8_ENCRYPT_8_PARALLEL      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
 
-	# start AES for num_initial_blocks blocks
-	mov     arg5, %rax                     # rax = *Y0
-	vmovdqu (%rax), \CTR                   # CTR = Y0
-	vpshufb SHUF_MASK(%rip), \CTR, \CTR
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        jmp     _eight_cipher_left\@
 
+_encrypt_by_8\@:
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $8, %r15b
+        \GHASH_8_ENCRYPT_8_PARALLEL      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
 
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
-                vmovdqa \CTR, reg_i
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
-	i = (i+1)
-	setreg
-.endr
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
-	vmovdqa  (arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpxor   \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
 
-	j = 1
-	setreg
-.rep 9
-	vmovdqa  16*j(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenc \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
 
-	j = (j+1)
-	setreg
-.endr
-
-
-	vmovdqa  16*10(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenclast      \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
 
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vmovdqu (arg3, %r11), \T1
-                vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
-                add     $16, %r11
-.if  \ENC_DEC == DEC
-                vmovdqa \T1, reg_i
-.endif
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
-	i = (i+1)
-	setreg
-.endr
+_eight_cipher_left\@:
+        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
 
 
-	i = (8-\num_initial_blocks)
-	j = (9-\num_initial_blocks)
-	setreg
+_zero_cipher_left\@:
+        cmp     $16, arg4
+        jl      _only_less_than_16\@
 
-.rep \num_initial_blocks
-        vpxor    reg_i, reg_j, reg_j
-        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-        # XMM8 has the combined result here
+        mov     arg4, %r13
+        and     $15, %r13                            # r13 = (arg4 mod 16)
 
-        vmovdqa  \XMM8, TMP1(%rsp)
-        vmovdqa  \XMM8, \T3
+        je      _multiple_of_16_bytes\@
 
-        cmp     $128, %r13
-        jl      _initial_blocks_done\@                  # no need for precomputed constants
+        # handle the last <16 Byte block seperately
 
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM1
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM2
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
+        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM3
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
+        sub     $16, %r11
+        add     %r13, %r11
+        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM4
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
+						     # able to shift 16-r13 bytes (r13 is the
+						     # number of bytes in plaintext mod 16)
+        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
+        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
+        jmp     _final_ghash_mul\@
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM5
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
+_only_less_than_16\@:
+        # check for 0 length
+        mov     arg4, %r13
+        and     $15, %r13                            # r13 = (arg4 mod 16)
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM6
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
+        je      _multiple_of_16_bytes\@
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM7
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
+        # handle the last <16 Byte block separately
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM8
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 
-                vmovdqa  (arg1), \T_key
-                vpxor    \T_key, \XMM1, \XMM1
-                vpxor    \T_key, \XMM2, \XMM2
-                vpxor    \T_key, \XMM3, \XMM3
-                vpxor    \T_key, \XMM4, \XMM4
-                vpxor    \T_key, \XMM5, \XMM5
-                vpxor    \T_key, \XMM6, \XMM6
-                vpxor    \T_key, \XMM7, \XMM7
-                vpxor    \T_key, \XMM8, \XMM8
+        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
 
-		i = 1
-		setreg
-.rep    9       # do 9 rounds
-                vmovdqa  16*i(arg1), \T_key
-                vaesenc  \T_key, \XMM1, \XMM1
-                vaesenc  \T_key, \XMM2, \XMM2
-                vaesenc  \T_key, \XMM3, \XMM3
-                vaesenc  \T_key, \XMM4, \XMM4
-                vaesenc  \T_key, \XMM5, \XMM5
-                vaesenc  \T_key, \XMM6, \XMM6
-                vaesenc  \T_key, \XMM7, \XMM7
-                vaesenc  \T_key, \XMM8, \XMM8
-		i = (i+1)
-		setreg
-.endr
 
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
+						     # able to shift 16-r13 bytes (r13 is the
+						     # number of bytes in plaintext mod 16)
 
-                vmovdqa  16*i(arg1), \T_key
-                vaesenclast  \T_key, \XMM1, \XMM1
-                vaesenclast  \T_key, \XMM2, \XMM2
-                vaesenclast  \T_key, \XMM3, \XMM3
-                vaesenclast  \T_key, \XMM4, \XMM4
-                vaesenclast  \T_key, \XMM5, \XMM5
-                vaesenclast  \T_key, \XMM6, \XMM6
-                vaesenclast  \T_key, \XMM7, \XMM7
-                vaesenclast  \T_key, \XMM8, \XMM8
+_get_last_16_byte_loop\@:
+        movb    (arg3, %r11),  %al
+        movb    %al,  TMP1 (%rsp , %r11)
+        add     $1, %r11
+        cmp     %r13,  %r11
+        jne     _get_last_16_byte_loop\@
 
-                vmovdqu  (arg3, %r11), \T1
-                vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM1
-                .endif
+        vmovdqu  TMP1(%rsp), %xmm1
 
-                vmovdqu  16*1(arg3, %r11), \T1
-                vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM2
-                .endif
+        sub     $16, %r11
 
-                vmovdqu  16*2(arg3, %r11), \T1
-                vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM3
-                .endif
+_final_ghash_mul\@:
+        .if  \ENC_DEC ==  DEC
+        vmovdqa %xmm1, %xmm2
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
+						     # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm2, %xmm2
+        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+        vpxor   %xmm2, %xmm14, %xmm14
+	#GHASH computation for the last <16 Byte block
+        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        sub     %r13, %r11
+        add     $16, %r11
+        .else
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
+						     # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpxor   %xmm9, %xmm14, %xmm14
+	#GHASH computation for the last <16 Byte block
+        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        sub     %r13, %r11
+        add     $16, %r11
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
+        .endif
 
-                vmovdqu  16*3(arg3, %r11), \T1
-                vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM4
-                .endif
 
-                vmovdqu  16*4(arg3, %r11), \T1
-                vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM5
-                .endif
+        #############################
+        # output r13 Bytes
+        vmovq   %xmm9, %rax
+        cmp     $8, %r13
+        jle     _less_than_8_bytes_left\@
 
-                vmovdqu  16*5(arg3, %r11), \T1
-                vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM6
-                .endif
+        mov     %rax, (arg2 , %r11)
+        add     $8, %r11
+        vpsrldq $8, %xmm9, %xmm9
+        vmovq   %xmm9, %rax
+        sub     $8, %r13
 
-                vmovdqu  16*6(arg3, %r11), \T1
-                vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM7
-                .endif
+_less_than_8_bytes_left\@:
+        movb    %al, (arg2 , %r11)
+        add     $1, %r11
+        shr     $8, %rax
+        sub     $1, %r13
+        jne     _less_than_8_bytes_left\@
+        #############################
 
-                vmovdqu  16*7(arg3, %r11), \T1
-                vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM8
-                .endif
+_multiple_of_16_bytes\@:
+        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
+        shl     $3, %r12                             # convert into number of bits
+        vmovd   %r12d, %xmm15                        # len(A) in xmm15
 
-                add     $128, %r11
+        shl     $3, arg4                             # len(C) in bits  (*128)
+        vmovq   arg4, %xmm1
+        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
+        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
 
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
-                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
+        vpxor   %xmm15, %xmm14, %xmm14
+        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
+        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
 
-###############################################################################
+        mov     arg5, %rax                           # rax = *Y0
+        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
 
-_initial_blocks_done\@:
+        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
 
-.endm
+        vpxor   %xmm14, %xmm9, %xmm9
 
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
-        vmovdqa \XMM1, \T2
-        vmovdqa \XMM2, TMP2(%rsp)
-        vmovdqa \XMM3, TMP3(%rsp)
-        vmovdqa \XMM4, TMP4(%rsp)
-        vmovdqa \XMM5, TMP5(%rsp)
-        vmovdqa \XMM6, TMP6(%rsp)
-        vmovdqa \XMM7, TMP7(%rsp)
-        vmovdqa \XMM8, TMP8(%rsp)
 
-.if \loop_idx == in_order
-                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
-                vpaddd  ONE(%rip), \XMM1, \XMM2
-                vpaddd  ONE(%rip), \XMM2, \XMM3
-                vpaddd  ONE(%rip), \XMM3, \XMM4
-                vpaddd  ONE(%rip), \XMM4, \XMM5
-                vpaddd  ONE(%rip), \XMM5, \XMM6
-                vpaddd  ONE(%rip), \XMM6, \XMM7
-                vpaddd  ONE(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
+_return_T\@:
+        mov     arg8, %r10              # r10 = authTag
+        mov     arg9, %r11              # r11 = auth_tag_len
 
-                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
-.else
-                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
-                vpaddd  ONEf(%rip), \XMM1, \XMM2
-                vpaddd  ONEf(%rip), \XMM2, \XMM3
-                vpaddd  ONEf(%rip), \XMM3, \XMM4
-                vpaddd  ONEf(%rip), \XMM4, \XMM5
-                vpaddd  ONEf(%rip), \XMM5, \XMM6
-                vpaddd  ONEf(%rip), \XMM6, \XMM7
-                vpaddd  ONEf(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
-.endif
+        cmp     $16, %r11
+        je      _T_16\@
 
+        cmp     $8, %r11
+        jl      _T_4\@
 
-        #######################################################################
+_T_8\@:
+        vmovq   %xmm9, %rax
+        mov     %rax, (%r10)
+        add     $8, %r10
+        sub     $8, %r11
+        vpsrldq $8, %xmm9, %xmm9
+        cmp     $0, %r11
+        je     _return_T_done\@
+_T_4\@:
+        vmovd   %xmm9, %eax
+        mov     %eax, (%r10)
+        add     $4, %r10
+        sub     $4, %r11
+        vpsrldq     $4, %xmm9, %xmm9
+        cmp     $0, %r11
+        je     _return_T_done\@
+_T_123\@:
+        vmovd     %xmm9, %eax
+        cmp     $2, %r11
+        jl     _T_1\@
+        mov     %ax, (%r10)
+        cmp     $2, %r11
+        je     _return_T_done\@
+        add     $2, %r10
+        sar     $16, %eax
+_T_1\@:
+        mov     %al, (%r10)
+        jmp     _return_T_done\@
 
-                vmovdqu (arg1), \T1
-                vpxor   \T1, \XMM1, \XMM1
-                vpxor   \T1, \XMM2, \XMM2
-                vpxor   \T1, \XMM3, \XMM3
-                vpxor   \T1, \XMM4, \XMM4
-                vpxor   \T1, \XMM5, \XMM5
-                vpxor   \T1, \XMM6, \XMM6
-                vpxor   \T1, \XMM7, \XMM7
-                vpxor   \T1, \XMM8, \XMM8
+_T_16\@:
+        vmovdqu %xmm9, (%r10)
 
-        #######################################################################
+_return_T_done\@:
+        mov     %r14, %rsp
 
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+.endm
 
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 
+        vpshufd         $0b01001110, \GH, \T2
+        vpshufd         $0b01001110, \HK, \T3
+        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
+        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
 
+        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
+        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
+        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
+        vpxor           \GH, \T2,\T2
+        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
 
-                vmovdqu 16*1(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
+        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
+        vpxor           \T3, \GH, \GH
+        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 
-                vmovdqu 16*2(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+        #first phase of the reduction
+        vpslld  $31, \GH, \T2                   # packed right shifting << 31
+        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
+        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
 
+        vpxor   \T3, \T2, \T2                   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-        #######################################################################
+        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
 
-        vmovdqa         HashKey_8(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
-        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
+        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
+        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 
-        vpshufd         $0b01001110, \T2, \T6
-        vpxor           \T2, \T6, \T6
+        #second phase of the reduction
 
-        vmovdqa         HashKey_8_k(arg1), \T5
-        vpclmulqdq      $0x00, \T5, \T6, \T6
+        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
+        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
+        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
+        vpxor   \T3, \T2, \T2                   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-                vmovdqu 16*3(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+        vpxor   \T5, \T2, \T2
+        vpxor   \T2, \GH, \GH
+        vpxor   \T1, \GH, \GH                   # the result is in GH
 
-        vmovdqa         TMP2(%rsp), \T1
-        vmovdqa         HashKey_7(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_7_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+.endm
 
-                vmovdqu 16*4(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 
-        #######################################################################
+        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+        vmovdqa  \HK, \T5
 
-        vmovdqa         TMP3(%rsp), \T1
-        vmovdqa         HashKey_6(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_k(arg1)
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_6_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
+        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_2_k(arg1)
 
-                vmovdqu 16*5(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
+        vmovdqa  \T5, HashKey_3(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_3_k(arg1)
 
-        vmovdqa         TMP4(%rsp), \T1
-        vmovdqa         HashKey_5(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
+        vmovdqa  \T5, HashKey_4(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_4_k(arg1)
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_5_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
+        vmovdqa  \T5, HashKey_5(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_5_k(arg1)
 
-                vmovdqu 16*6(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
+        vmovdqa  \T5, HashKey_6(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_6_k(arg1)
 
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
+        vmovdqa  \T5, HashKey_7(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_7_k(arg1)
 
-        vmovdqa         TMP5(%rsp), \T1
-        vmovdqa         HashKey_4(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
+        vmovdqa  \T5, HashKey_8(arg1)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqa  \T1, HashKey_8_k(arg1)
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_4_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+.endm
 
-                vmovdqu 16*7(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as a pointer only, not modified
 
-        vmovdqa         TMP6(%rsp), \T1
-        vmovdqa         HashKey_3(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+	i = (8-\num_initial_blocks)
+	j = 0
+	setreg
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_3_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+	mov     arg6, %r10                      # r10 = AAD
+	mov     arg7, %r12                      # r12 = aadLen
 
 
-                vmovdqu 16*8(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+	mov     %r12, %r11
 
-        vmovdqa         TMP7(%rsp), \T1
-        vmovdqa         HashKey_2(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+	vpxor   reg_j, reg_j, reg_j
+	vpxor   reg_i, reg_i, reg_i
+	cmp     $16, %r11
+	jl      _get_AAD_rest8\@
+_get_AAD_blocks\@:
+	vmovdqu (%r10), reg_i
+	vpshufb SHUF_MASK(%rip), reg_i, reg_i
+	vpxor   reg_i, reg_j, reg_j
+	GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
+	add     $16, %r10
+	sub     $16, %r12
+	sub     $16, %r11
+	cmp     $16, %r11
+	jge     _get_AAD_blocks\@
+	vmovdqu reg_j, reg_i
+	cmp     $0, %r11
+	je      _get_AAD_done\@
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_2_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+	vpxor   reg_i, reg_i, reg_i
 
-        #######################################################################
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some CT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+	cmp     $4, %r11
+	jle     _get_AAD_rest4\@
+	movq    (%r10), \T1
+	add     $8, %r10
+	sub     $8, %r11
+	vpslldq $8, \T1, \T1
+	vpsrldq $8, reg_i, reg_i
+	vpxor   \T1, reg_i, reg_i
+	jmp     _get_AAD_rest8\@
+_get_AAD_rest4\@:
+	cmp     $0, %r11
+	jle      _get_AAD_rest0\@
+	mov     (%r10), %eax
+	movq    %rax, \T1
+	add     $4, %r10
+	sub     $4, %r11
+	vpslldq $12, \T1, \T1
+	vpsrldq $4, reg_i, reg_i
+	vpxor   \T1, reg_i, reg_i
+_get_AAD_rest0\@:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq    %r12, %r11
+	salq    $4, %r11
+	movdqu  aad_shift_arr(%r11), \T1
+	vpshufb \T1, reg_i, reg_i
+_get_AAD_rest_final\@:
+	vpshufb SHUF_MASK(%rip), reg_i, reg_i
+	vpxor   reg_j, reg_i, reg_i
+	GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
 
-                vmovdqu 16*9(arg1), \T5
-                vaesenc \T5, \XMM1, \XMM1
-                vaesenc \T5, \XMM2, \XMM2
-                vaesenc \T5, \XMM3, \XMM3
-                vaesenc \T5, \XMM4, \XMM4
-                vaesenc \T5, \XMM5, \XMM5
-                vaesenc \T5, \XMM6, \XMM6
-                vaesenc \T5, \XMM7, \XMM7
-                vaesenc \T5, \XMM8, \XMM8
+_get_AAD_done\@:
+	# initialize the data pointer offset as zero
+	xor     %r11d, %r11d
 
-        vmovdqa         TMP8(%rsp), \T1
-        vmovdqa         HashKey(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+	# start AES for num_initial_blocks blocks
+	mov     arg5, %rax                     # rax = *Y0
+	vmovdqu (%rax), \CTR                   # CTR = Y0
+	vpshufb SHUF_MASK(%rip), \CTR, \CTR
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
 
-        vpxor           \T4, \T6, \T6
-        vpxor           \T7, \T6, \T6
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
+                vmovdqa \CTR, reg_i
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
+	i = (i+1)
+	setreg
+.endr
 
-                vmovdqu 16*10(arg1), \T5
+	vmovdqa  (arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpxor   \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
 
-	i = 0
 	j = 1
 	setreg
-.rep 8
-		vpxor	16*i(arg3, %r11), \T5, \T2
-                .if \ENC_DEC == ENC
-                vaesenclast     \T2, reg_j, reg_j
-                .else
-                vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg3, %r11), reg_j
-                vmovdqu \T3, 16*i(arg2, %r11)
-                .endif
+.rep 9
+	vmovdqa  16*j(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenc \T_key, reg_i, reg_i
 	i = (i+1)
+	setreg
+.endr
+
 	j = (j+1)
 	setreg
 .endr
-	#######################################################################
 
 
-	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
-	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
-	vpxor	\T3, \T7, \T7
-	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
+	vmovdqa  16*10(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenclast      \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
 
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vmovdqu (arg3, %r11), \T1
+                vpxor   \T1, reg_i, reg_i
+                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
+                add     $16, %r11
+.if  \ENC_DEC == DEC
+                vmovdqa \T1, reg_i
+.endif
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
+	i = (i+1)
+	setreg
+.endr
 
 
-	#######################################################################
-	#first phase of the reduction
-	#######################################################################
-        vpslld  $31, \T7, \T2                           # packed right shifting << 31
-        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
-        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
+	i = (8-\num_initial_blocks)
+	j = (9-\num_initial_blocks)
+	setreg
 
-        vpxor   \T3, \T2, \T2                           # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+.rep \num_initial_blocks
+        vpxor    reg_i, reg_j, reg_j
+        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+        # XMM8 has the combined result here
 
-        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
+        vmovdqa  \XMM8, TMP1(%rsp)
+        vmovdqa  \XMM8, \T3
 
-        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
-        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
-	#######################################################################
-                .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
-                .endif
+        cmp     $128, %r13
+        jl      _initial_blocks_done\@                  # no need for precomputed constants
 
-	#######################################################################
-	#second phase of the reduction
-        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
-        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
-        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2                           # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM1
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
 
-        vpxor   \T1, \T2, \T2
-        vpxor   \T2, \T7, \T7
-        vpxor   \T7, \T6, \T6                           # the result is in T6
-	#######################################################################
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM2
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
 
-		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM3
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
 
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM4
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
 
-	vpxor	\T6, \XMM1, \XMM1
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM5
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
 
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM6
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
 
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM7
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
 
-.endm
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM8
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 
+                vmovdqa  (arg1), \T_key
+                vpxor    \T_key, \XMM1, \XMM1
+                vpxor    \T_key, \XMM2, \XMM2
+                vpxor    \T_key, \XMM3, \XMM3
+                vpxor    \T_key, \XMM4, \XMM4
+                vpxor    \T_key, \XMM5, \XMM5
+                vpxor    \T_key, \XMM6, \XMM6
+                vpxor    \T_key, \XMM7, \XMM7
+                vpxor    \T_key, \XMM8, \XMM8
 
-# GHASH the last 4 ciphertext blocks.
-.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+		i = 1
+		setreg
+.rep    9       # do 9 rounds
+                vmovdqa  16*i(arg1), \T_key
+                vaesenc  \T_key, \XMM1, \XMM1
+                vaesenc  \T_key, \XMM2, \XMM2
+                vaesenc  \T_key, \XMM3, \XMM3
+                vaesenc  \T_key, \XMM4, \XMM4
+                vaesenc  \T_key, \XMM5, \XMM5
+                vaesenc  \T_key, \XMM6, \XMM6
+                vaesenc  \T_key, \XMM7, \XMM7
+                vaesenc  \T_key, \XMM8, \XMM8
+		i = (i+1)
+		setreg
+.endr
 
-        ## Karatsuba Method
 
+                vmovdqa  16*i(arg1), \T_key
+                vaesenclast  \T_key, \XMM1, \XMM1
+                vaesenclast  \T_key, \XMM2, \XMM2
+                vaesenclast  \T_key, \XMM3, \XMM3
+                vaesenclast  \T_key, \XMM4, \XMM4
+                vaesenclast  \T_key, \XMM5, \XMM5
+                vaesenclast  \T_key, \XMM6, \XMM6
+                vaesenclast  \T_key, \XMM7, \XMM7
+                vaesenclast  \T_key, \XMM8, \XMM8
 
-        vpshufd         $0b01001110, \XMM1, \T2
-        vpxor           \XMM1, \T2, \T2
-        vmovdqa         HashKey_8(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM1, \T6
-        vpclmulqdq      $0x00, \T5, \XMM1, \T7
+                vmovdqu  (arg3, %r11), \T1
+                vpxor    \T1, \XMM1, \XMM1
+                vmovdqu  \XMM1, (arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM1
+                .endif
 
-        vmovdqa         HashKey_8_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \XMM1
+                vmovdqu  16*1(arg3, %r11), \T1
+                vpxor    \T1, \XMM2, \XMM2
+                vmovdqu  \XMM2, 16*1(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM2
+                .endif
 
-        ######################
+                vmovdqu  16*2(arg3, %r11), \T1
+                vpxor    \T1, \XMM3, \XMM3
+                vmovdqu  \XMM3, 16*2(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM3
+                .endif
 
-        vpshufd         $0b01001110, \XMM2, \T2
-        vpxor           \XMM2, \T2, \T2
-        vmovdqa         HashKey_7(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM2, \T4
-        vpxor           \T4, \T6, \T6
+                vmovdqu  16*3(arg3, %r11), \T1
+                vpxor    \T1, \XMM4, \XMM4
+                vmovdqu  \XMM4, 16*3(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM4
+                .endif
 
-        vpclmulqdq      $0x00, \T5, \XMM2, \T4
-        vpxor           \T4, \T7, \T7
+                vmovdqu  16*4(arg3, %r11), \T1
+                vpxor    \T1, \XMM5, \XMM5
+                vmovdqu  \XMM5, 16*4(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM5
+                .endif
 
-        vmovdqa         HashKey_7_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vmovdqu  16*5(arg3, %r11), \T1
+                vpxor    \T1, \XMM6, \XMM6
+                vmovdqu  \XMM6, 16*5(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM6
+                .endif
 
-        ######################
+                vmovdqu  16*6(arg3, %r11), \T1
+                vpxor    \T1, \XMM7, \XMM7
+                vmovdqu  \XMM7, 16*6(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM7
+                .endif
 
-        vpshufd         $0b01001110, \XMM3, \T2
-        vpxor           \XMM3, \T2, \T2
-        vmovdqa         HashKey_6(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM3, \T4
-        vpxor           \T4, \T6, \T6
+                vmovdqu  16*7(arg3, %r11), \T1
+                vpxor    \T1, \XMM8, \XMM8
+                vmovdqu  \XMM8, 16*7(arg2 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM8
+                .endif
 
-        vpclmulqdq      $0x00, \T5, \XMM3, \T4
-        vpxor           \T4, \T7, \T7
+                add     $128, %r11
 
-        vmovdqa         HashKey_6_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
+                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
 
-        ######################
+###############################################################################
 
-        vpshufd         $0b01001110, \XMM4, \T2
-        vpxor           \XMM4, \T2, \T2
-        vmovdqa         HashKey_5(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM4, \T4
-        vpxor           \T4, \T6, \T6
+_initial_blocks_done\@:
 
-        vpclmulqdq      $0x00, \T5, \XMM4, \T4
-        vpxor           \T4, \T7, \T7
+.endm
 
-        vmovdqa         HashKey_5_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
-        ######################
+        vmovdqa \XMM1, \T2
+        vmovdqa \XMM2, TMP2(%rsp)
+        vmovdqa \XMM3, TMP3(%rsp)
+        vmovdqa \XMM4, TMP4(%rsp)
+        vmovdqa \XMM5, TMP5(%rsp)
+        vmovdqa \XMM6, TMP6(%rsp)
+        vmovdqa \XMM7, TMP7(%rsp)
+        vmovdqa \XMM8, TMP8(%rsp)
 
-        vpshufd         $0b01001110, \XMM5, \T2
-        vpxor           \XMM5, \T2, \T2
-        vmovdqa         HashKey_4(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM5, \T4
-        vpxor           \T4, \T6, \T6
+.if \loop_idx == in_order
+                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
+                vpaddd  ONE(%rip), \XMM1, \XMM2
+                vpaddd  ONE(%rip), \XMM2, \XMM3
+                vpaddd  ONE(%rip), \XMM3, \XMM4
+                vpaddd  ONE(%rip), \XMM4, \XMM5
+                vpaddd  ONE(%rip), \XMM5, \XMM6
+                vpaddd  ONE(%rip), \XMM6, \XMM7
+                vpaddd  ONE(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
 
-        vpclmulqdq      $0x00, \T5, \XMM5, \T4
-        vpxor           \T4, \T7, \T7
+                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
+.else
+                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
+                vpaddd  ONEf(%rip), \XMM1, \XMM2
+                vpaddd  ONEf(%rip), \XMM2, \XMM3
+                vpaddd  ONEf(%rip), \XMM3, \XMM4
+                vpaddd  ONEf(%rip), \XMM4, \XMM5
+                vpaddd  ONEf(%rip), \XMM5, \XMM6
+                vpaddd  ONEf(%rip), \XMM6, \XMM7
+                vpaddd  ONEf(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
+.endif
 
-        vmovdqa         HashKey_4_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
 
-        ######################
+        #######################################################################
 
-        vpshufd         $0b01001110, \XMM6, \T2
-        vpxor           \XMM6, \T2, \T2
-        vmovdqa         HashKey_3(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM6, \T4
-        vpxor           \T4, \T6, \T6
+                vmovdqu (arg1), \T1
+                vpxor   \T1, \XMM1, \XMM1
+                vpxor   \T1, \XMM2, \XMM2
+                vpxor   \T1, \XMM3, \XMM3
+                vpxor   \T1, \XMM4, \XMM4
+                vpxor   \T1, \XMM5, \XMM5
+                vpxor   \T1, \XMM6, \XMM6
+                vpxor   \T1, \XMM7, \XMM7
+                vpxor   \T1, \XMM8, \XMM8
 
-        vpclmulqdq      $0x00, \T5, \XMM6, \T4
-        vpxor           \T4, \T7, \T7
+        #######################################################################
 
-        vmovdqa         HashKey_3_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
 
-        ######################
 
-        vpshufd         $0b01001110, \XMM7, \T2
-        vpxor           \XMM7, \T2, \T2
-        vmovdqa         HashKey_2(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM7, \T4
-        vpxor           \T4, \T6, \T6
 
-        vpclmulqdq      $0x00, \T5, \XMM7, \T4
-        vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_2_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vmovdqu 16*1(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        ######################
+                vmovdqu 16*2(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        vpshufd         $0b01001110, \XMM8, \T2
-        vpxor           \XMM8, \T2, \T2
-        vmovdqa         HashKey(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM8, \T4
-        vpxor           \T4, \T6, \T6
 
-        vpclmulqdq      $0x00, \T5, \XMM8, \T4
-        vpxor           \T4, \T7, \T7
+        #######################################################################
 
-        vmovdqa         HashKey_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vmovdqa         HashKey_8(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
+        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
 
-        vpxor           \T2, \XMM1, \XMM1
-        vpxor           \T6, \XMM1, \XMM1
-        vpxor           \T7, \XMM1, \T2
+        vpshufd         $0b01001110, \T2, \T6
+        vpxor           \T2, \T6, \T6
 
+        vmovdqa         HashKey_8_k(arg1), \T5
+        vpclmulqdq      $0x00, \T5, \T6, \T6
 
+                vmovdqu 16*3(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
+        vmovdqa         TMP2(%rsp), \T1
+        vmovdqa         HashKey_7(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        vpslldq $8, \T2, \T4
-        vpsrldq $8, \T2, \T2
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_7_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpxor   \T4, \T7, \T7
-        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
-				# the accumulated carry-less multiplications
+                vmovdqu 16*4(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        #######################################################################
-        #first phase of the reduction
-        vpslld  $31, \T7, \T2   # packed right shifting << 31
-        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
-        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
+        #######################################################################
 
-        vpxor   \T3, \T2, \T2   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+        vmovdqa         TMP3(%rsp), \T1
+        vmovdqa         HashKey_6(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_6_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
-        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
-        #######################################################################
+                vmovdqu 16*5(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
+        vmovdqa         TMP4(%rsp), \T1
+        vmovdqa         HashKey_5(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        #second phase of the reduction
-        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
-        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
-        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_5_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpxor   \T1, \T2, \T2
-        vpxor   \T2, \T7, \T7
-        vpxor   \T7, \T6, \T6   # the result is in T6
+                vmovdqu 16*6(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-.endm
 
+        vmovdqa         TMP5(%rsp), \T1
+        vmovdqa         HashKey_4(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro  GCM_ENC_DEC_AVX     ENC_DEC
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_4_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
+                vmovdqu 16*7(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        mov     %rsp, %r14
+        vmovdqa         TMP6(%rsp), \T1
+        vmovdqa         HashKey_3(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_3_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
 
+                vmovdqu 16*8(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                  # align rsp to 64 bytes
+        vmovdqa         TMP7(%rsp), \T1
+        vmovdqa         HashKey_2(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_2_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
+        #######################################################################
 
-        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
-        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
+                vmovdqu 16*9(arg1), \T5
+                vaesenc \T5, \XMM1, \XMM1
+                vaesenc \T5, \XMM2, \XMM2
+                vaesenc \T5, \XMM3, \XMM3
+                vaesenc \T5, \XMM4, \XMM4
+                vaesenc \T5, \XMM5, \XMM5
+                vaesenc \T5, \XMM6, \XMM6
+                vaesenc \T5, \XMM7, \XMM7
+                vaesenc \T5, \XMM8, \XMM8
 
-        mov     %r13, %r12
-        shr     $4, %r12
-        and     $7, %r12
-        jz      _initial_num_blocks_is_0\@
+        vmovdqa         TMP8(%rsp), \T1
+        vmovdqa         HashKey(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        cmp     $7, %r12
-        je      _initial_num_blocks_is_7\@
-        cmp     $6, %r12
-        je      _initial_num_blocks_is_6\@
-        cmp     $5, %r12
-        je      _initial_num_blocks_is_5\@
-        cmp     $4, %r12
-        je      _initial_num_blocks_is_4\@
-        cmp     $3, %r12
-        je      _initial_num_blocks_is_3\@
-        cmp     $2, %r12
-        je      _initial_num_blocks_is_2\@
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqa         HashKey_k(arg1), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        jmp     _initial_num_blocks_is_1\@
+        vpxor           \T4, \T6, \T6
+        vpxor           \T7, \T6, \T6
 
-_initial_num_blocks_is_7\@:
-        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*7, %r13
-        jmp     _initial_blocks_encrypted\@
+                vmovdqu 16*10(arg1), \T5
 
-_initial_num_blocks_is_6\@:
-        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*6, %r13
-        jmp     _initial_blocks_encrypted\@
+	i = 0
+	j = 1
+	setreg
+.rep 8
+		vpxor	16*i(arg3, %r11), \T5, \T2
+                .if \ENC_DEC == ENC
+                vaesenclast     \T2, reg_j, reg_j
+                .else
+                vaesenclast     \T2, reg_j, \T3
+                vmovdqu 16*i(arg3, %r11), reg_j
+                vmovdqu \T3, 16*i(arg2, %r11)
+                .endif
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+	#######################################################################
 
-_initial_num_blocks_is_5\@:
-        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*5, %r13
-        jmp     _initial_blocks_encrypted\@
 
-_initial_num_blocks_is_4\@:
-        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*4, %r13
-        jmp     _initial_blocks_encrypted\@
+	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
+	vpxor	\T3, \T7, \T7
+	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
 
-_initial_num_blocks_is_3\@:
-        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*3, %r13
-        jmp     _initial_blocks_encrypted\@
 
-_initial_num_blocks_is_2\@:
-        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*2, %r13
-        jmp     _initial_blocks_encrypted\@
 
-_initial_num_blocks_is_1\@:
-        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*1, %r13
-        jmp     _initial_blocks_encrypted\@
+	#######################################################################
+	#first phase of the reduction
+	#######################################################################
+        vpslld  $31, \T7, \T2                           # packed right shifting << 31
+        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
+        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
 
-_initial_num_blocks_is_0\@:
-        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        vpxor   \T3, \T2, \T2                           # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
+        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
 
-_initial_blocks_encrypted\@:
-        cmp     $0, %r13
-        je      _zero_cipher_left\@
+        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
+        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
+	#######################################################################
+                .if \ENC_DEC == ENC
+		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
+                .endif
 
-        sub     $128, %r13
-        je      _eight_cipher_left\@
+	#######################################################################
+	#second phase of the reduction
+        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
+        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
+        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
+        vpxor   \T3, \T2, \T2                           # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
+        vpxor   \T1, \T2, \T2
+        vpxor   \T2, \T7, \T7
+        vpxor   \T7, \T6, \T6                           # the result is in T6
+	#######################################################################
 
+		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
 
 
-        vmovd   %xmm9, %r15d
-        and     $255, %r15d
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+	vpxor	\T6, \XMM1, \XMM1
 
 
-_encrypt_by_8_new\@:
-        cmp     $(255-8), %r15d
-        jg      _encrypt_by_8\@
 
+.endm
 
 
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+# GHASH the last 4 ciphertext blocks.
+.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        jmp     _eight_cipher_left\@
+        ## Karatsuba Method
 
-_encrypt_by_8\@:
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpshufd         $0b01001110, \XMM1, \T2
+        vpxor           \XMM1, \T2, \T2
+        vmovdqa         HashKey_8(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM1, \T6
+        vpclmulqdq      $0x00, \T5, \XMM1, \T7
 
+        vmovdqa         HashKey_8_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \XMM1
 
+        ######################
 
+        vpshufd         $0b01001110, \XMM2, \T2
+        vpxor           \XMM2, \T2, \T2
+        vmovdqa         HashKey_7(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM2, \T4
+        vpxor           \T4, \T6, \T6
 
-_eight_cipher_left\@:
-        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+        vpclmulqdq      $0x00, \T5, \XMM2, \T4
+        vpxor           \T4, \T7, \T7
 
+        vmovdqa         HashKey_7_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-_zero_cipher_left\@:
-        cmp     $16, arg4
-        jl      _only_less_than_16\@
+        ######################
 
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        vpshufd         $0b01001110, \XMM3, \T2
+        vpxor           \XMM3, \T2, \T2
+        vmovdqa         HashKey_6(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM3, \T4
+        vpxor           \T4, \T6, \T6
 
-        je      _multiple_of_16_bytes\@
+        vpclmulqdq      $0x00, \T5, \XMM3, \T4
+        vpxor           \T4, \T7, \T7
 
-        # handle the last <16 Byte block seperately
+        vmovdqa         HashKey_6_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
+        ######################
 
-        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        vpshufd         $0b01001110, \XMM4, \T2
+        vpxor           \XMM4, \T2, \T2
+        vmovdqa         HashKey_5(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM4, \T4
+        vpxor           \T4, \T6, \T6
 
-        sub     $16, %r11
-        add     %r13, %r11
-        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+        vpclmulqdq      $0x00, \T5, \XMM4, \T4
+        vpxor           \T4, \T7, \T7
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
-        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
-        jmp     _final_ghash_mul\@
+        vmovdqa         HashKey_5_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-_only_less_than_16\@:
-        # check for 0 length
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        ######################
 
-        je      _multiple_of_16_bytes\@
+        vpshufd         $0b01001110, \XMM5, \T2
+        vpxor           \XMM5, \T2, \T2
+        vmovdqa         HashKey_4(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM5, \T4
+        vpxor           \T4, \T6, \T6
 
-        # handle the last <16 Byte block seperately
+        vpclmulqdq      $0x00, \T5, \XMM5, \T4
+        vpxor           \T4, \T7, \T7
 
+        vmovdqa         HashKey_4_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        ######################
 
+        vpshufd         $0b01001110, \XMM6, \T2
+        vpxor           \XMM6, \T2, \T2
+        vmovdqa         HashKey_3(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM6, \T4
+        vpxor           \T4, \T6, \T6
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
+        vpclmulqdq      $0x00, \T5, \XMM6, \T4
+        vpxor           \T4, \T7, \T7
 
-_get_last_16_byte_loop\@:
-        movb    (arg3, %r11),  %al
-        movb    %al,  TMP1 (%rsp , %r11)
-        add     $1, %r11
-        cmp     %r13,  %r11
-        jne     _get_last_16_byte_loop\@
+        vmovdqa         HashKey_3_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        vmovdqu  TMP1(%rsp), %xmm1
+        ######################
 
-        sub     $16, %r11
+        vpshufd         $0b01001110, \XMM7, \T2
+        vpxor           \XMM7, \T2, \T2
+        vmovdqa         HashKey_2(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM7, \T4
+        vpxor           \T4, \T6, \T6
 
-_final_ghash_mul\@:
-        .if  \ENC_DEC ==  DEC
-        vmovdqa %xmm1, %xmm2
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
-						     # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm2, %xmm2
-        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
-        vpxor   %xmm2, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        .else
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
-						     # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        vpxor   %xmm9, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
-        .endif
+        vpclmulqdq      $0x00, \T5, \XMM7, \T4
+        vpxor           \T4, \T7, \T7
 
+        vmovdqa         HashKey_2_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        #############################
-        # output r13 Bytes
-        vmovq   %xmm9, %rax
-        cmp     $8, %r13
-        jle     _less_than_8_bytes_left\@
+        ######################
 
-        mov     %rax, (arg2 , %r11)
-        add     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        vmovq   %xmm9, %rax
-        sub     $8, %r13
+        vpshufd         $0b01001110, \XMM8, \T2
+        vpxor           \XMM8, \T2, \T2
+        vmovdqa         HashKey(arg1), \T5
+        vpclmulqdq      $0x11, \T5, \XMM8, \T4
+        vpxor           \T4, \T6, \T6
 
-_less_than_8_bytes_left\@:
-        movb    %al, (arg2 , %r11)
-        add     $1, %r11
-        shr     $8, %rax
-        sub     $1, %r13
-        jne     _less_than_8_bytes_left\@
-        #############################
+        vpclmulqdq      $0x00, \T5, \XMM8, \T4
+        vpxor           \T4, \T7, \T7
 
-_multiple_of_16_bytes\@:
-        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
-        shl     $3, %r12                             # convert into number of bits
-        vmovd   %r12d, %xmm15                        # len(A) in xmm15
+        vmovdqa         HashKey_k(arg1), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        shl     $3, arg4                             # len(C) in bits  (*128)
-        vmovq   arg4, %xmm1
-        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
-        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
+        vpxor           \T2, \XMM1, \XMM1
+        vpxor           \T6, \XMM1, \XMM1
+        vpxor           \T7, \XMM1, \T2
 
-        vpxor   %xmm15, %xmm14, %xmm14
-        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
-        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
 
-        mov     arg5, %rax                           # rax = *Y0
-        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
 
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
 
-        vpxor   %xmm14, %xmm9, %xmm9
+        vpslldq $8, \T2, \T4
+        vpsrldq $8, \T2, \T2
 
+        vpxor   \T4, \T7, \T7
+        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
+				# the accumulated carry-less multiplications
 
+        #######################################################################
+        #first phase of the reduction
+        vpslld  $31, \T7, \T2   # packed right shifting << 31
+        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
+        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
 
-_return_T\@:
-        mov     arg8, %r10              # r10 = authTag
-        mov     arg9, %r11              # r11 = auth_tag_len
+        vpxor   \T3, \T2, \T2   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-        cmp     $16, %r11
-        je      _T_16\@
+        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
 
-        cmp     $8, %r11
-        jl      _T_4\@
+        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
+        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
+        #######################################################################
 
-_T_8\@:
-        vmovq   %xmm9, %rax
-        mov     %rax, (%r10)
-        add     $8, %r10
-        sub     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_4\@:
-        vmovd   %xmm9, %eax
-        mov     %eax, (%r10)
-        add     $4, %r10
-        sub     $4, %r11
-        vpsrldq     $4, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_123\@:
-        vmovd     %xmm9, %eax
-        cmp     $2, %r11
-        jl     _T_1\@
-        mov     %ax, (%r10)
-        cmp     $2, %r11
-        je     _return_T_done\@
-        add     $2, %r10
-        sar     $16, %eax
-_T_1\@:
-        mov     %al, (%r10)
-        jmp     _return_T_done\@
 
-_T_16\@:
-        vmovdqu %xmm9, (%r10)
+        #second phase of the reduction
+        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
+        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
+        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
+        vpxor   \T3, \T2, \T2   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-_return_T_done\@:
-        mov     %r14, %rsp
+        vpxor   \T1, \T2, \T2
+        vpxor   \T2, \T7, \T7
+        vpxor   \T7, \T6, \T6   # the result is in T6
 
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
 .endm
 
-
 #############################################################
 #void   aesni_gcm_precomp_avx_gen2
 #        (gcm_data     *my_ctx_data,
@@ -1593,7 +1591,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_enc_avx_gen2)
-        GCM_ENC_DEC_AVX     ENC
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
 	ret
 ENDPROC(aesni_gcm_enc_avx_gen2)
 
@@ -1614,7 +1612,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_dec_avx_gen2)
-        GCM_ENC_DEC_AVX     DEC
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC
 	ret
 ENDPROC(aesni_gcm_dec_avx_gen2)
 #endif /* CONFIG_AS_AVX */
@@ -2378,477 +2376,164 @@ _initial_blocks_done\@:
         vmovdqa         HashKey_7(arg1), \T5
         vpshufd         $0b01001110, \XMM2, \T2
         vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM2, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM2, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM2, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_6(arg1), \T5
-        vpshufd         $0b01001110, \XMM3, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM3, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM3, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM3, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_5(arg1), \T5
-        vpshufd         $0b01001110, \XMM4, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM4, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM4, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM4, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_4(arg1), \T5
-        vpshufd         $0b01001110, \XMM5, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM5, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM5, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM5, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_3(arg1), \T5
-        vpshufd         $0b01001110, \XMM6, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM6, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM6, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM6, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_2(arg1), \T5
-        vpshufd         $0b01001110, \XMM7, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM7, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM7, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM7, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey(arg1), \T5
-        vpshufd         $0b01001110, \XMM8, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM8, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM8, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM8, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-        vpxor           \T6, \XMM1, \XMM1
-        vpxor           \T7, \XMM1, \T2
-
-
-
-
-        vpslldq $8, \T2, \T4
-        vpsrldq $8, \T2, \T2
-
-        vpxor   \T4, \T7, \T7
-        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
-						   # accumulated carry-less multiplications
-
-        #######################################################################
-        #first phase of the reduction
-        vmovdqa         POLY2(%rip), \T3
-
-        vpclmulqdq      $0x01, \T7, \T3, \T2
-        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
-
-        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
-        #######################################################################
-
-
-        #second phase of the reduction
-        vpclmulqdq      $0x00, \T7, \T3, \T2
-        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
-        vpclmulqdq      $0x10, \T7, \T3, \T4
-        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
-        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
-        #######################################################################
-        vpxor           \T4, \T6, \T6              # the result is in T6
-.endm
-
-
-
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro  GCM_ENC_DEC_AVX2     ENC_DEC
-
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
-
-        mov     %rsp, %r14
-
-
-
-
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                         # align rsp to 64 bytes
-
-
-        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
-
-        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
-        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
-
-        mov     %r13, %r12
-        shr     $4, %r12
-        and     $7, %r12
-        jz      _initial_num_blocks_is_0\@
-
-        cmp     $7, %r12
-        je      _initial_num_blocks_is_7\@
-        cmp     $6, %r12
-        je      _initial_num_blocks_is_6\@
-        cmp     $5, %r12
-        je      _initial_num_blocks_is_5\@
-        cmp     $4, %r12
-        je      _initial_num_blocks_is_4\@
-        cmp     $3, %r12
-        je      _initial_num_blocks_is_3\@
-        cmp     $2, %r12
-        je      _initial_num_blocks_is_2\@
-
-        jmp     _initial_num_blocks_is_1\@
-
-_initial_num_blocks_is_7\@:
-        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*7, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_6\@:
-        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*6, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_5\@:
-        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*5, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_4\@:
-        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*4, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_3\@:
-        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*3, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_2\@:
-        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*2, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_1\@:
-        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*1, %r13
-        jmp     _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_0\@:
-        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-_initial_blocks_encrypted\@:
-        cmp     $0, %r13
-        je      _zero_cipher_left\@
-
-        sub     $128, %r13
-        je      _eight_cipher_left\@
-
-
-
+        vpxor           \XMM2, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        vmovd   %xmm9, %r15d
-        and     $255, %r15d
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpclmulqdq      $0x11, \T5, \XMM2, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM2, \T4
+        vpxor           \T4, \T7, \T7
 
-_encrypt_by_8_new\@:
-        cmp     $(255-8), %r15d
-        jg      _encrypt_by_8\@
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
+        vpxor           \T2, \XMM1, \XMM1
 
+        ######################
 
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+        vmovdqa         HashKey_6(arg1), \T5
+        vpshufd         $0b01001110, \XMM3, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM3, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        jmp     _eight_cipher_left\@
+        vpclmulqdq      $0x11, \T5, \XMM3, \T4
+        vpxor           \T4, \T6, \T6
 
-_encrypt_by_8\@:
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+        vpclmulqdq      $0x00, \T5, \XMM3, \T4
+        vpxor           \T4, \T7, \T7
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
+        vpxor           \T2, \XMM1, \XMM1
 
+        ######################
 
+        vmovdqa         HashKey_5(arg1), \T5
+        vpshufd         $0b01001110, \XMM4, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM4, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-_eight_cipher_left\@:
-        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+        vpclmulqdq      $0x11, \T5, \XMM4, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM4, \T4
+        vpxor           \T4, \T7, \T7
 
-_zero_cipher_left\@:
-        cmp     $16, arg4
-        jl      _only_less_than_16\@
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        vpxor           \T2, \XMM1, \XMM1
 
-        je      _multiple_of_16_bytes\@
+        ######################
 
-        # handle the last <16 Byte block seperately
+        vmovdqa         HashKey_4(arg1), \T5
+        vpshufd         $0b01001110, \XMM5, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM5, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
+        vpclmulqdq      $0x11, \T5, \XMM5, \T4
+        vpxor           \T4, \T6, \T6
 
-        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        vpclmulqdq      $0x00, \T5, \XMM5, \T4
+        vpxor           \T4, \T7, \T7
 
-        sub     $16, %r11
-        add     %r13, %r11
-        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer
-						     # to be able to shift 16-r13 bytes
-						     # (r13 is the number of bytes in plaintext mod 16)
-        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
-        jmp     _final_ghash_mul\@
+        vpxor           \T2, \XMM1, \XMM1
 
-_only_less_than_16\@:
-        # check for 0 length
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        ######################
 
-        je      _multiple_of_16_bytes\@
+        vmovdqa         HashKey_3(arg1), \T5
+        vpshufd         $0b01001110, \XMM6, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM6, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        # handle the last <16 Byte block seperately
+        vpclmulqdq      $0x11, \T5, \XMM6, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM6, \T4
+        vpxor           \T4, \T7, \T7
 
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
+        vpxor           \T2, \XMM1, \XMM1
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
+        ######################
 
-_get_last_16_byte_loop\@:
-        movb    (arg3, %r11),  %al
-        movb    %al,  TMP1 (%rsp , %r11)
-        add     $1, %r11
-        cmp     %r13,  %r11
-        jne     _get_last_16_byte_loop\@
+        vmovdqa         HashKey_2(arg1), \T5
+        vpshufd         $0b01001110, \XMM7, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM7, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        vmovdqu  TMP1(%rsp), %xmm1
+        vpclmulqdq      $0x11, \T5, \XMM7, \T4
+        vpxor           \T4, \T6, \T6
 
-        sub     $16, %r11
+        vpclmulqdq      $0x00, \T5, \XMM7, \T4
+        vpxor           \T4, \T7, \T7
 
-_final_ghash_mul\@:
-        .if  \ENC_DEC ==  DEC
-        vmovdqa %xmm1, %xmm2
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm2, %xmm2
-        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
-        vpxor   %xmm2, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        .else
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        vpxor   %xmm9, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
-        .endif
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
+        vpxor           \T2, \XMM1, \XMM1
 
-        #############################
-        # output r13 Bytes
-        vmovq   %xmm9, %rax
-        cmp     $8, %r13
-        jle     _less_than_8_bytes_left\@
+        ######################
 
-        mov     %rax, (arg2 , %r11)
-        add     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        vmovq   %xmm9, %rax
-        sub     $8, %r13
+        vmovdqa         HashKey(arg1), \T5
+        vpshufd         $0b01001110, \XMM8, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM8, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-_less_than_8_bytes_left\@:
-        movb    %al, (arg2 , %r11)
-        add     $1, %r11
-        shr     $8, %rax
-        sub     $1, %r13
-        jne     _less_than_8_bytes_left\@
-        #############################
+        vpclmulqdq      $0x11, \T5, \XMM8, \T4
+        vpxor           \T4, \T6, \T6
 
-_multiple_of_16_bytes\@:
-        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
-        shl     $3, %r12                             # convert into number of bits
-        vmovd   %r12d, %xmm15                        # len(A) in xmm15
+        vpclmulqdq      $0x00, \T5, \XMM8, \T4
+        vpxor           \T4, \T7, \T7
 
-        shl     $3, arg4                             # len(C) in bits  (*128)
-        vmovq   arg4, %xmm1
-        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
-        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        vpxor   %xmm15, %xmm14, %xmm14
-        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
-        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
+        vpxor           \T2, \XMM1, \XMM1
+        vpxor           \T6, \XMM1, \XMM1
+        vpxor           \T7, \XMM1, \T2
 
-        mov     arg5, %rax                           # rax = *Y0
-        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
 
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
 
-        vpxor   %xmm14, %xmm9, %xmm9
 
+        vpslldq $8, \T2, \T4
+        vpsrldq $8, \T2, \T2
 
+        vpxor   \T4, \T7, \T7
+        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
+						   # accumulated carry-less multiplications
 
-_return_T\@:
-        mov     arg8, %r10              # r10 = authTag
-        mov     arg9, %r11              # r11 = auth_tag_len
+        #######################################################################
+        #first phase of the reduction
+        vmovdqa         POLY2(%rip), \T3
 
-        cmp     $16, %r11
-        je      _T_16\@
+        vpclmulqdq      $0x01, \T7, \T3, \T2
+        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
 
-        cmp     $8, %r11
-        jl      _T_4\@
+        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
+        #######################################################################
 
-_T_8\@:
-        vmovq   %xmm9, %rax
-        mov     %rax, (%r10)
-        add     $8, %r10
-        sub     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_4\@:
-        vmovd   %xmm9, %eax
-        mov     %eax, (%r10)
-        add     $4, %r10
-        sub     $4, %r11
-        vpsrldq     $4, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_123\@:
-        vmovd     %xmm9, %eax
-        cmp     $2, %r11
-        jl     _T_1\@
-        mov     %ax, (%r10)
-        cmp     $2, %r11
-        je     _return_T_done\@
-        add     $2, %r10
-        sar     $16, %eax
-_T_1\@:
-        mov     %al, (%r10)
-        jmp     _return_T_done\@
 
-_T_16\@:
-        vmovdqu %xmm9, (%r10)
+        #second phase of the reduction
+        vpclmulqdq      $0x00, \T7, \T3, \T2
+        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
 
-_return_T_done\@:
-        mov     %r14, %rsp
+        vpclmulqdq      $0x10, \T7, \T3, \T4
+        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
 
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
+        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
+        #######################################################################
+        vpxor           \T4, \T6, \T6              # the result is in T6
 .endm
 
 
+
 #############################################################
 #void   aesni_gcm_precomp_avx_gen4
 #        (gcm_data     *my_ctx_data,
@@ -2918,7 +2603,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_enc_avx_gen4)
-        GCM_ENC_DEC_AVX2     ENC
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC
 	ret
 ENDPROC(aesni_gcm_enc_avx_gen4)
 
@@ -2939,7 +2624,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_dec_avx_gen4)
-        GCM_ENC_DEC_AVX2     DEC
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC
 	ret
 ENDPROC(aesni_gcm_dec_avx_gen4)
 
-- 
cgit v1.2.3-59-g8ed1b


From de85fc46b1037099c78d2fec08a662079e568d62 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:57:00 +0000
Subject: crypto: aesni - Introduce gcm_context_data

Add the gcm_context_data structure to the avx asm routines.
This will be necessary to support both 256 bit keys and
scatter/gather.

The pre-computed HashKeys are now stored in the gcm_context_data
struct, which is expanded to hold the greater number of hashkeys
necessary for avx.

Loads and stores to the new struct are always done unlaligned to
avoid compiler issues, see e5b954e8 "Use unaligned loads from
gcm_context_data"

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 378 +++++++++++++++----------------
 arch/x86/crypto/aesni-intel_glue.c       |  58 +++--
 2 files changed, 215 insertions(+), 221 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 318135a77975..284f1b8b88fc 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -182,43 +182,22 @@ aad_shift_arr:
 .text
 
 
-##define the fields of the gcm aes context
-#{
-#        u8 expanded_keys[16*11] store expanded keys
-#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
-#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
-#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
-#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
-#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
-#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
-#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
-#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
-#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-#} gcm_ctx#
-
-HashKey        = 16*11   # store HashKey <<1 mod poly here
-HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
-HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
-HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
-HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
-HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
-HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
-HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
-HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+HashKey        = 16*6   # store HashKey <<1 mod poly here
+HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
+HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
+HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
+HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
+HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
+HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
+HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
+HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
 
 #define arg1 %rdi
 #define arg2 %rsi
@@ -229,6 +208,7 @@ HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
 #define arg7 STACK_OFFSET+8*1(%r14)
 #define arg8 STACK_OFFSET+8*2(%r14)
 #define arg9 STACK_OFFSET+8*3(%r14)
+#define arg10 STACK_OFFSET+8*4(%r14)
 
 i = 0
 j = 0
@@ -300,9 +280,9 @@ VARIABLE_OFFSET = 16*8
         and     $~63, %rsp                  # align rsp to 64 bytes
 
 
-        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
+        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
 
-        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
+        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
         and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
 
         mov     %r13, %r12
@@ -413,11 +393,11 @@ _eight_cipher_left\@:
 
 
 _zero_cipher_left\@:
-        cmp     $16, arg4
+        cmp     $16, arg5
         jl      _only_less_than_16\@
 
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        mov     arg5, %r13
+        and     $15, %r13                            # r13 = (arg5 mod 16)
 
         je      _multiple_of_16_bytes\@
 
@@ -430,7 +410,7 @@ _zero_cipher_left\@:
 
         sub     $16, %r11
         add     %r13, %r11
-        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+        vmovdqu (arg4, %r11), %xmm1                  # receive the last <16 Byte block
 
         lea     SHIFT_MASK+16(%rip), %r12
         sub     %r13, %r12                           # adjust the shuffle mask pointer to be
@@ -442,8 +422,8 @@ _zero_cipher_left\@:
 
 _only_less_than_16\@:
         # check for 0 length
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        mov     arg5, %r13
+        and     $15, %r13                            # r13 = (arg5 mod 16)
 
         je      _multiple_of_16_bytes\@
 
@@ -461,7 +441,7 @@ _only_less_than_16\@:
 						     # number of bytes in plaintext mod 16)
 
 _get_last_16_byte_loop\@:
-        movb    (arg3, %r11),  %al
+        movb    (arg4, %r11),  %al
         movb    %al,  TMP1 (%rsp , %r11)
         add     $1, %r11
         cmp     %r13,  %r11
@@ -506,14 +486,14 @@ _final_ghash_mul\@:
         cmp     $8, %r13
         jle     _less_than_8_bytes_left\@
 
-        mov     %rax, (arg2 , %r11)
+        mov     %rax, (arg3 , %r11)
         add     $8, %r11
         vpsrldq $8, %xmm9, %xmm9
         vmovq   %xmm9, %rax
         sub     $8, %r13
 
 _less_than_8_bytes_left\@:
-        movb    %al, (arg2 , %r11)
+        movb    %al, (arg3 , %r11)
         add     $1, %r11
         shr     $8, %rax
         sub     $1, %r13
@@ -521,12 +501,12 @@ _less_than_8_bytes_left\@:
         #############################
 
 _multiple_of_16_bytes\@:
-        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
+        mov     arg8, %r12                           # r12 = aadLen (number of bytes)
         shl     $3, %r12                             # convert into number of bits
         vmovd   %r12d, %xmm15                        # len(A) in xmm15
 
-        shl     $3, arg4                             # len(C) in bits  (*128)
-        vmovq   arg4, %xmm1
+        shl     $3, arg5                             # len(C) in bits  (*128)
+        vmovq   arg5, %xmm1
         vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
         vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
 
@@ -534,7 +514,7 @@ _multiple_of_16_bytes\@:
         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
         vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
 
-        mov     arg5, %rax                           # rax = *Y0
+        mov     arg6, %rax                           # rax = *Y0
         vmovdqu (%rax), %xmm9                        # xmm9 = Y0
 
         ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
@@ -544,8 +524,8 @@ _multiple_of_16_bytes\@:
 
 
 _return_T\@:
-        mov     arg8, %r10              # r10 = authTag
-        mov     arg9, %r11              # r11 = auth_tag_len
+        mov     arg9, %r10              # r10 = authTag
+        mov     arg10, %r11              # r11 = auth_tag_len
 
         cmp     $16, %r11
         je      _T_16\@
@@ -655,49 +635,49 @@ _return_T_done\@:
 
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_k(arg1)
+        vmovdqu  \T1, HashKey_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
-        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
+        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_2_k(arg1)
+        vmovdqu  \T1, HashKey_2_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
-        vmovdqa  \T5, HashKey_3(arg1)
+        vmovdqu  \T5, HashKey_3(arg2)
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_3_k(arg1)
+        vmovdqu  \T1, HashKey_3_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
-        vmovdqa  \T5, HashKey_4(arg1)
+        vmovdqu  \T5, HashKey_4(arg2)
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_4_k(arg1)
+        vmovdqu  \T1, HashKey_4_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
-        vmovdqa  \T5, HashKey_5(arg1)
+        vmovdqu  \T5, HashKey_5(arg2)
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_5_k(arg1)
+        vmovdqu  \T1, HashKey_5_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
-        vmovdqa  \T5, HashKey_6(arg1)
+        vmovdqu  \T5, HashKey_6(arg2)
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_6_k(arg1)
+        vmovdqu  \T1, HashKey_6_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
-        vmovdqa  \T5, HashKey_7(arg1)
+        vmovdqu  \T5, HashKey_7(arg2)
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_7_k(arg1)
+        vmovdqu  \T1, HashKey_7_k(arg2)
 
         GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
-        vmovdqa  \T5, HashKey_8(arg1)
+        vmovdqu  \T5, HashKey_8(arg2)
         vpshufd  $0b01001110, \T5, \T1
         vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_8_k(arg1)
+        vmovdqu  \T1, HashKey_8_k(arg2)
 
 .endm
 
@@ -706,15 +686,15 @@ _return_T_done\@:
 ## num_initial_blocks = b mod 4#
 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
 ## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
 
 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
 	i = (8-\num_initial_blocks)
 	j = 0
 	setreg
 
-	mov     arg6, %r10                      # r10 = AAD
-	mov     arg7, %r12                      # r12 = aadLen
+	mov     arg7, %r10                      # r10 = AAD
+	mov     arg8, %r12                      # r12 = aadLen
 
 
 	mov     %r12, %r11
@@ -780,7 +760,7 @@ _get_AAD_done\@:
 	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
-	mov     arg5, %rax                     # rax = *Y0
+	mov     arg6, %rax                     # rax = *Y0
 	vmovdqu (%rax), \CTR                   # CTR = Y0
 	vpshufb SHUF_MASK(%rip), \CTR, \CTR
 
@@ -833,9 +813,9 @@ _get_AAD_done\@:
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
-                vmovdqu (arg3, %r11), \T1
+                vmovdqu (arg4, %r11), \T1
                 vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
+                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
                 add     $16, %r11
 .if  \ENC_DEC == DEC
                 vmovdqa \T1, reg_i
@@ -936,58 +916,58 @@ _get_AAD_done\@:
                 vaesenclast  \T_key, \XMM7, \XMM7
                 vaesenclast  \T_key, \XMM8, \XMM8
 
-                vmovdqu  (arg3, %r11), \T1
+                vmovdqu  (arg4, %r11), \T1
                 vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg2 , %r11)
+                vmovdqu  \XMM1, (arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM1
                 .endif
 
-                vmovdqu  16*1(arg3, %r11), \T1
+                vmovdqu  16*1(arg4, %r11), \T1
                 vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg2 , %r11)
+                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM2
                 .endif
 
-                vmovdqu  16*2(arg3, %r11), \T1
+                vmovdqu  16*2(arg4, %r11), \T1
                 vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg2 , %r11)
+                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM3
                 .endif
 
-                vmovdqu  16*3(arg3, %r11), \T1
+                vmovdqu  16*3(arg4, %r11), \T1
                 vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg2 , %r11)
+                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM4
                 .endif
 
-                vmovdqu  16*4(arg3, %r11), \T1
+                vmovdqu  16*4(arg4, %r11), \T1
                 vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg2 , %r11)
+                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM5
                 .endif
 
-                vmovdqu  16*5(arg3, %r11), \T1
+                vmovdqu  16*5(arg4, %r11), \T1
                 vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg2 , %r11)
+                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM6
                 .endif
 
-                vmovdqu  16*6(arg3, %r11), \T1
+                vmovdqu  16*6(arg4, %r11), \T1
                 vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg2 , %r11)
+                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM7
                 .endif
 
-                vmovdqu  16*7(arg3, %r11), \T1
+                vmovdqu  16*7(arg4, %r11), \T1
                 vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg2 , %r11)
+                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM8
                 .endif
@@ -1012,7 +992,7 @@ _initial_blocks_done\@:
 
 # encrypt 8 blocks at a time
 # ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
 # r11 is the data offset value
 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
@@ -1098,14 +1078,14 @@ _initial_blocks_done\@:
 
         #######################################################################
 
-        vmovdqa         HashKey_8(arg1), \T5
+        vmovdqu         HashKey_8(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
         vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
 
         vpshufd         $0b01001110, \T2, \T6
         vpxor           \T2, \T6, \T6
 
-        vmovdqa         HashKey_8_k(arg1), \T5
+        vmovdqu         HashKey_8_k(arg2), \T5
         vpclmulqdq      $0x00, \T5, \T6, \T6
 
                 vmovdqu 16*3(arg1), \T1
@@ -1119,7 +1099,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP2(%rsp), \T1
-        vmovdqa         HashKey_7(arg1), \T5
+        vmovdqu         HashKey_7(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1127,7 +1107,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_7_k(arg1), \T5
+        vmovdqu         HashKey_7_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1144,7 +1124,7 @@ _initial_blocks_done\@:
         #######################################################################
 
         vmovdqa         TMP3(%rsp), \T1
-        vmovdqa         HashKey_6(arg1), \T5
+        vmovdqu         HashKey_6(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1152,7 +1132,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_6_k(arg1), \T5
+        vmovdqu         HashKey_6_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1167,7 +1147,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP4(%rsp), \T1
-        vmovdqa         HashKey_5(arg1), \T5
+        vmovdqu         HashKey_5(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1175,7 +1155,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_5_k(arg1), \T5
+        vmovdqu         HashKey_5_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1191,7 +1171,7 @@ _initial_blocks_done\@:
 
 
         vmovdqa         TMP5(%rsp), \T1
-        vmovdqa         HashKey_4(arg1), \T5
+        vmovdqu         HashKey_4(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1199,7 +1179,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_4_k(arg1), \T5
+        vmovdqu         HashKey_4_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1214,7 +1194,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP6(%rsp), \T1
-        vmovdqa         HashKey_3(arg1), \T5
+        vmovdqu         HashKey_3(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1222,7 +1202,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_3_k(arg1), \T5
+        vmovdqu         HashKey_3_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1238,7 +1218,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP7(%rsp), \T1
-        vmovdqa         HashKey_2(arg1), \T5
+        vmovdqu         HashKey_2(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1246,7 +1226,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_2_k(arg1), \T5
+        vmovdqu         HashKey_2_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1263,7 +1243,7 @@ _initial_blocks_done\@:
                 vaesenc \T5, \XMM8, \XMM8
 
         vmovdqa         TMP8(%rsp), \T1
-        vmovdqa         HashKey(arg1), \T5
+        vmovdqu         HashKey(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
         vpclmulqdq      $0x00, \T5, \T1, \T3
@@ -1271,7 +1251,7 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \T1, \T3
         vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_k(arg1), \T5
+        vmovdqu         HashKey_k(arg2), \T5
         vpclmulqdq      $0x10, \T5, \T3, \T3
         vpxor           \T3, \T6, \T6
 
@@ -1284,13 +1264,13 @@ _initial_blocks_done\@:
 	j = 1
 	setreg
 .rep 8
-		vpxor	16*i(arg3, %r11), \T5, \T2
+		vpxor	16*i(arg4, %r11), \T5, \T2
                 .if \ENC_DEC == ENC
                 vaesenclast     \T2, reg_j, reg_j
                 .else
                 vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg3, %r11), reg_j
-                vmovdqu \T3, 16*i(arg2, %r11)
+                vmovdqu 16*i(arg4, %r11), reg_j
+                vmovdqu \T3, 16*i(arg3, %r11)
                 .endif
 	i = (i+1)
 	j = (j+1)
@@ -1322,14 +1302,14 @@ _initial_blocks_done\@:
         vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
 	#######################################################################
                 .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
                 .endif
 
 	#######################################################################
@@ -1370,25 +1350,25 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM1, \T2
         vpxor           \XMM1, \T2, \T2
-        vmovdqa         HashKey_8(arg1), \T5
+        vmovdqu         HashKey_8(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM1, \T6
         vpclmulqdq      $0x00, \T5, \XMM1, \T7
 
-        vmovdqa         HashKey_8_k(arg1), \T3
+        vmovdqu         HashKey_8_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \XMM1
 
         ######################
 
         vpshufd         $0b01001110, \XMM2, \T2
         vpxor           \XMM2, \T2, \T2
-        vmovdqa         HashKey_7(arg1), \T5
+        vmovdqu         HashKey_7(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM2, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM2, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_7_k(arg1), \T3
+        vmovdqu         HashKey_7_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
         vpxor           \T2, \XMM1, \XMM1
 
@@ -1396,14 +1376,14 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM3, \T2
         vpxor           \XMM3, \T2, \T2
-        vmovdqa         HashKey_6(arg1), \T5
+        vmovdqu         HashKey_6(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM3, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM3, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_6_k(arg1), \T3
+        vmovdqu         HashKey_6_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
         vpxor           \T2, \XMM1, \XMM1
 
@@ -1411,14 +1391,14 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM4, \T2
         vpxor           \XMM4, \T2, \T2
-        vmovdqa         HashKey_5(arg1), \T5
+        vmovdqu         HashKey_5(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM4, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM4, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_5_k(arg1), \T3
+        vmovdqu         HashKey_5_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
         vpxor           \T2, \XMM1, \XMM1
 
@@ -1426,14 +1406,14 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM5, \T2
         vpxor           \XMM5, \T2, \T2
-        vmovdqa         HashKey_4(arg1), \T5
+        vmovdqu         HashKey_4(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM5, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM5, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_4_k(arg1), \T3
+        vmovdqu         HashKey_4_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
         vpxor           \T2, \XMM1, \XMM1
 
@@ -1441,14 +1421,14 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM6, \T2
         vpxor           \XMM6, \T2, \T2
-        vmovdqa         HashKey_3(arg1), \T5
+        vmovdqu         HashKey_3(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM6, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM6, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_3_k(arg1), \T3
+        vmovdqu         HashKey_3_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
         vpxor           \T2, \XMM1, \XMM1
 
@@ -1456,14 +1436,14 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM7, \T2
         vpxor           \XMM7, \T2, \T2
-        vmovdqa         HashKey_2(arg1), \T5
+        vmovdqu         HashKey_2(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM7, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM7, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_2_k(arg1), \T3
+        vmovdqu         HashKey_2_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
         vpxor           \T2, \XMM1, \XMM1
 
@@ -1471,14 +1451,14 @@ _initial_blocks_done\@:
 
         vpshufd         $0b01001110, \XMM8, \T2
         vpxor           \XMM8, \T2, \T2
-        vmovdqa         HashKey(arg1), \T5
+        vmovdqu         HashKey(arg2), \T5
         vpclmulqdq      $0x11, \T5, \XMM8, \T4
         vpxor           \T4, \T6, \T6
 
         vpclmulqdq      $0x00, \T5, \XMM8, \T4
         vpxor           \T4, \T7, \T7
 
-        vmovdqa         HashKey_k(arg1), \T3
+        vmovdqu         HashKey_k(arg2), \T3
         vpclmulqdq      $0x00, \T3, \T2, \T2
 
         vpxor           \T2, \XMM1, \XMM1
@@ -1527,6 +1507,7 @@ _initial_blocks_done\@:
 #############################################################
 #void   aesni_gcm_precomp_avx_gen2
 #        (gcm_data     *my_ctx_data,
+#         gcm_context_data *data,
 #        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen2)
@@ -1543,7 +1524,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
         sub     $VARIABLE_OFFSET, %rsp
         and     $~63, %rsp                  # align rsp to 64 bytes
 
-        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
+        vmovdqu  (arg3), %xmm6              # xmm6 = HashKey
 
         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
         ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
@@ -1560,7 +1541,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
         vpand    POLY(%rip), %xmm2, %xmm2
         vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
         #######################################################################
-        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
+        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
 
 
         PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
@@ -1577,6 +1558,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2)
 ###############################################################################
 #void   aesni_gcm_enc_avx_gen2(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
 #        const   u8 *in, /* Plaintext input */
 #        u64     plaintext_len, /* Length of data in Bytes for encryption. */
@@ -1598,6 +1580,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2)
 ###############################################################################
 #void   aesni_gcm_dec_avx_gen2(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
 #        const   u8 *in, /* Ciphertext input */
 #        u64     plaintext_len, /* Length of data in Bytes for encryption. */
@@ -1668,25 +1651,25 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
         # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
         vmovdqa  \HK, \T5
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
-        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
+        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
-        vmovdqa  \T5, HashKey_3(arg1)
+        vmovdqu  \T5, HashKey_3(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
-        vmovdqa  \T5, HashKey_4(arg1)
+        vmovdqu  \T5, HashKey_4(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
-        vmovdqa  \T5, HashKey_5(arg1)
+        vmovdqu  \T5, HashKey_5(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
-        vmovdqa  \T5, HashKey_6(arg1)
+        vmovdqu  \T5, HashKey_6(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
-        vmovdqa  \T5, HashKey_7(arg1)
+        vmovdqu  \T5, HashKey_7(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
-        vmovdqa  \T5, HashKey_8(arg1)
+        vmovdqu  \T5, HashKey_8(arg2)
 
 .endm
 
@@ -1696,15 +1679,15 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
 ## num_initial_blocks = b mod 4#
 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
 ## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
 
 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
 	i = (8-\num_initial_blocks)
 	j = 0
 	setreg
 
-	mov     arg6, %r10                       # r10 = AAD
-	mov     arg7, %r12                       # r12 = aadLen
+	mov     arg7, %r10                       # r10 = AAD
+	mov     arg8, %r12                       # r12 = aadLen
 
 
 	mov     %r12, %r11
@@ -1771,7 +1754,7 @@ _get_AAD_done\@:
 	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
-	mov     arg5, %rax                     # rax = *Y0
+	mov     arg6, %rax                     # rax = *Y0
 	vmovdqu (%rax), \CTR                   # CTR = Y0
 	vpshufb SHUF_MASK(%rip), \CTR, \CTR
 
@@ -1824,9 +1807,9 @@ _get_AAD_done\@:
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
-                vmovdqu (arg3, %r11), \T1
+                vmovdqu (arg4, %r11), \T1
                 vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
+                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
 						       # num_initial_blocks blocks
                 add     $16, %r11
 .if  \ENC_DEC == DEC
@@ -1928,58 +1911,58 @@ _get_AAD_done\@:
                 vaesenclast  \T_key, \XMM7, \XMM7
                 vaesenclast  \T_key, \XMM8, \XMM8
 
-                vmovdqu  (arg3, %r11), \T1
+                vmovdqu  (arg4, %r11), \T1
                 vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg2 , %r11)
+                vmovdqu  \XMM1, (arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM1
                 .endif
 
-                vmovdqu  16*1(arg3, %r11), \T1
+                vmovdqu  16*1(arg4, %r11), \T1
                 vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg2 , %r11)
+                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM2
                 .endif
 
-                vmovdqu  16*2(arg3, %r11), \T1
+                vmovdqu  16*2(arg4, %r11), \T1
                 vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg2 , %r11)
+                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM3
                 .endif
 
-                vmovdqu  16*3(arg3, %r11), \T1
+                vmovdqu  16*3(arg4, %r11), \T1
                 vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg2 , %r11)
+                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM4
                 .endif
 
-                vmovdqu  16*4(arg3, %r11), \T1
+                vmovdqu  16*4(arg4, %r11), \T1
                 vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg2 , %r11)
+                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM5
                 .endif
 
-                vmovdqu  16*5(arg3, %r11), \T1
+                vmovdqu  16*5(arg4, %r11), \T1
                 vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg2 , %r11)
+                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM6
                 .endif
 
-                vmovdqu  16*6(arg3, %r11), \T1
+                vmovdqu  16*6(arg4, %r11), \T1
                 vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg2 , %r11)
+                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM7
                 .endif
 
-                vmovdqu  16*7(arg3, %r11), \T1
+                vmovdqu  16*7(arg4, %r11), \T1
                 vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg2 , %r11)
+                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM8
                 .endif
@@ -2008,7 +1991,7 @@ _initial_blocks_done\@:
 
 # encrypt 8 blocks at a time
 # ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
 # r11 is the data offset value
 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
@@ -2094,7 +2077,7 @@ _initial_blocks_done\@:
 
         #######################################################################
 
-        vmovdqa         HashKey_8(arg1), \T5
+        vmovdqu         HashKey_8(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
         vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
         vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
@@ -2112,7 +2095,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP2(%rsp), \T1
-        vmovdqa         HashKey_7(arg1), \T5
+        vmovdqu         HashKey_7(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2138,7 +2121,7 @@ _initial_blocks_done\@:
         #######################################################################
 
         vmovdqa         TMP3(%rsp), \T1
-        vmovdqa         HashKey_6(arg1), \T5
+        vmovdqu         HashKey_6(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2162,7 +2145,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP4(%rsp), \T1
-        vmovdqa         HashKey_5(arg1), \T5
+        vmovdqu         HashKey_5(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2187,7 +2170,7 @@ _initial_blocks_done\@:
 
 
         vmovdqa         TMP5(%rsp), \T1
-        vmovdqa         HashKey_4(arg1), \T5
+        vmovdqu         HashKey_4(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2211,7 +2194,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP6(%rsp), \T1
-        vmovdqa         HashKey_3(arg1), \T5
+        vmovdqu         HashKey_3(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2235,7 +2218,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
         vmovdqa         TMP7(%rsp), \T1
-        vmovdqa         HashKey_2(arg1), \T5
+        vmovdqu         HashKey_2(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2262,7 +2245,7 @@ _initial_blocks_done\@:
                 vaesenc \T5, \XMM8, \XMM8
 
         vmovdqa         TMP8(%rsp), \T1
-        vmovdqa         HashKey(arg1), \T5
+        vmovdqu         HashKey(arg2), \T5
 
         vpclmulqdq      $0x00, \T5, \T1, \T3
         vpxor           \T3, \T7, \T7
@@ -2283,13 +2266,13 @@ _initial_blocks_done\@:
 	j = 1
 	setreg
 .rep 8
-		vpxor	16*i(arg3, %r11), \T5, \T2
+		vpxor	16*i(arg4, %r11), \T5, \T2
                 .if \ENC_DEC == ENC
                 vaesenclast     \T2, reg_j, reg_j
                 .else
                 vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg3, %r11), reg_j
-                vmovdqu \T3, 16*i(arg2, %r11)
+                vmovdqu 16*i(arg4, %r11), reg_j
+                vmovdqu \T3, 16*i(arg3, %r11)
                 .endif
 	i = (i+1)
 	j = (j+1)
@@ -2315,14 +2298,14 @@ _initial_blocks_done\@:
 	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
 	#######################################################################
                 .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
                 .endif
 
 	#######################################################################
@@ -2359,7 +2342,7 @@ _initial_blocks_done\@:
 
         ## Karatsuba Method
 
-        vmovdqa         HashKey_8(arg1), \T5
+        vmovdqu         HashKey_8(arg2), \T5
 
         vpshufd         $0b01001110, \XMM1, \T2
         vpshufd         $0b01001110, \T5, \T3
@@ -2373,7 +2356,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey_7(arg1), \T5
+        vmovdqu         HashKey_7(arg2), \T5
         vpshufd         $0b01001110, \XMM2, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM2, \T2, \T2
@@ -2391,7 +2374,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey_6(arg1), \T5
+        vmovdqu         HashKey_6(arg2), \T5
         vpshufd         $0b01001110, \XMM3, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM3, \T2, \T2
@@ -2409,7 +2392,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey_5(arg1), \T5
+        vmovdqu         HashKey_5(arg2), \T5
         vpshufd         $0b01001110, \XMM4, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM4, \T2, \T2
@@ -2427,7 +2410,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey_4(arg1), \T5
+        vmovdqu         HashKey_4(arg2), \T5
         vpshufd         $0b01001110, \XMM5, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM5, \T2, \T2
@@ -2445,7 +2428,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey_3(arg1), \T5
+        vmovdqu         HashKey_3(arg2), \T5
         vpshufd         $0b01001110, \XMM6, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM6, \T2, \T2
@@ -2463,7 +2446,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey_2(arg1), \T5
+        vmovdqu         HashKey_2(arg2), \T5
         vpshufd         $0b01001110, \XMM7, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM7, \T2, \T2
@@ -2481,7 +2464,7 @@ _initial_blocks_done\@:
 
         ######################
 
-        vmovdqa         HashKey(arg1), \T5
+        vmovdqu         HashKey(arg2), \T5
         vpshufd         $0b01001110, \XMM8, \T2
         vpshufd         $0b01001110, \T5, \T3
         vpxor           \XMM8, \T2, \T2
@@ -2537,6 +2520,7 @@ _initial_blocks_done\@:
 #############################################################
 #void   aesni_gcm_precomp_avx_gen4
 #        (gcm_data     *my_ctx_data,
+#         gcm_context_data *data,
 #        u8     *hash_subkey)# /* H, the Hash sub key input.
 #				Data starts on a 16-byte boundary. */
 #############################################################
@@ -2554,7 +2538,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
         sub     $VARIABLE_OFFSET, %rsp
         and     $~63, %rsp                    # align rsp to 64 bytes
 
-        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
+        vmovdqu  (arg3), %xmm6                # xmm6 = HashKey
 
         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
         ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
@@ -2571,7 +2555,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
         vpand    POLY(%rip), %xmm2, %xmm2
         vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
         #######################################################################
-        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
+        vmovdqu  %xmm6, HashKey(arg2)         # store HashKey<<1 mod poly
 
 
         PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
@@ -2589,6 +2573,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
 ###############################################################################
 #void   aesni_gcm_enc_avx_gen4(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
 #        const   u8 *in, /* Plaintext input */
 #        u64     plaintext_len, /* Length of data in Bytes for encryption. */
@@ -2610,6 +2595,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4)
 ###############################################################################
 #void   aesni_gcm_dec_avx_gen4(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
 #        const   u8 *in, /* Ciphertext input */
 #        u64     plaintext_len, /* Length of data in Bytes for encryption. */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 661f7daf43da..d8b8cb33f608 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -84,7 +84,7 @@ struct gcm_context_data {
 	u8 current_counter[GCM_BLOCK_LEN];
 	u64 partial_block_len;
 	u64 unused;
-	u8 hash_keys[GCM_BLOCK_LEN * 8];
+	u8 hash_keys[GCM_BLOCK_LEN * 16];
 };
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -187,14 +187,18 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
  * gcm_data *my_ctx_data, context data
  * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
  */
-asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data,
+					   struct gcm_context_data *gdata,
+					   u8 *hash_subkey);
 
-asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long plaintext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long ciphertext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
@@ -211,9 +215,9 @@ static void aesni_gcm_enc_avx(void *ctx,
 			plaintext_len, iv, hash_subkey, aad,
 			aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
+				       aad, aad_len, auth_tag, auth_tag_len);
 	}
 }
 
@@ -229,9 +233,9 @@ static void aesni_gcm_dec_avx(void *ctx,
 			ciphertext_len, iv, hash_subkey, aad,
 			aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
+				       aad, aad_len, auth_tag, auth_tag_len);
 	}
 }
 #endif
@@ -242,14 +246,18 @@ static void aesni_gcm_dec_avx(void *ctx,
  * gcm_data *my_ctx_data, context data
  * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
  */
-asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data,
+					   struct gcm_context_data *gdata,
+					   u8 *hash_subkey);
 
-asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long plaintext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long ciphertext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
@@ -266,13 +274,13 @@ static void aesni_gcm_enc_avx2(void *ctx,
 			      plaintext_len, iv, hash_subkey, aad,
 			      aad_len, auth_tag, auth_tag_len);
 	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
+				       aad, aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
-		aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
+		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
+		aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv,
+				       aad, aad_len, auth_tag, auth_tag_len);
 	}
 }
 
@@ -288,13 +296,13 @@ static void aesni_gcm_dec_avx2(void *ctx,
 			      ciphertext_len, iv, hash_subkey,
 			      aad, aad_len, auth_tag, auth_tag_len);
 	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
+				       aad, aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
-		aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
+		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
+		aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv,
+				       aad, aad_len, auth_tag, auth_tag_len);
 	}
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 2426f64bc51fc86951b735d2247d6eb89259d580 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:57:12 +0000
Subject: crypto: aesni - Macro-ify func save/restore

Macro-ify function save and restore.  These will be used in new functions
added for scatter/gather update operations.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 94 ++++++++++++--------------------
 1 file changed, 36 insertions(+), 58 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 284f1b8b88fc..dd895f69399b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -247,6 +247,30 @@ VARIABLE_OFFSET = 16*8
 # Utility Macros
 ################################
 
+.macro FUNC_SAVE
+        #the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+
+        mov     %rsp, %r14
+
+
+
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                    # align rsp to 64 bytes
+.endm
+
+.macro FUNC_RESTORE
+        mov     %r14, %rsp
+
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+.endm
+
 # Encryption of a single block
 .macro ENCRYPT_SINGLE_BLOCK XMM0
                 vpxor    (arg1), \XMM0, \XMM0
@@ -264,22 +288,6 @@ VARIABLE_OFFSET = 16*8
 # clobbering all xmm registers
 # clobbering r10, r11, r12, r13, r14, r15
 .macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC
-
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
-
-        mov     %rsp, %r14
-
-
-
-
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                  # align rsp to 64 bytes
-
-
         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
 
         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
@@ -566,12 +574,6 @@ _T_16\@:
         vmovdqu %xmm9, (%r10)
 
 _return_T_done\@:
-        mov     %r14, %rsp
-
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
 .endm
 
 #ifdef CONFIG_AS_AVX
@@ -1511,18 +1513,7 @@ _initial_blocks_done\@:
 #        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen2)
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
-
-        mov     %rsp, %r14
-
-
-
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                  # align rsp to 64 bytes
+        FUNC_SAVE
 
         vmovdqu  (arg3), %xmm6              # xmm6 = HashKey
 
@@ -1546,12 +1537,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
 
         PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 
-        mov     %r14, %rsp
-
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
+        FUNC_RESTORE
         ret
 ENDPROC(aesni_gcm_precomp_avx_gen2)
 
@@ -1573,7 +1559,9 @@ ENDPROC(aesni_gcm_precomp_avx_gen2)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_enc_avx_gen2)
+        FUNC_SAVE
         GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
+        FUNC_RESTORE
 	ret
 ENDPROC(aesni_gcm_enc_avx_gen2)
 
@@ -1595,7 +1583,9 @@ ENDPROC(aesni_gcm_enc_avx_gen2)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_dec_avx_gen2)
+        FUNC_SAVE
         GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC
+        FUNC_RESTORE
 	ret
 ENDPROC(aesni_gcm_dec_avx_gen2)
 #endif /* CONFIG_AS_AVX */
@@ -2525,18 +2515,7 @@ _initial_blocks_done\@:
 #				Data starts on a 16-byte boundary. */
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen4)
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
-
-        mov     %rsp, %r14
-
-
-
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                    # align rsp to 64 bytes
+        FUNC_SAVE
 
         vmovdqu  (arg3), %xmm6                # xmm6 = HashKey
 
@@ -2560,12 +2539,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
 
         PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 
-        mov     %r14, %rsp
-
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
+        FUNC_RESTORE
         ret
 ENDPROC(aesni_gcm_precomp_avx_gen4)
 
@@ -2588,7 +2562,9 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_enc_avx_gen4)
+        FUNC_SAVE
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC
+        FUNC_RESTORE
 	ret
 ENDPROC(aesni_gcm_enc_avx_gen4)
 
@@ -2610,7 +2586,9 @@ ENDPROC(aesni_gcm_enc_avx_gen4)
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
 ENTRY(aesni_gcm_dec_avx_gen4)
+        FUNC_SAVE
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC
+        FUNC_RESTORE
 	ret
 ENDPROC(aesni_gcm_dec_avx_gen4)
 
-- 
cgit v1.2.3-59-g8ed1b


From 5350b0f563433dacc134214a452fd316b36251d6 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:57:36 +0000
Subject: crypto: aesni - support 256 byte keys in avx asm

Add support for 192/256-bit keys using the avx gcm/aes routines.
The sse routines were previously updated in e31ac32d3b (Add support
for 192 & 256 bit keys to AESNI RFC4106).

Instead of adding an additional loop in the hotpath as in e31ac32d3b,
this diff instead generates separate versions of the code using macros,
and the entry routines choose which version once.   This results
in a 5% performance improvement vs. adding a loop to the hot path.
This is the same strategy chosen by the intel isa-l_crypto library.

The key size checks are removed from the c code where appropriate.

Note that this diff depends on using gcm_context_data - 256 bit keys
require 16 HashKeys + 15 expanded keys, which is larger than
struct crypto_aes_ctx, so they are stored in struct gcm_context_data.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 188 +++++++++++++++++++++++--------
 arch/x86/crypto/aesni-intel_glue.c       |  18 +--
 2 files changed, 145 insertions(+), 61 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index dd895f69399b..2aa11c503bb9 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -209,6 +209,7 @@ HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
 #define arg8 STACK_OFFSET+8*2(%r14)
 #define arg9 STACK_OFFSET+8*3(%r14)
 #define arg10 STACK_OFFSET+8*4(%r14)
+#define keysize 2*15*16(arg1)
 
 i = 0
 j = 0
@@ -272,22 +273,22 @@ VARIABLE_OFFSET = 16*8
 .endm
 
 # Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK XMM0
+.macro ENCRYPT_SINGLE_BLOCK REP XMM0
                 vpxor    (arg1), \XMM0, \XMM0
-		i = 1
-		setreg
-.rep 9
+               i = 1
+               setreg
+.rep \REP
                 vaesenc  16*i(arg1), \XMM0, \XMM0
-		i = (i+1)
-		setreg
+               i = (i+1)
+               setreg
 .endr
-                vaesenclast 16*10(arg1), \XMM0, \XMM0
+                vaesenclast 16*i(arg1), \XMM0, \XMM0
 .endm
 
 # combined for GCM encrypt and decrypt functions
 # clobbering all xmm registers
 # clobbering r10, r11, r12, r13, r14, r15
-.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC
+.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
 
         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
@@ -314,42 +315,42 @@ VARIABLE_OFFSET = 16*8
         jmp     _initial_num_blocks_is_1\@
 
 _initial_num_blocks_is_7\@:
-        \INITIAL_BLOCKS  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*7, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_6\@:
-        \INITIAL_BLOCKS  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*6, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_5\@:
-        \INITIAL_BLOCKS  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*5, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_4\@:
-        \INITIAL_BLOCKS  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*4, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_3\@:
-        \INITIAL_BLOCKS  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*3, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_2\@:
-        \INITIAL_BLOCKS  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*2, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_1\@:
-        \INITIAL_BLOCKS  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
         sub     $16*1, %r13
         jmp     _initial_blocks_encrypted\@
 
 _initial_num_blocks_is_0\@:
-        \INITIAL_BLOCKS  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 
 
 _initial_blocks_encrypted\@:
@@ -374,7 +375,7 @@ _encrypt_by_8_new\@:
 
 
         add     $8, %r15b
-        \GHASH_8_ENCRYPT_8_PARALLEL      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
         add     $128, %r11
         sub     $128, %r13
         jne     _encrypt_by_8_new\@
@@ -385,7 +386,7 @@ _encrypt_by_8_new\@:
 _encrypt_by_8\@:
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
         add     $8, %r15b
-        \GHASH_8_ENCRYPT_8_PARALLEL      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
         add     $128, %r11
         sub     $128, %r13
@@ -414,7 +415,7 @@ _zero_cipher_left\@:
 
         vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
 
         sub     $16, %r11
         add     %r13, %r11
@@ -440,7 +441,7 @@ _only_less_than_16\@:
 
         vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
 
 
         lea     SHIFT_MASK+16(%rip), %r12
@@ -525,7 +526,7 @@ _multiple_of_16_bytes\@:
         mov     arg6, %rax                           # rax = *Y0
         vmovdqu (%rax), %xmm9                        # xmm9 = Y0
 
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
+        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
 
         vpxor   %xmm14, %xmm9, %xmm9
 
@@ -690,7 +691,7 @@ _return_T_done\@:
 ## r10, r11, r12, rax are clobbered
 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
 
-.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
 	i = (8-\num_initial_blocks)
 	j = 0
 	setreg
@@ -786,10 +787,10 @@ _get_AAD_done\@:
 	setreg
 .endr
 
-	j = 1
-	setreg
-.rep 9
-	vmovdqa  16*j(arg1), \T_key
+       j = 1
+       setreg
+.rep \REP
+       vmovdqa  16*j(arg1), \T_key
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
@@ -798,12 +799,11 @@ _get_AAD_done\@:
 	setreg
 .endr
 
-	j = (j+1)
-	setreg
+       j = (j+1)
+       setreg
 .endr
 
-
-	vmovdqa  16*10(arg1), \T_key
+	vmovdqa  16*j(arg1), \T_key
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
@@ -891,9 +891,9 @@ _get_AAD_done\@:
                 vpxor    \T_key, \XMM7, \XMM7
                 vpxor    \T_key, \XMM8, \XMM8
 
-		i = 1
-		setreg
-.rep    9       # do 9 rounds
+               i = 1
+               setreg
+.rep    \REP       # do REP rounds
                 vmovdqa  16*i(arg1), \T_key
                 vaesenc  \T_key, \XMM1, \XMM1
                 vaesenc  \T_key, \XMM2, \XMM2
@@ -903,11 +903,10 @@ _get_AAD_done\@:
                 vaesenc  \T_key, \XMM6, \XMM6
                 vaesenc  \T_key, \XMM7, \XMM7
                 vaesenc  \T_key, \XMM8, \XMM8
-		i = (i+1)
-		setreg
+               i = (i+1)
+               setreg
 .endr
 
-
                 vmovdqa  16*i(arg1), \T_key
                 vaesenclast  \T_key, \XMM1, \XMM1
                 vaesenclast  \T_key, \XMM2, \XMM2
@@ -996,7 +995,7 @@ _initial_blocks_done\@:
 # ghash the 8 previously encrypted ciphertext blocks
 # arg1, arg3, arg4 are used as pointers only, not modified
 # r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
         vmovdqa \XMM1, \T2
         vmovdqa \XMM2, TMP2(%rsp)
@@ -1262,6 +1261,24 @@ _initial_blocks_done\@:
 
                 vmovdqu 16*10(arg1), \T5
 
+        i = 11
+        setreg
+.rep (\REP-9)
+
+        vaesenc \T5, \XMM1, \XMM1
+        vaesenc \T5, \XMM2, \XMM2
+        vaesenc \T5, \XMM3, \XMM3
+        vaesenc \T5, \XMM4, \XMM4
+        vaesenc \T5, \XMM5, \XMM5
+        vaesenc \T5, \XMM6, \XMM6
+        vaesenc \T5, \XMM7, \XMM7
+        vaesenc \T5, \XMM8, \XMM8
+
+        vmovdqu 16*i(arg1), \T5
+        i = i + 1
+        setreg
+.endr
+
 	i = 0
 	j = 1
 	setreg
@@ -1560,9 +1577,23 @@ ENDPROC(aesni_gcm_precomp_avx_gen2)
 ###############################################################################
 ENTRY(aesni_gcm_enc_avx_gen2)
         FUNC_SAVE
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
+        mov     keysize, %eax
+        cmp     $32, %eax
+        je      key_256_enc
+        cmp     $16, %eax
+        je      key_128_enc
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
         FUNC_RESTORE
-	ret
+        ret
+key_128_enc:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
+        FUNC_RESTORE
+        ret
+key_256_enc:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
+        FUNC_RESTORE
+        ret
 ENDPROC(aesni_gcm_enc_avx_gen2)
 
 ###############################################################################
@@ -1584,9 +1615,23 @@ ENDPROC(aesni_gcm_enc_avx_gen2)
 ###############################################################################
 ENTRY(aesni_gcm_dec_avx_gen2)
         FUNC_SAVE
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC
+        mov     keysize,%eax
+        cmp     $32, %eax
+        je      key_256_dec
+        cmp     $16, %eax
+        je      key_128_dec
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
         FUNC_RESTORE
-	ret
+        ret
+key_128_dec:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
+        FUNC_RESTORE
+        ret
+key_256_dec:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
+        FUNC_RESTORE
+        ret
 ENDPROC(aesni_gcm_dec_avx_gen2)
 #endif /* CONFIG_AS_AVX */
 
@@ -1671,7 +1716,7 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
 ## r10, r11, r12, rax are clobbered
 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
 
-.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
 	i = (8-\num_initial_blocks)
 	j = 0
 	setreg
@@ -1770,7 +1815,7 @@ _get_AAD_done\@:
 
 	j = 1
 	setreg
-.rep 9
+.rep \REP
 	vmovdqa  16*j(arg1), \T_key
 	i = (9-\num_initial_blocks)
 	setreg
@@ -1785,7 +1830,7 @@ _get_AAD_done\@:
 .endr
 
 
-	vmovdqa  16*10(arg1), \T_key
+	vmovdqa  16*j(arg1), \T_key
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
@@ -1876,7 +1921,7 @@ _get_AAD_done\@:
 
 		i = 1
 		setreg
-.rep    9       # do 9 rounds
+.rep    \REP       # do REP rounds
                 vmovdqa  16*i(arg1), \T_key
                 vaesenc  \T_key, \XMM1, \XMM1
                 vaesenc  \T_key, \XMM2, \XMM2
@@ -1983,7 +2028,7 @@ _initial_blocks_done\@:
 # ghash the 8 previously encrypted ciphertext blocks
 # arg1, arg3, arg4 are used as pointers only, not modified
 # r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
         vmovdqa \XMM1, \T2
         vmovdqa \XMM2, TMP2(%rsp)
@@ -2252,6 +2297,23 @@ _initial_blocks_done\@:
 
                 vmovdqu 16*10(arg1), \T5
 
+        i = 11
+        setreg
+.rep (\REP-9)
+        vaesenc \T5, \XMM1, \XMM1
+        vaesenc \T5, \XMM2, \XMM2
+        vaesenc \T5, \XMM3, \XMM3
+        vaesenc \T5, \XMM4, \XMM4
+        vaesenc \T5, \XMM5, \XMM5
+        vaesenc \T5, \XMM6, \XMM6
+        vaesenc \T5, \XMM7, \XMM7
+        vaesenc \T5, \XMM8, \XMM8
+
+        vmovdqu 16*i(arg1), \T5
+        i = i + 1
+        setreg
+.endr
+
 	i = 0
 	j = 1
 	setreg
@@ -2563,7 +2625,21 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
 ###############################################################################
 ENTRY(aesni_gcm_enc_avx_gen4)
         FUNC_SAVE
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC
+        mov     keysize,%eax
+        cmp     $32, %eax
+        je      key_256_enc4
+        cmp     $16, %eax
+        je      key_128_enc4
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
+        FUNC_RESTORE
+	ret
+key_128_enc4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
+        FUNC_RESTORE
+	ret
+key_256_enc4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
         FUNC_RESTORE
 	ret
 ENDPROC(aesni_gcm_enc_avx_gen4)
@@ -2587,9 +2663,23 @@ ENDPROC(aesni_gcm_enc_avx_gen4)
 ###############################################################################
 ENTRY(aesni_gcm_dec_avx_gen4)
         FUNC_SAVE
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC
+        mov     keysize,%eax
+        cmp     $32, %eax
+        je      key_256_dec4
+        cmp     $16, %eax
+        je      key_128_dec4
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
         FUNC_RESTORE
-	ret
+        ret
+key_128_dec4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
+        FUNC_RESTORE
+        ret
+key_256_dec4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
+        FUNC_RESTORE
+        ret
 ENDPROC(aesni_gcm_dec_avx_gen4)
 
 #endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index d8b8cb33f608..7d1259feb0f9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -209,8 +209,7 @@ static void aesni_gcm_enc_avx(void *ctx,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
+	if (plaintext_len < AVX_GEN2_OPTSIZE) {
 		aesni_gcm_enc(ctx, data, out, in,
 			plaintext_len, iv, hash_subkey, aad,
 			aad_len, auth_tag, auth_tag_len);
@@ -227,8 +226,7 @@ static void aesni_gcm_dec_avx(void *ctx,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
+	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
 		aesni_gcm_dec(ctx, data, out, in,
 			ciphertext_len, iv, hash_subkey, aad,
 			aad_len, auth_tag, auth_tag_len);
@@ -268,8 +266,7 @@ static void aesni_gcm_enc_avx2(void *ctx,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
+	if (plaintext_len < AVX_GEN2_OPTSIZE) {
 		aesni_gcm_enc(ctx, data, out, in,
 			      plaintext_len, iv, hash_subkey, aad,
 			      aad_len, auth_tag, auth_tag_len);
@@ -290,8 +287,7 @@ static void aesni_gcm_dec_avx2(void *ctx,
 			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len)
 {
-       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
+	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
 		aesni_gcm_dec(ctx, data, out, in,
 			      ciphertext_len, iv, hash_subkey,
 			      aad, aad_len, auth_tag, auth_tag_len);
@@ -928,8 +924,7 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
 	struct scatter_walk dst_sg_walk = {};
 	struct gcm_context_data data AESNI_ALIGN_ATTR;
 
-	if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
-		aesni_gcm_enc_tfm == aesni_gcm_enc ||
+	if (aesni_gcm_enc_tfm == aesni_gcm_enc ||
 		req->cryptlen < AVX_GEN2_OPTSIZE) {
 		return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
 					  aes_ctx);
@@ -1000,8 +995,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
 	struct gcm_context_data data AESNI_ALIGN_ATTR;
 	int retval = 0;
 
-	if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
-		aesni_gcm_enc_tfm == aesni_gcm_enc ||
+	if (aesni_gcm_enc_tfm == aesni_gcm_enc ||
 		req->cryptlen < AVX_GEN2_OPTSIZE) {
 		return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
 					  aes_ctx);
-- 
cgit v1.2.3-59-g8ed1b


From e377bedb09d6970ad27d7714b0a6365ee7e4d732 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:57:49 +0000
Subject: crypto: aesni - Add GCM_COMPLETE macro

Merge encode and decode tag calculations in GCM_COMPLETE macro.
Scatter/gather routines will call this once at the end of encryption
or decryption.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 2aa11c503bb9..8e9ae4b26118 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -510,6 +510,14 @@ _less_than_8_bytes_left\@:
         #############################
 
 _multiple_of_16_bytes\@:
+        GCM_COMPLETE \GHASH_MUL \REP
+.endm
+
+
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE GHASH_MUL REP
         mov     arg8, %r12                           # r12 = aadLen (number of bytes)
         shl     $3, %r12                             # convert into number of bits
         vmovd   %r12d, %xmm15                        # len(A) in xmm15
-- 
cgit v1.2.3-59-g8ed1b


From 38003cd26c9f59da77d98927fb9af58732da207a Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:58:19 +0000
Subject: crypto: aesni - Split AAD hash calculation to separate macro

AAD hash only needs to be calculated once for each scatter/gather operation.
Move it to its own macro, and call it from GCM_INIT instead of
INITIAL_BLOCKS.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 228 +++++++++++++------------------
 arch/x86/crypto/aesni-intel_glue.c       |  28 ++--
 2 files changed, 115 insertions(+), 141 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 8e9ae4b26118..305abece93ad 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -182,6 +182,14 @@ aad_shift_arr:
 .text
 
 
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
 HashKey        = 16*6   # store HashKey <<1 mod poly here
 HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
 HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
@@ -585,6 +593,74 @@ _T_16\@:
 _return_T_done\@:
 .endm
 
+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
+
+	mov     \AAD, %r10                      # r10 = AAD
+	mov     \AADLEN, %r12                      # r12 = aadLen
+
+
+	mov     %r12, %r11
+
+	vpxor   \T8, \T8, \T8
+	vpxor   \T7, \T7, \T7
+	cmp     $16, %r11
+	jl      _get_AAD_rest8\@
+_get_AAD_blocks\@:
+	vmovdqu (%r10), \T7
+	vpshufb SHUF_MASK(%rip), \T7, \T7
+	vpxor   \T7, \T8, \T8
+	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
+	add     $16, %r10
+	sub     $16, %r12
+	sub     $16, %r11
+	cmp     $16, %r11
+	jge     _get_AAD_blocks\@
+	vmovdqu \T8, \T7
+	cmp     $0, %r11
+	je      _get_AAD_done\@
+
+	vpxor   \T7, \T7, \T7
+
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some CT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+	cmp     $4, %r11
+	jle     _get_AAD_rest4\@
+	movq    (%r10), \T1
+	add     $8, %r10
+	sub     $8, %r11
+	vpslldq $8, \T1, \T1
+	vpsrldq $8, \T7, \T7
+	vpxor   \T1, \T7, \T7
+	jmp     _get_AAD_rest8\@
+_get_AAD_rest4\@:
+	cmp     $0, %r11
+	jle      _get_AAD_rest0\@
+	mov     (%r10), %eax
+	movq    %rax, \T1
+	add     $4, %r10
+	sub     $4, %r11
+	vpslldq $12, \T1, \T1
+	vpsrldq $4, \T7, \T7
+	vpxor   \T1, \T7, \T7
+_get_AAD_rest0\@:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq    %r12, %r11
+	salq    $4, %r11
+	vmovdqu  aad_shift_arr(%r11), \T1
+	vpshufb \T1, \T7, \T7
+_get_AAD_rest_final\@:
+	vpshufb SHUF_MASK(%rip), \T7, \T7
+	vpxor   \T8, \T7, \T7
+	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+        vmovdqu \T7, AadHash(arg2)
+.endm
+
 #ifdef CONFIG_AS_AVX
 ###############################################################################
 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -701,72 +777,9 @@ _return_T_done\@:
 
 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
 	i = (8-\num_initial_blocks)
-	j = 0
 	setreg
+        vmovdqu AadHash(arg2), reg_i
 
-	mov     arg7, %r10                      # r10 = AAD
-	mov     arg8, %r12                      # r12 = aadLen
-
-
-	mov     %r12, %r11
-
-	vpxor   reg_j, reg_j, reg_j
-	vpxor   reg_i, reg_i, reg_i
-	cmp     $16, %r11
-	jl      _get_AAD_rest8\@
-_get_AAD_blocks\@:
-	vmovdqu (%r10), reg_i
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_i, reg_j, reg_j
-	GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
-	add     $16, %r10
-	sub     $16, %r12
-	sub     $16, %r11
-	cmp     $16, %r11
-	jge     _get_AAD_blocks\@
-	vmovdqu reg_j, reg_i
-	cmp     $0, %r11
-	je      _get_AAD_done\@
-
-	vpxor   reg_i, reg_i, reg_i
-
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
-	cmp     $4, %r11
-	jle     _get_AAD_rest4\@
-	movq    (%r10), \T1
-	add     $8, %r10
-	sub     $8, %r11
-	vpslldq $8, \T1, \T1
-	vpsrldq $8, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-	jmp     _get_AAD_rest8\@
-_get_AAD_rest4\@:
-	cmp     $0, %r11
-	jle      _get_AAD_rest0\@
-	mov     (%r10), %eax
-	movq    %rax, \T1
-	add     $4, %r10
-	sub     $4, %r11
-	vpslldq $12, \T1, \T1
-	vpsrldq $4, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-_get_AAD_rest0\@:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq    %r12, %r11
-	salq    $4, %r11
-	movdqu  aad_shift_arr(%r11), \T1
-	vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_j, reg_i, reg_i
-	GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
 	# initialize the data pointer offset as zero
 	xor     %r11d, %r11d
 
@@ -1535,7 +1548,13 @@ _initial_blocks_done\@:
 #void   aesni_gcm_precomp_avx_gen2
 #        (gcm_data     *my_ctx_data,
 #         gcm_context_data *data,
-#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			Initialisation Vector (from IPSec ESP Payload)
+#			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen2)
         FUNC_SAVE
@@ -1560,6 +1579,8 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
         vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
 
 
+        CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
+
         PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 
         FUNC_RESTORE
@@ -1716,7 +1737,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
 
 .endm
 
-
 ## if a = number of total plaintext bytes
 ## b = floor(a/16)
 ## num_initial_blocks = b mod 4#
@@ -1726,73 +1746,9 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
 
 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
 	i = (8-\num_initial_blocks)
-	j = 0
 	setreg
+        vmovdqu AadHash(arg2), reg_i
 
-	mov     arg7, %r10                       # r10 = AAD
-	mov     arg8, %r12                       # r12 = aadLen
-
-
-	mov     %r12, %r11
-
-	vpxor   reg_j, reg_j, reg_j
-	vpxor   reg_i, reg_i, reg_i
-
-	cmp     $16, %r11
-	jl      _get_AAD_rest8\@
-_get_AAD_blocks\@:
-	vmovdqu (%r10), reg_i
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_i, reg_j, reg_j
-	GHASH_MUL_AVX2      reg_j, \T2, \T1, \T3, \T4, \T5, \T6
-	add     $16, %r10
-	sub     $16, %r12
-	sub     $16, %r11
-	cmp     $16, %r11
-	jge     _get_AAD_blocks\@
-	vmovdqu reg_j, reg_i
-	cmp     $0, %r11
-	je      _get_AAD_done\@
-
-	vpxor   reg_i, reg_i, reg_i
-
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
-	cmp     $4, %r11
-	jle     _get_AAD_rest4\@
-	movq    (%r10), \T1
-	add     $8, %r10
-	sub     $8, %r11
-	vpslldq $8, \T1, \T1
-	vpsrldq $8, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-	jmp     _get_AAD_rest8\@
-_get_AAD_rest4\@:
-	cmp     $0, %r11
-	jle     _get_AAD_rest0\@
-	mov     (%r10), %eax
-	movq    %rax, \T1
-	add     $4, %r10
-	sub     $4, %r11
-	vpslldq $12, \T1, \T1
-	vpsrldq $4, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-_get_AAD_rest0\@:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq    %r12, %r11
-	salq    $4, %r11
-	movdqu  aad_shift_arr(%r11), \T1
-	vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_j, reg_i, reg_i
-	GHASH_MUL_AVX2      reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
 	# initialize the data pointer offset as zero
 	xor     %r11d, %r11d
 
@@ -2581,8 +2537,13 @@ _initial_blocks_done\@:
 #void   aesni_gcm_precomp_avx_gen4
 #        (gcm_data     *my_ctx_data,
 #         gcm_context_data *data,
-#        u8     *hash_subkey)# /* H, the Hash sub key input.
-#				Data starts on a 16-byte boundary. */
+#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			Initialisation Vector (from IPSec ESP Payload)
+#			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen4)
         FUNC_SAVE
@@ -2606,6 +2567,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
         #######################################################################
         vmovdqu  %xmm6, HashKey(arg2)         # store HashKey<<1 mod poly
 
+        CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
 
         PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 7d1259feb0f9..2648842f1c3f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -189,7 +189,10 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
  */
 asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data,
 					   struct gcm_context_data *gdata,
-					   u8 *hash_subkey);
+					   u8 *hash_subkey,
+					   u8 *iv,
+					   const u8 *aad,
+					   unsigned long aad_len);
 
 asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
 				struct gcm_context_data *gdata, u8 *out,
@@ -214,7 +217,8 @@ static void aesni_gcm_enc_avx(void *ctx,
 			plaintext_len, iv, hash_subkey, aad,
 			aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+					   aad, aad_len);
 		aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
 				       aad, aad_len, auth_tag, auth_tag_len);
 	}
@@ -231,7 +235,8 @@ static void aesni_gcm_dec_avx(void *ctx,
 			ciphertext_len, iv, hash_subkey, aad,
 			aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+					   aad, aad_len);
 		aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
 				       aad, aad_len, auth_tag, auth_tag_len);
 	}
@@ -246,7 +251,10 @@ static void aesni_gcm_dec_avx(void *ctx,
  */
 asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data,
 					   struct gcm_context_data *gdata,
-					   u8 *hash_subkey);
+					   u8 *hash_subkey,
+					   u8 *iv,
+					   const u8 *aad,
+					   unsigned long aad_len);
 
 asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
 				struct gcm_context_data *gdata, u8 *out,
@@ -271,11 +279,13 @@ static void aesni_gcm_enc_avx2(void *ctx,
 			      plaintext_len, iv, hash_subkey, aad,
 			      aad_len, auth_tag, auth_tag_len);
 	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+					   aad, aad_len);
 		aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
 				       aad, aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
+		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv,
+					   aad, aad_len);
 		aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv,
 				       aad, aad_len, auth_tag, auth_tag_len);
 	}
@@ -292,11 +302,13 @@ static void aesni_gcm_dec_avx2(void *ctx,
 			      ciphertext_len, iv, hash_subkey,
 			      aad, aad_len, auth_tag, auth_tag_len);
 	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
+		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
+					   aad, aad_len);
 		aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
 				       aad, aad_len, auth_tag, auth_tag_len);
 	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
+		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv,
+					   aad, aad_len);
 		aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv,
 				       aad, aad_len, auth_tag, auth_tag_len);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1cb1bcbb567d10bdd9de9d03964c5e2958f0d111 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:58:38 +0000
Subject: crypto: aesni - Merge avx precompute functions

The precompute functions differ only by the sub-macros
they call, merge them to a single macro.   Later diffs
add more code to fill in the gcm_context_data structure,
this allows changes in a single place.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 76 ++++++++++++--------------------
 1 file changed, 27 insertions(+), 49 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 305abece93ad..e347ba61db65 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -661,6 +661,31 @@ _get_AAD_done\@:
         vmovdqu \T7, AadHash(arg2)
 .endm
 
+.macro INIT GHASH_MUL PRECOMPUTE
+        vmovdqu  (arg3), %xmm6              # xmm6 = HashKey
+
+        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
+        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+        vmovdqa  %xmm6, %xmm2
+        vpsllq   $1, %xmm6, %xmm6
+        vpsrlq   $63, %xmm2, %xmm2
+        vmovdqa  %xmm2, %xmm1
+        vpslldq  $8, %xmm2, %xmm2
+        vpsrldq  $8, %xmm1, %xmm1
+        vpor     %xmm2, %xmm6, %xmm6
+        #reduction
+        vpshufd  $0b00100100, %xmm1, %xmm2
+        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+        vpand    POLY(%rip), %xmm2, %xmm2
+        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
+        #######################################################################
+        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
+
+        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
+
+        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+.endm
+
 #ifdef CONFIG_AS_AVX
 ###############################################################################
 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -1558,31 +1583,7 @@ _initial_blocks_done\@:
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen2)
         FUNC_SAVE
-
-        vmovdqu  (arg3), %xmm6              # xmm6 = HashKey
-
-        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
-        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
-        vmovdqa  %xmm6, %xmm2
-        vpsllq   $1, %xmm6, %xmm6
-        vpsrlq   $63, %xmm2, %xmm2
-        vmovdqa  %xmm2, %xmm1
-        vpslldq  $8, %xmm2, %xmm2
-        vpsrldq  $8, %xmm1, %xmm1
-        vpor     %xmm2, %xmm6, %xmm6
-        #reduction
-        vpshufd  $0b00100100, %xmm1, %xmm2
-        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
-        vpand    POLY(%rip), %xmm2, %xmm2
-        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
-        #######################################################################
-        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
-
-
-        CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
-
-        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
+        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
         FUNC_RESTORE
         ret
 ENDPROC(aesni_gcm_precomp_avx_gen2)
@@ -2547,30 +2548,7 @@ _initial_blocks_done\@:
 #############################################################
 ENTRY(aesni_gcm_precomp_avx_gen4)
         FUNC_SAVE
-
-        vmovdqu  (arg3), %xmm6                # xmm6 = HashKey
-
-        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
-        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
-        vmovdqa  %xmm6, %xmm2
-        vpsllq   $1, %xmm6, %xmm6
-        vpsrlq   $63, %xmm2, %xmm2
-        vmovdqa  %xmm2, %xmm1
-        vpslldq  $8, %xmm2, %xmm2
-        vpsrldq  $8, %xmm1, %xmm1
-        vpor     %xmm2, %xmm6, %xmm6
-        #reduction
-        vpshufd  $0b00100100, %xmm1, %xmm2
-        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
-        vpand    POLY(%rip), %xmm2, %xmm2
-        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
-        #######################################################################
-        vmovdqu  %xmm6, HashKey(arg2)         # store HashKey<<1 mod poly
-
-        CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
-
-        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
+        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
         FUNC_RESTORE
         ret
 ENDPROC(aesni_gcm_precomp_avx_gen4)
-- 
cgit v1.2.3-59-g8ed1b


From a44b419fe5aee3bfe12c88698a023cb5c6067985 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:58:56 +0000
Subject: crypto: aesni - Fill in new context data structures

Fill in aadhash, aadlen, pblocklen, curcount with appropriate values.
pblocklen, aadhash, and pblockenckey are also updated at the end
of each scatter/gather operation, to be carried over to the next
operation.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 51 +++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index e347ba61db65..0a9cdcfdd987 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -297,7 +297,9 @@ VARIABLE_OFFSET = 16*8
 # clobbering all xmm registers
 # clobbering r10, r11, r12, r13, r14, r15
 .macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
+        vmovdqu AadHash(arg2), %xmm8
         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
+        add arg5, InLen(arg2)
 
         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
         and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
@@ -410,6 +412,9 @@ _eight_cipher_left\@:
 
 
 _zero_cipher_left\@:
+        vmovdqu %xmm14, AadHash(arg2)
+        vmovdqu %xmm9, CurCount(arg2)
+
         cmp     $16, arg5
         jl      _only_less_than_16\@
 
@@ -420,10 +425,14 @@ _zero_cipher_left\@:
 
         # handle the last <16 Byte block seperately
 
+        mov %r13, PBlockLen(arg2)
 
         vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
+        vmovdqu %xmm9, CurCount(arg2)
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
+        vmovdqu %xmm9, PBlockEncKey(arg2)
 
         sub     $16, %r11
         add     %r13, %r11
@@ -451,6 +460,7 @@ _only_less_than_16\@:
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
 
+        vmovdqu %xmm9, PBlockEncKey(arg2)
 
         lea     SHIFT_MASK+16(%rip), %r12
         sub     %r13, %r12                           # adjust the shuffle mask pointer to be
@@ -480,6 +490,7 @@ _final_ghash_mul\@:
         vpxor   %xmm2, %xmm14, %xmm14
 	#GHASH computation for the last <16 Byte block
         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        vmovdqu %xmm14, AadHash(arg2)
         sub     %r13, %r11
         add     $16, %r11
         .else
@@ -491,6 +502,7 @@ _final_ghash_mul\@:
         vpxor   %xmm9, %xmm14, %xmm14
 	#GHASH computation for the last <16 Byte block
         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        vmovdqu %xmm14, AadHash(arg2)
         sub     %r13, %r11
         add     $16, %r11
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
@@ -526,12 +538,16 @@ _multiple_of_16_bytes\@:
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 .macro GCM_COMPLETE GHASH_MUL REP
-        mov     arg8, %r12                           # r12 = aadLen (number of bytes)
+        vmovdqu AadHash(arg2), %xmm14
+        vmovdqu HashKey(arg2), %xmm13
+
+        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
         shl     $3, %r12                             # convert into number of bits
         vmovd   %r12d, %xmm15                        # len(A) in xmm15
 
-        shl     $3, arg5                             # len(C) in bits  (*128)
-        vmovq   arg5, %xmm1
+        mov InLen(arg2), %r12
+        shl     $3, %r12                        # len(C) in bits  (*128)
+        vmovq   %r12, %xmm1
         vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
         vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
 
@@ -539,8 +555,7 @@ _multiple_of_16_bytes\@:
         \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
         vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
 
-        mov     arg6, %rax                           # rax = *Y0
-        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
+        vmovdqu OrigIV(arg2), %xmm9
 
         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
 
@@ -662,6 +677,20 @@ _get_AAD_done\@:
 .endm
 
 .macro INIT GHASH_MUL PRECOMPUTE
+        mov arg6, %r11
+        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
+        xor %r11d, %r11d
+        mov %r11, InLen(arg2) # ctx_data.in_length = 0
+
+        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
+        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
+        mov arg4, %rax
+        movdqu (%rax), %xmm0
+        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
+
+        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
+        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
+
         vmovdqu  (arg3), %xmm6              # xmm6 = HashKey
 
         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
@@ -809,10 +838,7 @@ _get_AAD_done\@:
 	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
-	mov     arg6, %rax                     # rax = *Y0
-	vmovdqu (%rax), \CTR                   # CTR = Y0
-	vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+	vmovdqu CurCount(arg2), \CTR
 
 	i = (9-\num_initial_blocks)
 	setreg
@@ -1748,16 +1774,13 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
 	i = (8-\num_initial_blocks)
 	setreg
-        vmovdqu AadHash(arg2), reg_i
+	vmovdqu AadHash(arg2), reg_i
 
 	# initialize the data pointer offset as zero
 	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
-	mov     arg6, %rax                     # rax = *Y0
-	vmovdqu (%rax), \CTR                   # CTR = Y0
-	vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+	vmovdqu CurCount(arg2), \CTR
 
 	i = (9-\num_initial_blocks)
 	setreg
-- 
cgit v1.2.3-59-g8ed1b


From 517a448e09846732d46e983a2195002d05857919 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:59:11 +0000
Subject: crypto: aesni - Move ghash_mul to GCM_COMPLETE

Prepare to handle partial blocks between scatter/gather calls.
For the last partial block, we only want to calculate the aadhash
in GCM_COMPLETE, and a new partial block macro will handle both
aadhash update and encrypting partial blocks between calls.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 0a9cdcfdd987..44a4a8b43ca4 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -488,8 +488,7 @@ _final_ghash_mul\@:
         vpand   %xmm1, %xmm2, %xmm2
         vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
         vpxor   %xmm2, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+
         vmovdqu %xmm14, AadHash(arg2)
         sub     %r13, %r11
         add     $16, %r11
@@ -500,8 +499,7 @@ _final_ghash_mul\@:
         vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
         vpxor   %xmm9, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+
         vmovdqu %xmm14, AadHash(arg2)
         sub     %r13, %r11
         add     $16, %r11
@@ -541,6 +539,14 @@ _multiple_of_16_bytes\@:
         vmovdqu AadHash(arg2), %xmm14
         vmovdqu HashKey(arg2), %xmm13
 
+        mov PBlockLen(arg2), %r12
+        cmp $0, %r12
+        je _partial_done\@
+
+	#GHASH computation for the last <16 Byte block
+        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
         mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
         shl     $3, %r12                             # convert into number of bits
         vmovd   %r12d, %xmm15                        # len(A) in xmm15
-- 
cgit v1.2.3-59-g8ed1b


From ec8c02d9a30b8324d7aae9e4e7a08973a8eaa8b4 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:59:26 +0000
Subject: crypto: aesni - Introduce READ_PARTIAL_BLOCK macro

Introduce READ_PARTIAL_BLOCK macro, and use it in the two existing
partial block cases: AAD and the end of ENC_DEC.   In particular,
the ENC_DEC case should be faster, since we read by 8/4 bytes if
possible.

This macro will also be used to read partial blocks between
enc_update and dec_update calls.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 102 ++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 43 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 44a4a8b43ca4..ff00ad19064d 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -415,68 +415,56 @@ _zero_cipher_left\@:
         vmovdqu %xmm14, AadHash(arg2)
         vmovdqu %xmm9, CurCount(arg2)
 
-        cmp     $16, arg5
-        jl      _only_less_than_16\@
-
+        # check for 0 length
         mov     arg5, %r13
         and     $15, %r13                            # r13 = (arg5 mod 16)
 
         je      _multiple_of_16_bytes\@
 
-        # handle the last <16 Byte block seperately
+        # handle the last <16 Byte block separately
 
         mov %r13, PBlockLen(arg2)
 
-        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
+        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
         vmovdqu %xmm9, CurCount(arg2)
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
         ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
         vmovdqu %xmm9, PBlockEncKey(arg2)
 
-        sub     $16, %r11
-        add     %r13, %r11
-        vmovdqu (arg4, %r11), %xmm1                  # receive the last <16 Byte block
-
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
-        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
-        jmp     _final_ghash_mul\@
-
-_only_less_than_16\@:
-        # check for 0 length
-        mov     arg5, %r13
-        and     $15, %r13                            # r13 = (arg5 mod 16)
+        cmp $16, arg5
+        jge _large_enough_update\@
 
-        je      _multiple_of_16_bytes\@
+        lea (arg4,%r11,1), %r10
+        mov %r13, %r12
 
-        # handle the last <16 Byte block separately
-
-
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
-
-        vmovdqu %xmm9, PBlockEncKey(arg2)
+        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
 
         lea     SHIFT_MASK+16(%rip), %r12
         sub     %r13, %r12                           # adjust the shuffle mask pointer to be
 						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
+	# number of bytes in plaintext mod 16)
 
-_get_last_16_byte_loop\@:
-        movb    (arg4, %r11),  %al
-        movb    %al,  TMP1 (%rsp , %r11)
-        add     $1, %r11
-        cmp     %r13,  %r11
-        jne     _get_last_16_byte_loop\@
+        jmp _final_ghash_mul\@
+
+_large_enough_update\@:
+        sub $16, %r11
+        add %r13, %r11
+
+        # receive the last <16 Byte block
+        vmovdqu	(arg4, %r11, 1), %xmm1
 
-        vmovdqu  TMP1(%rsp), %xmm1
+        sub	%r13, %r11
+        add	$16, %r11
 
-        sub     $16, %r11
+        lea	SHIFT_MASK+16(%rip), %r12
+        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+        # (r13 is the number of bytes in plaintext mod 16)
+        sub	%r13, %r12
+        # get the appropriate shuffle mask
+        vmovdqu	(%r12), %xmm2
+        # shift right 16-r13 bytes
+        vpshufb  %xmm2, %xmm1, %xmm1
 
 _final_ghash_mul\@:
         .if  \ENC_DEC ==  DEC
@@ -490,8 +478,6 @@ _final_ghash_mul\@:
         vpxor   %xmm2, %xmm14, %xmm14
 
         vmovdqu %xmm14, AadHash(arg2)
-        sub     %r13, %r11
-        add     $16, %r11
         .else
         vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
         vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
@@ -501,8 +487,6 @@ _final_ghash_mul\@:
         vpxor   %xmm9, %xmm14, %xmm14
 
         vmovdqu %xmm14, AadHash(arg2)
-        sub     %r13, %r11
-        add     $16, %r11
         vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
         .endif
 
@@ -721,6 +705,38 @@ _get_AAD_done\@:
         \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 .endm
 
+
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+        vpxor \XMMDst, \XMMDst, \XMMDst
+
+        cmp $8, \DLEN
+        jl _read_lt8_\@
+        mov (\DPTR), %rax
+        vpinsrq $0, %rax, \XMMDst, \XMMDst
+        sub $8, \DLEN
+        jz _done_read_partial_block_\@
+        xor %eax, %eax
+_read_next_byte_\@:
+        shl $8, %rax
+        mov 7(\DPTR, \DLEN, 1), %al
+        dec \DLEN
+        jnz _read_next_byte_\@
+        vpinsrq $1, %rax, \XMMDst, \XMMDst
+        jmp _done_read_partial_block_\@
+_read_lt8_\@:
+        xor %eax, %eax
+_read_next_byte_lt8_\@:
+        shl $8, %rax
+        mov -1(\DPTR, \DLEN, 1), %al
+        dec \DLEN
+        jnz _read_next_byte_lt8_\@
+        vpinsrq $0, %rax, \XMMDst, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
 #ifdef CONFIG_AS_AVX
 ###############################################################################
 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-- 
cgit v1.2.3-59-g8ed1b


From e044d5056396029cc12ed5354aa2a073b747195a Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:59:38 +0000
Subject: crypto: aesni - Introduce partial block macro

Before this diff, multiple calls to GCM_ENC_DEC will
succeed, but only if all calls are a multiple of 16 bytes.

Handle partial blocks at the start of GCM_ENC_DEC, and update
aadhash as appropriate.

The data offset %r11 is also updated after the partial block.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 156 +++++++++++++++++++++++++++++--
 1 file changed, 150 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index ff00ad19064d..af45fc57db90 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -301,6 +301,12 @@ VARIABLE_OFFSET = 16*8
         vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
         add arg5, InLen(arg2)
 
+        # initialize the data pointer offset as zero
+        xor     %r11d, %r11d
+
+        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
+        sub %r11, arg5
+
         mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
         and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
 
@@ -737,6 +743,150 @@ _read_next_byte_lt8_\@:
 _done_read_partial_block_\@:
 .endm
 
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+        AAD_HASH ENC_DEC
+        mov 	PBlockLen(arg2), %r13
+        cmp	$0, %r13
+        je	_partial_block_done_\@	# Leave Macro if no partial blocks
+        # Read in input data without over reading
+        cmp	$16, \PLAIN_CYPH_LEN
+        jl	_fewer_than_16_bytes_\@
+        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
+        jmp	_data_read_\@
+
+_fewer_than_16_bytes_\@:
+        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+        mov	\PLAIN_CYPH_LEN, %r12
+        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+        mov PBlockLen(arg2), %r13
+
+_data_read_\@:				# Finished reading in data
+
+        vmovdqu	PBlockEncKey(arg2), %xmm9
+        vmovdqu	HashKey(arg2), %xmm13
+
+        lea	SHIFT_MASK(%rip), %r12
+
+        # adjust the shuffle mask pointer to be able to shift r13 bytes
+        # r16-r13 is the number of bytes in plaintext mod 16)
+        add	%r13, %r12
+        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
+        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
+
+.if  \ENC_DEC ==  DEC
+        vmovdqa	%xmm1, %xmm3
+        pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
+
+        mov	\PLAIN_CYPH_LEN, %r10
+        add	%r13, %r10
+        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+        sub	$16, %r10
+        # Determine if if partial block is not being filled and
+        # shift mask accordingly
+        jge	_no_extra_mask_1_\@
+        sub	%r10, %r12
+_no_extra_mask_1_\@:
+
+        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+        # get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
+
+        vpand	%xmm1, %xmm3, %xmm3
+        vmovdqa	SHUF_MASK(%rip), %xmm10
+        vpshufb	%xmm10, %xmm3, %xmm3
+        vpshufb	%xmm2, %xmm3, %xmm3
+        vpxor	%xmm3, \AAD_HASH, \AAD_HASH
+
+        cmp	$0, %r10
+        jl	_partial_incomplete_1_\@
+
+        # GHASH computation for the last <16 Byte block
+        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        xor	%eax,%eax
+
+        mov	%rax, PBlockLen(arg2)
+        jmp	_dec_done_\@
+_partial_incomplete_1_\@:
+        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
+_dec_done_\@:
+        vmovdqu	\AAD_HASH, AadHash(arg2)
+.else
+        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
+
+        mov	\PLAIN_CYPH_LEN, %r10
+        add	%r13, %r10
+        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+        sub	$16, %r10
+        # Determine if if partial block is not being filled and
+        # shift mask accordingly
+        jge	_no_extra_mask_2_\@
+        sub	%r10, %r12
+_no_extra_mask_2_\@:
+
+        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+        # get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand	%xmm1, %xmm9, %xmm9
+
+        vmovdqa	SHUF_MASK(%rip), %xmm1
+        vpshufb %xmm1, %xmm9, %xmm9
+        vpshufb %xmm2, %xmm9, %xmm9
+        vpxor	%xmm9, \AAD_HASH, \AAD_HASH
+
+        cmp	$0, %r10
+        jl	_partial_incomplete_2_\@
+
+        # GHASH computation for the last <16 Byte block
+        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        xor	%eax,%eax
+
+        mov	%rax, PBlockLen(arg2)
+        jmp	_encode_done_\@
+_partial_incomplete_2_\@:
+        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
+_encode_done_\@:
+        vmovdqu	\AAD_HASH, AadHash(arg2)
+
+        vmovdqa	SHUF_MASK(%rip), %xmm10
+        # shuffle xmm9 back to output as ciphertext
+        vpshufb	%xmm10, %xmm9, %xmm9
+        vpshufb	%xmm2, %xmm9, %xmm9
+.endif
+        # output encrypted Bytes
+        cmp	$0, %r10
+        jl	_partial_fill_\@
+        mov	%r13, %r12
+        mov	$16, %r13
+        # Set r13 to be the number of bytes to write out
+        sub	%r12, %r13
+        jmp	_count_set_\@
+_partial_fill_\@:
+        mov	\PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+        vmovdqa	%xmm9, %xmm0
+        vmovq	%xmm0, %rax
+        cmp	$8, %r13
+        jle	_less_than_8_bytes_left_\@
+
+        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+        add	$8, \DATA_OFFSET
+        psrldq	$8, %xmm0
+        vmovq	%xmm0, %rax
+        sub	$8, %r13
+_less_than_8_bytes_left_\@:
+        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+        add	$1, \DATA_OFFSET
+        shr	$8, %rax
+        sub	$1, %r13
+        jne	_less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
+
 #ifdef CONFIG_AS_AVX
 ###############################################################################
 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -856,9 +1006,6 @@ _done_read_partial_block_\@:
 	setreg
         vmovdqu AadHash(arg2), reg_i
 
-	# initialize the data pointer offset as zero
-	xor     %r11d, %r11d
-
 	# start AES for num_initial_blocks blocks
 	vmovdqu CurCount(arg2), \CTR
 
@@ -1798,9 +1945,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
 	setreg
 	vmovdqu AadHash(arg2), reg_i
 
-	# initialize the data pointer offset as zero
-	xor     %r11d, %r11d
-
 	# start AES for num_initial_blocks blocks
 	vmovdqu CurCount(arg2), \CTR
 
-- 
cgit v1.2.3-59-g8ed1b


From 603f8c3b0dbbe21fabb7e005f57883b21aaadd82 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Mon, 10 Dec 2018 19:59:59 +0000
Subject: crypto: aesni - Add scatter/gather avx stubs, and use them in C

Add the appropriate scatter/gather stubs to the avx asm.
In the C code, we can now always use crypt_by_sg, since both
sse and asm code now support scatter/gather.

Introduce a new struct, aesni_gcm_tfm, that is initialized on
startup to point to either the SSE, AVX, or AVX2 versions of the
four necessary encryption/decryption routines.

GENX_OPTSIZE is still checked at the start of crypt_by_sg.  The
total size of the data is checked, since the additional overhead
is in the init function, calculating additional HashKeys.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 181 +++++++++-------
 arch/x86/crypto/aesni-intel_glue.c       | 349 +++++++++----------------------
 2 files changed, 198 insertions(+), 332 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index af45fc57db90..91c039ab5699 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -518,14 +518,13 @@ _less_than_8_bytes_left\@:
         #############################
 
 _multiple_of_16_bytes\@:
-        GCM_COMPLETE \GHASH_MUL \REP
 .endm
 
 
 # GCM_COMPLETE Finishes update of tag of last partial block
 # Output: Authorization Tag (AUTH_TAG)
 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE GHASH_MUL REP
+.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
         vmovdqu AadHash(arg2), %xmm14
         vmovdqu HashKey(arg2), %xmm13
 
@@ -560,8 +559,8 @@ _partial_done\@:
 
 
 _return_T\@:
-        mov     arg9, %r10              # r10 = authTag
-        mov     arg10, %r11              # r11 = auth_tag_len
+        mov     \AUTH_TAG, %r10              # r10 = authTag
+        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
 
         cmp     $16, %r11
         je      _T_16\@
@@ -680,14 +679,14 @@ _get_AAD_done\@:
 
         mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
         mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
-        mov arg4, %rax
+        mov arg3, %rax
         movdqu (%rax), %xmm0
         movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
 
         vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
         movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
 
-        vmovdqu  (arg3), %xmm6              # xmm6 = HashKey
+        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
 
         vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
         ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
@@ -1776,88 +1775,100 @@ _initial_blocks_done\@:
 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
 #        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
 #############################################################
-ENTRY(aesni_gcm_precomp_avx_gen2)
+ENTRY(aesni_gcm_init_avx_gen2)
         FUNC_SAVE
         INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
         FUNC_RESTORE
         ret
-ENDPROC(aesni_gcm_precomp_avx_gen2)
+ENDPROC(aesni_gcm_init_avx_gen2)
 
 ###############################################################################
-#void   aesni_gcm_enc_avx_gen2(
+#void   aesni_gcm_enc_update_avx_gen2(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
 #        gcm_context_data *data,
 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
 #        const   u8 *in, /* Plaintext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
 ###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen2)
+ENTRY(aesni_gcm_enc_update_avx_gen2)
         FUNC_SAVE
         mov     keysize, %eax
         cmp     $32, %eax
-        je      key_256_enc
+        je      key_256_enc_update
         cmp     $16, %eax
-        je      key_128_enc
+        je      key_128_enc_update
         # must be 192
         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
         FUNC_RESTORE
         ret
-key_128_enc:
+key_128_enc_update:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
         FUNC_RESTORE
         ret
-key_256_enc:
+key_256_enc_update:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
         FUNC_RESTORE
         ret
-ENDPROC(aesni_gcm_enc_avx_gen2)
+ENDPROC(aesni_gcm_enc_update_avx_gen2)
 
 ###############################################################################
-#void   aesni_gcm_dec_avx_gen2(
+#void   aesni_gcm_dec_update_avx_gen2(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
 #        gcm_context_data *data,
 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
 #        const   u8 *in, /* Ciphertext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
 ###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen2)
+ENTRY(aesni_gcm_dec_update_avx_gen2)
         FUNC_SAVE
         mov     keysize,%eax
         cmp     $32, %eax
-        je      key_256_dec
+        je      key_256_dec_update
         cmp     $16, %eax
-        je      key_128_dec
+        je      key_128_dec_update
         # must be 192
         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
         FUNC_RESTORE
         ret
-key_128_dec:
+key_128_dec_update:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
         FUNC_RESTORE
         ret
-key_256_dec:
+key_256_dec_update:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
         FUNC_RESTORE
         ret
-ENDPROC(aesni_gcm_dec_avx_gen2)
+ENDPROC(aesni_gcm_dec_update_avx_gen2)
+
+###############################################################################
+#void   aesni_gcm_finalize_avx_gen2(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
+#        u8      *auth_tag, /* Authenticated Tag output. */
+#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
+#				Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_finalize_avx_gen2)
+        FUNC_SAVE
+        mov	keysize,%eax
+        cmp     $32, %eax
+        je      key_256_finalize
+        cmp     $16, %eax
+        je      key_128_finalize
+        # must be 192
+        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_128_finalize:
+        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_256_finalize:
+        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_finalize_avx_gen2)
+
 #endif /* CONFIG_AS_AVX */
 
 #ifdef CONFIG_AS_AVX2
@@ -2724,24 +2735,23 @@ _initial_blocks_done\@:
 
 
 #############################################################
-#void   aesni_gcm_precomp_avx_gen4
+#void   aesni_gcm_init_avx_gen4
 #        (gcm_data     *my_ctx_data,
 #         gcm_context_data *data,
-#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
 #        u8      *iv, /* Pre-counter block j0: 4 byte salt
 #			(from Security Association) concatenated with 8 byte
 #			Initialisation Vector (from IPSec ESP Payload)
 #			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
 #        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
 #############################################################
-ENTRY(aesni_gcm_precomp_avx_gen4)
+ENTRY(aesni_gcm_init_avx_gen4)
         FUNC_SAVE
         INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
         FUNC_RESTORE
         ret
-ENDPROC(aesni_gcm_precomp_avx_gen4)
-
+ENDPROC(aesni_gcm_init_avx_gen4)
 
 ###############################################################################
 #void   aesni_gcm_enc_avx_gen4(
@@ -2749,74 +2759,85 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
 #        gcm_context_data *data,
 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
 #        const   u8 *in, /* Plaintext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			 Initialisation Vector (from IPSec ESP Payload)
-#			 concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
 ###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen4)
+ENTRY(aesni_gcm_enc_update_avx_gen4)
         FUNC_SAVE
         mov     keysize,%eax
         cmp     $32, %eax
-        je      key_256_enc4
+        je      key_256_enc_update4
         cmp     $16, %eax
-        je      key_128_enc4
+        je      key_128_enc_update4
         # must be 192
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
         FUNC_RESTORE
 	ret
-key_128_enc4:
+key_128_enc_update4:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
         FUNC_RESTORE
 	ret
-key_256_enc4:
+key_256_enc_update4:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
         FUNC_RESTORE
 	ret
-ENDPROC(aesni_gcm_enc_avx_gen4)
+ENDPROC(aesni_gcm_enc_update_avx_gen4)
 
 ###############################################################################
-#void   aesni_gcm_dec_avx_gen4(
+#void   aesni_gcm_dec_update_avx_gen4(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
 #        gcm_context_data *data,
 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
 #        const   u8 *in, /* Ciphertext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
 ###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen4)
+ENTRY(aesni_gcm_dec_update_avx_gen4)
         FUNC_SAVE
         mov     keysize,%eax
         cmp     $32, %eax
-        je      key_256_dec4
+        je      key_256_dec_update4
         cmp     $16, %eax
-        je      key_128_dec4
+        je      key_128_dec_update4
         # must be 192
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
         FUNC_RESTORE
         ret
-key_128_dec4:
+key_128_dec_update4:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
         FUNC_RESTORE
         ret
-key_256_dec4:
+key_256_dec_update4:
         GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
         FUNC_RESTORE
         ret
-ENDPROC(aesni_gcm_dec_avx_gen4)
+ENDPROC(aesni_gcm_dec_update_avx_gen4)
+
+###############################################################################
+#void   aesni_gcm_finalize_avx_gen4(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
+#        u8      *auth_tag, /* Authenticated Tag output. */
+#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
+#                              Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_finalize_avx_gen4)
+        FUNC_SAVE
+        mov	keysize,%eax
+        cmp     $32, %eax
+        je      key_256_finalize4
+        cmp     $16, %eax
+        je      key_128_finalize4
+        # must be 192
+        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_128_finalize4:
+        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_256_finalize4:
+        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_finalize_avx_gen4)
 
 #endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2648842f1c3f..1321700d6647 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
 				   struct gcm_context_data *gdata,
 				   u8 *auth_tag, unsigned long auth_tag_len);
 
+static struct aesni_gcm_tfm_s {
+void (*init)(void *ctx,
+				struct gcm_context_data *gdata,
+				u8 *iv,
+				u8 *hash_subkey, const u8 *aad,
+				unsigned long aad_len);
+void (*enc_update)(void *ctx,
+					struct gcm_context_data *gdata, u8 *out,
+					const u8 *in,
+					unsigned long plaintext_len);
+void (*dec_update)(void *ctx,
+					struct gcm_context_data *gdata, u8 *out,
+					const u8 *in,
+					unsigned long ciphertext_len);
+void (*finalize)(void *ctx,
+				struct gcm_context_data *gdata,
+				u8 *auth_tag, unsigned long auth_tag_len);
+} *aesni_gcm_tfm;
+
+struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
+	.init = &aesni_gcm_init,
+	.enc_update = &aesni_gcm_enc_update,
+	.dec_update = &aesni_gcm_dec_update,
+	.finalize = &aesni_gcm_finalize,
+};
+
 #ifdef CONFIG_AS_AVX
 asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
@@ -183,16 +209,27 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
 asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
 /*
- * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * asmlinkage void aesni_gcm_init_avx_gen2()
  * gcm_data *my_ctx_data, context data
  * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
  */
-asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data,
-					   struct gcm_context_data *gdata,
-					   u8 *hash_subkey,
-					   u8 *iv,
-					   const u8 *aad,
-					   unsigned long aad_len);
+asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
+					struct gcm_context_data *gdata,
+					u8 *iv,
+					u8 *hash_subkey,
+					const u8 *aad,
+					unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in,
+				     unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
+				   struct gcm_context_data *gdata,
+				   u8 *auth_tag, unsigned long auth_tag_len);
 
 asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
 				struct gcm_context_data *gdata, u8 *out,
@@ -206,55 +243,38 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-static void aesni_gcm_enc_avx(void *ctx,
-			struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long plaintext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-	if (plaintext_len < AVX_GEN2_OPTSIZE) {
-		aesni_gcm_enc(ctx, data, out, in,
-			plaintext_len, iv, hash_subkey, aad,
-			aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
-					   aad, aad_len);
-		aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
-				       aad, aad_len, auth_tag, auth_tag_len);
-	}
-}
+struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
+	.init = &aesni_gcm_init_avx_gen2,
+	.enc_update = &aesni_gcm_enc_update_avx_gen2,
+	.dec_update = &aesni_gcm_dec_update_avx_gen2,
+	.finalize = &aesni_gcm_finalize_avx_gen2,
+};
 
-static void aesni_gcm_dec_avx(void *ctx,
-			struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long ciphertext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
-		aesni_gcm_dec(ctx, data, out, in,
-			ciphertext_len, iv, hash_subkey, aad,
-			aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
-					   aad, aad_len);
-		aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
-				       aad, aad_len, auth_tag, auth_tag_len);
-	}
-}
 #endif
 
 #ifdef CONFIG_AS_AVX2
 /*
- * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * asmlinkage void aesni_gcm_init_avx_gen4()
  * gcm_data *my_ctx_data, context data
  * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
  */
-asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data,
-					   struct gcm_context_data *gdata,
-					   u8 *hash_subkey,
-					   u8 *iv,
-					   const u8 *aad,
-					   unsigned long aad_len);
+asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
+					struct gcm_context_data *gdata,
+					u8 *iv,
+					u8 *hash_subkey,
+					const u8 *aad,
+					unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in,
+				     unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
+				   struct gcm_context_data *gdata,
+				   u8 *auth_tag, unsigned long auth_tag_len);
 
 asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
 				struct gcm_context_data *gdata, u8 *out,
@@ -268,67 +288,15 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-static void aesni_gcm_enc_avx2(void *ctx,
-			struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long plaintext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-	if (plaintext_len < AVX_GEN2_OPTSIZE) {
-		aesni_gcm_enc(ctx, data, out, in,
-			      plaintext_len, iv, hash_subkey, aad,
-			      aad_len, auth_tag, auth_tag_len);
-	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
-					   aad, aad_len);
-		aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
-				       aad, aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv,
-					   aad, aad_len);
-		aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv,
-				       aad, aad_len, auth_tag, auth_tag_len);
-	}
-}
+struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
+	.init = &aesni_gcm_init_avx_gen4,
+	.enc_update = &aesni_gcm_enc_update_avx_gen4,
+	.dec_update = &aesni_gcm_dec_update_avx_gen4,
+	.finalize = &aesni_gcm_finalize_avx_gen4,
+};
 
-static void aesni_gcm_dec_avx2(void *ctx,
-	struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long ciphertext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-	if (ciphertext_len < AVX_GEN2_OPTSIZE) {
-		aesni_gcm_dec(ctx, data, out, in,
-			      ciphertext_len, iv, hash_subkey,
-			      aad, aad_len, auth_tag, auth_tag_len);
-	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv,
-					   aad, aad_len);
-		aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
-				       aad, aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv,
-					   aad, aad_len);
-		aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv,
-				       aad, aad_len, auth_tag, auth_tag_len);
-	}
-}
 #endif
 
-static void (*aesni_gcm_enc_tfm)(void *ctx,
-				 struct gcm_context_data *data, u8 *out,
-				 const u8 *in, unsigned long plaintext_len,
-				 u8 *iv, u8 *hash_subkey, const u8 *aad,
-				 unsigned long aad_len, u8 *auth_tag,
-				 unsigned long auth_tag_len);
-
-static void (*aesni_gcm_dec_tfm)(void *ctx,
-				 struct gcm_context_data *data, u8 *out,
-				 const u8 *in, unsigned long ciphertext_len,
-				 u8 *iv, u8 *hash_subkey, const u8 *aad,
-				 unsigned long aad_len, u8 *auth_tag,
-				 unsigned long auth_tag_len);
-
 static inline struct
 aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
 {
@@ -810,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
 	struct gcm_context_data data AESNI_ALIGN_ATTR;
 	struct scatter_walk dst_sg_walk = {};
 	unsigned long left = req->cryptlen;
@@ -827,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 	if (!enc)
 		left -= auth_tag_len;
 
+#ifdef CONFIG_AS_AVX2
+	if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
+		gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+#endif
+#ifdef CONFIG_AS_AVX
+	if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
+		gcm_tfm = &aesni_gcm_tfm_sse;
+#endif
+
 	/* Linearize assoc, if not already linear */
 	if (req->src->length >= assoclen && req->src->length &&
 		(!PageHighMem(sg_page(req->src)) ||
@@ -851,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 	}
 
 	kernel_fpu_begin();
-	aesni_gcm_init(aes_ctx, &data, iv,
+	gcm_tfm->init(aes_ctx, &data, iv,
 		hash_subkey, assoc, assoclen);
 	if (req->src != req->dst) {
 		while (left) {
@@ -862,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 			len = min(srclen, dstlen);
 			if (len) {
 				if (enc)
-					aesni_gcm_enc_update(aes_ctx, &data,
+					gcm_tfm->enc_update(aes_ctx, &data,
 							     dst, src, len);
 				else
-					aesni_gcm_dec_update(aes_ctx, &data,
+					gcm_tfm->dec_update(aes_ctx, &data,
 							     dst, src, len);
 			}
 			left -= len;
@@ -883,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 			len = scatterwalk_clamp(&src_sg_walk, left);
 			if (len) {
 				if (enc)
-					aesni_gcm_enc_update(aes_ctx, &data,
+					gcm_tfm->enc_update(aes_ctx, &data,
 							     src, src, len);
 				else
-					aesni_gcm_dec_update(aes_ctx, &data,
+					gcm_tfm->dec_update(aes_ctx, &data,
 							     src, src, len);
 			}
 			left -= len;
@@ -895,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 			scatterwalk_done(&src_sg_walk, 1, left);
 		}
 	}
-	aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+	gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
 	kernel_fpu_end();
 
 	if (!assocmem)
@@ -928,145 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
 			  u8 *hash_subkey, u8 *iv, void *aes_ctx)
 {
-	u8 one_entry_in_sg = 0;
-	u8 *src, *dst, *assoc;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	struct scatter_walk src_sg_walk;
-	struct scatter_walk dst_sg_walk = {};
-	struct gcm_context_data data AESNI_ALIGN_ATTR;
-
-	if (aesni_gcm_enc_tfm == aesni_gcm_enc ||
-		req->cryptlen < AVX_GEN2_OPTSIZE) {
-		return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
-					  aes_ctx);
-	}
-	if (sg_is_last(req->src) &&
-	    (!PageHighMem(sg_page(req->src)) ||
-	    req->src->offset + req->src->length <= PAGE_SIZE) &&
-	    sg_is_last(req->dst) &&
-	    (!PageHighMem(sg_page(req->dst)) ||
-	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
-		one_entry_in_sg = 1;
-		scatterwalk_start(&src_sg_walk, req->src);
-		assoc = scatterwalk_map(&src_sg_walk);
-		src = assoc + req->assoclen;
-		dst = src;
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
-		}
-	} else {
-		/* Allocate memory for src, dst, assoc */
-		assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
-			GFP_ATOMIC);
-		if (unlikely(!assoc))
-			return -ENOMEM;
-		scatterwalk_map_and_copy(assoc, req->src, 0,
-					 req->assoclen + req->cryptlen, 0);
-		src = assoc + req->assoclen;
-		dst = src;
-	}
-
-	kernel_fpu_begin();
-	aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
-			  hash_subkey, assoc, assoclen,
-			  dst + req->cryptlen, auth_tag_len);
-	kernel_fpu_end();
-
-	/* The authTag (aka the Integrity Check Value) needs to be written
-	 * back to the packet. */
-	if (one_entry_in_sg) {
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst - req->assoclen);
-			scatterwalk_advance(&dst_sg_walk, req->dst->length);
-			scatterwalk_done(&dst_sg_walk, 1, 0);
-		}
-		scatterwalk_unmap(assoc);
-		scatterwalk_advance(&src_sg_walk, req->src->length);
-		scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
-	} else {
-		scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
-					 req->cryptlen + auth_tag_len, 1);
-		kfree(assoc);
-	}
-	return 0;
+	return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+				aes_ctx);
 }
 
 static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
 			  u8 *hash_subkey, u8 *iv, void *aes_ctx)
 {
-	u8 one_entry_in_sg = 0;
-	u8 *src, *dst, *assoc;
-	unsigned long tempCipherLen = 0;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	u8 authTag[16];
-	struct scatter_walk src_sg_walk;
-	struct scatter_walk dst_sg_walk = {};
-	struct gcm_context_data data AESNI_ALIGN_ATTR;
-	int retval = 0;
-
-	if (aesni_gcm_enc_tfm == aesni_gcm_enc ||
-		req->cryptlen < AVX_GEN2_OPTSIZE) {
-		return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
-					  aes_ctx);
-	}
-	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
-
-	if (sg_is_last(req->src) &&
-	    (!PageHighMem(sg_page(req->src)) ||
-	    req->src->offset + req->src->length <= PAGE_SIZE) &&
-	    sg_is_last(req->dst) && req->dst->length &&
-	    (!PageHighMem(sg_page(req->dst)) ||
-	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
-		one_entry_in_sg = 1;
-		scatterwalk_start(&src_sg_walk, req->src);
-		assoc = scatterwalk_map(&src_sg_walk);
-		src = assoc + req->assoclen;
-		dst = src;
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
-		}
-	} else {
-		/* Allocate memory for src, dst, assoc */
-		assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
-		if (!assoc)
-			return -ENOMEM;
-		scatterwalk_map_and_copy(assoc, req->src, 0,
-					 req->assoclen + req->cryptlen, 0);
-		src = assoc + req->assoclen;
-		dst = src;
-	}
-
-
-	kernel_fpu_begin();
-	aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
-			  hash_subkey, assoc, assoclen,
-			  authTag, auth_tag_len);
-	kernel_fpu_end();
-
-	/* Compare generated tag with passed in tag. */
-	retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
-		-EBADMSG : 0;
-
-	if (one_entry_in_sg) {
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst - req->assoclen);
-			scatterwalk_advance(&dst_sg_walk, req->dst->length);
-			scatterwalk_done(&dst_sg_walk, 1, 0);
-		}
-		scatterwalk_unmap(assoc);
-		scatterwalk_advance(&src_sg_walk, req->src->length);
-		scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
-	} else {
-		scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
-					 tempCipherLen, 1);
-		kfree(assoc);
-	}
-	return retval;
-
+	return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+				aes_ctx);
 }
 
 static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -1434,21 +1282,18 @@ static int __init aesni_init(void)
 #ifdef CONFIG_AS_AVX2
 	if (boot_cpu_has(X86_FEATURE_AVX2)) {
 		pr_info("AVX2 version of gcm_enc/dec engaged.\n");
-		aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
-		aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+		aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
 	} else
 #endif
 #ifdef CONFIG_AS_AVX
 	if (boot_cpu_has(X86_FEATURE_AVX)) {
 		pr_info("AVX version of gcm_enc/dec engaged.\n");
-		aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
-		aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+		aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
 	} else
 #endif
 	{
 		pr_info("SSE version of gcm_enc/dec engaged.\n");
-		aesni_gcm_enc_tfm = aesni_gcm_enc;
-		aesni_gcm_dec_tfm = aesni_gcm_dec;
+		aesni_gcm_tfm = &aesni_gcm_tfm_sse;
 	}
 	aesni_ctr_enc_tfm = aesni_ctr_enc;
 #ifdef CONFIG_AS_AVX
-- 
cgit v1.2.3-59-g8ed1b


From 9d880c5945c748d8edcac30965f3349a602158c4 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Mon, 10 Dec 2018 16:49:29 -0700
Subject: crypto: ux500 - Use proper enum in cryp_set_dma_transfer

Clang warns when one enumerated type is implicitly converted to another:

drivers/crypto/ux500/cryp/cryp_core.c:559:5: warning: implicit
conversion from enumeration type 'enum dma_data_direction' to different
enumeration type 'enum dma_transfer_direction' [-Wenum-conversion]
                                direction, DMA_CTRL_ACK);
                                ^~~~~~~~~
drivers/crypto/ux500/cryp/cryp_core.c:583:5: warning: implicit
conversion from enumeration type 'enum dma_data_direction' to different
enumeration type 'enum dma_transfer_direction' [-Wenum-conversion]
                                direction,
                                ^~~~~~~~~
2 warnings generated.

dmaengine_prep_slave_sg expects an enum from dma_transfer_direction.
Because we know the value of the dma_data_direction enum from the
switch statement, we can just use the proper value from
dma_transfer_direction so there is no more conversion.

DMA_TO_DEVICE = DMA_MEM_TO_DEV = 1
DMA_FROM_DEVICE = DMA_DEV_TO_MEM = 2

Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ux500/cryp/cryp_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/ux500/cryp/cryp_core.c b/drivers/crypto/ux500/cryp/cryp_core.c
index d2663a4e1f5e..a92a66b1ff46 100644
--- a/drivers/crypto/ux500/cryp/cryp_core.c
+++ b/drivers/crypto/ux500/cryp/cryp_core.c
@@ -556,7 +556,7 @@ static int cryp_set_dma_transfer(struct cryp_ctx *ctx,
 		desc = dmaengine_prep_slave_sg(channel,
 				ctx->device->dma.sg_src,
 				ctx->device->dma.sg_src_len,
-				direction, DMA_CTRL_ACK);
+				DMA_MEM_TO_DEV, DMA_CTRL_ACK);
 		break;
 
 	case DMA_FROM_DEVICE:
@@ -580,7 +580,7 @@ static int cryp_set_dma_transfer(struct cryp_ctx *ctx,
 		desc = dmaengine_prep_slave_sg(channel,
 				ctx->device->dma.sg_dst,
 				ctx->device->dma.sg_dst_len,
-				direction,
+				DMA_DEV_TO_MEM,
 				DMA_CTRL_ACK |
 				DMA_PREP_INTERRUPT);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5ac93f808338f4dd465402e91869702eb87db241 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Mon, 10 Dec 2018 16:49:54 -0700
Subject: crypto: ux500 - Use proper enum in hash_set_dma_transfer

Clang warns when one enumerated type is implicitly converted to another:

drivers/crypto/ux500/hash/hash_core.c:169:4: warning: implicit
conversion from enumeration type 'enum dma_data_direction' to different
enumeration type 'enum dma_transfer_direction' [-Wenum-conversion]
                        direction, DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
                        ^~~~~~~~~
1 warning generated.

dmaengine_prep_slave_sg expects an enum from dma_transfer_direction.
We know that the only direction supported by this function is
DMA_TO_DEVICE because of the check at the top of this function so we can
just use the equivalent value from dma_transfer_direction.

DMA_TO_DEVICE = DMA_MEM_TO_DEV = 1

Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/ux500/hash/hash_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/ux500/hash/hash_core.c b/drivers/crypto/ux500/hash/hash_core.c
index 633321a8dd03..a0bb8a6eec3f 100644
--- a/drivers/crypto/ux500/hash/hash_core.c
+++ b/drivers/crypto/ux500/hash/hash_core.c
@@ -166,7 +166,7 @@ static int hash_set_dma_transfer(struct hash_ctx *ctx, struct scatterlist *sg,
 		__func__);
 	desc = dmaengine_prep_slave_sg(channel,
 			ctx->device->dma.sg, ctx->device->dma.sg_len,
-			direction, DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+			DMA_MEM_TO_DEV, DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
 	if (!desc) {
 		dev_err(ctx->device->dev,
 			"%s: dmaengine_prep_slave_sg() failed!\n", __func__);
-- 
cgit v1.2.3-59-g8ed1b


From 3cc04c160208ec55940db652343d236515d88af5 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 11 Dec 2018 08:11:59 +0000
Subject: crypto: chelsio - remove set but not used variable 'kctx_len'

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/crypto/chelsio/chcr_ipsec.c: In function 'chcr_ipsec_xmit':
drivers/crypto/chelsio/chcr_ipsec.c:674:33: warning:
 variable 'kctx_len' set but not used [-Wunused-but-set-variable]
  unsigned int flits = 0, ndesc, kctx_len;

It not used since commit 8362ea16f69f ("crypto: chcr - ESN for Inline IPSec Tx")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_ipsec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c
index 9321d2b1a5ab..82ca33126786 100644
--- a/drivers/crypto/chelsio/chcr_ipsec.c
+++ b/drivers/crypto/chelsio/chcr_ipsec.c
@@ -671,7 +671,7 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipsec_sa_entry *sa_entry;
 	u64 *pos, *end, *before, *sgl;
 	int qidx, left, credits;
-	unsigned int flits = 0, ndesc, kctx_len;
+	unsigned int flits = 0, ndesc;
 	struct adapter *adap;
 	struct sge_eth_txq *q;
 	struct port_info *pi;
@@ -682,7 +682,6 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_BUSY;
 
 	sa_entry = (struct ipsec_sa_entry *)x->xso.offload_handle;
-	kctx_len = sa_entry->kctx_len;
 
 	if (skb->sp->len != 1) {
 out_free:       dev_kfree_skb_any(skb);
-- 
cgit v1.2.3-59-g8ed1b


From 1f479e4cfd08f20e48dfde07b27e3180e0901252 Mon Sep 17 00:00:00 2001
From: Harsh Jain <harsh@chelsio.com>
Date: Tue, 11 Dec 2018 16:21:37 +0530
Subject: crypto: chelsio - Swap location of AAD and IV sent in WR

Send input as IV | AAD | Data. It will allow sending IV as Immediate
Data and Creates space in Work request to add more dma mapped entries.

Signed-off-by: Harsh Jain <harsh@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_algo.c   | 212 +++++++++++++++++------------------
 drivers/crypto/chelsio/chcr_algo.h   |   2 +-
 drivers/crypto/chelsio/chcr_crypto.h |  10 +-
 3 files changed, 104 insertions(+), 120 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index 497c57803de7..2b26735fd122 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -2215,10 +2215,7 @@ static int chcr_aead_common_init(struct aead_request *req)
 		error = -ENOMEM;
 		goto err;
 	}
-	reqctx->aad_nents = sg_nents_xlen(req->src, req->assoclen,
-					  CHCR_SRC_SG_SIZE, 0);
-	reqctx->src_nents = sg_nents_xlen(req->src, req->cryptlen,
-					  CHCR_SRC_SG_SIZE, req->assoclen);
+
 	return 0;
 err:
 	return error;
@@ -2268,10 +2265,10 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	struct ulptx_sgl *ulptx;
 	unsigned int transhdr_len;
 	unsigned int dst_size = 0, temp, subtype = get_aead_subtype(tfm);
-	unsigned int   kctx_len = 0, dnents;
-	unsigned int  assoclen = req->assoclen;
+	unsigned int   kctx_len = 0, dnents, snents;
 	unsigned int  authsize = crypto_aead_authsize(tfm);
 	int error = -EINVAL;
+	u8 *ivptr;
 	int null = 0;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		GFP_ATOMIC;
@@ -2288,24 +2285,20 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	if (subtype == CRYPTO_ALG_SUB_TYPE_CBC_NULL ||
 		subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) {
 		null = 1;
-		assoclen = 0;
-		reqctx->aad_nents = 0;
 	}
-	dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0);
-	dnents += sg_nents_xlen(req->dst, req->cryptlen +
-		(reqctx->op ? -authsize : authsize), CHCR_DST_SG_SIZE,
-		req->assoclen);
+	dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen +
+		(reqctx->op ? -authsize : authsize), CHCR_DST_SG_SIZE, 0);
 	dnents += MIN_AUTH_SG; // For IV
-
+	snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen,
+			       CHCR_SRC_SG_SIZE, 0);
 	dst_size = get_space_for_phys_dsgl(dnents);
 	kctx_len = (ntohl(KEY_CONTEXT_CTX_LEN_V(aeadctx->key_ctx_hdr)) << 4)
 		- sizeof(chcr_req->key_ctx);
 	transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size);
-	reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen) <
+	reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen) <
 			SGE_MAX_WR_LEN;
-	temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen, 16)
-			: (sgl_len(reqctx->src_nents + reqctx->aad_nents
-			+ MIN_GCM_SG) * 8);
+	temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen, 16)
+			: (sgl_len(snents) * 8);
 	transhdr_len += temp;
 	transhdr_len = roundup(transhdr_len, 16);
 
@@ -2315,7 +2308,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 		chcr_aead_common_exit(req);
 		return ERR_PTR(chcr_aead_fallback(req, reqctx->op));
 	}
-	skb = alloc_skb(SGE_MAX_WR_LEN, flags);
+	skb = alloc_skb(transhdr_len, flags);
 	if (!skb) {
 		error = -ENOMEM;
 		goto err;
@@ -2331,16 +2324,16 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	 * to the hardware spec
 	 */
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2,
-				       assoclen + 1);
-	chcr_req->sec_cpl.pldlen = htonl(assoclen + IV + req->cryptlen);
+		FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2, 1);
+	chcr_req->sec_cpl.pldlen = htonl(req->assoclen + IV + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-					assoclen ? 1 : 0, assoclen,
-					assoclen + IV + 1,
+					null ? 0 : 1 + IV,
+					null ? 0 : IV + req->assoclen,
+					req->assoclen + IV + 1,
 					(temp & 0x1F0) >> 4);
 	chcr_req->sec_cpl.cipherstop_lo_authinsert = FILL_SEC_CPL_AUTHINSERT(
 					temp & 0xF,
-					null ? 0 : assoclen + IV + 1,
+					null ? 0 : req->assoclen + IV + 1,
 					temp, temp);
 	if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL ||
 	    subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA)
@@ -2367,23 +2360,24 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 
 	memcpy(chcr_req->key_ctx.key + roundup(aeadctx->enckey_len, 16),
 	       actx->h_iopad, kctx_len - roundup(aeadctx->enckey_len, 16));
+	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
+	ivptr = (u8 *)(phys_cpl + 1) + dst_size;
+	ulptx = (struct ulptx_sgl *)(ivptr + IV);
 	if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA ||
 	    subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) {
-		memcpy(reqctx->iv, aeadctx->nonce, CTR_RFC3686_NONCE_SIZE);
-		memcpy(reqctx->iv + CTR_RFC3686_NONCE_SIZE, req->iv,
+		memcpy(ivptr, aeadctx->nonce, CTR_RFC3686_NONCE_SIZE);
+		memcpy(ivptr + CTR_RFC3686_NONCE_SIZE, req->iv,
 				CTR_RFC3686_IV_SIZE);
-		*(__be32 *)(reqctx->iv + CTR_RFC3686_NONCE_SIZE +
+		*(__be32 *)(ivptr + CTR_RFC3686_NONCE_SIZE +
 			CTR_RFC3686_IV_SIZE) = cpu_to_be32(1);
 	} else {
-		memcpy(reqctx->iv, req->iv, IV);
+		memcpy(ivptr, req->iv, IV);
 	}
-	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
-	ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size);
-	chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid);
-	chcr_add_aead_src_ent(req, ulptx, assoclen);
+	chcr_add_aead_dst_ent(req, phys_cpl, qid);
+	chcr_add_aead_src_ent(req, ulptx);
 	atomic_inc(&adap->chcr_stats.cipher_rqst);
-	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size +
-		kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen) : 0);
+	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV +
+		kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen) : 0);
 	create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, size,
 		   transhdr_len, temp, 0);
 	reqctx->skb = skb;
@@ -2470,8 +2464,7 @@ void chcr_aead_dma_unmap(struct device *dev,
 }
 
 void chcr_add_aead_src_ent(struct aead_request *req,
-			   struct ulptx_sgl *ulptx,
-			   unsigned int assoclen)
+			   struct ulptx_sgl *ulptx)
 {
 	struct ulptx_walk ulp_walk;
 	struct chcr_aead_reqctx  *reqctx = aead_request_ctx(req);
@@ -2484,28 +2477,20 @@ void chcr_add_aead_src_ent(struct aead_request *req,
 			buf += reqctx->b0_len;
 		}
 		sg_pcopy_to_buffer(req->src, sg_nents(req->src),
-				   buf, assoclen, 0);
-		buf += assoclen;
-		memcpy(buf, reqctx->iv, IV);
-		buf += IV;
-		sg_pcopy_to_buffer(req->src, sg_nents(req->src),
-				   buf, req->cryptlen, req->assoclen);
+				   buf, req->cryptlen + req->assoclen, 0);
 	} else {
 		ulptx_walk_init(&ulp_walk, ulptx);
 		if (reqctx->b0_len)
 			ulptx_walk_add_page(&ulp_walk, reqctx->b0_len,
 					    &reqctx->b0_dma);
-		ulptx_walk_add_sg(&ulp_walk, req->src, assoclen, 0);
-		ulptx_walk_add_page(&ulp_walk, IV, &reqctx->iv_dma);
-		ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen,
-				  req->assoclen);
+		ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen +
+				  req->assoclen,  0);
 		ulptx_walk_end(&ulp_walk);
 	}
 }
 
 void chcr_add_aead_dst_ent(struct aead_request *req,
 			   struct cpl_rx_phys_dsgl *phys_cpl,
-			   unsigned int assoclen,
 			   unsigned short qid)
 {
 	struct chcr_aead_reqctx  *reqctx = aead_request_ctx(req);
@@ -2516,12 +2501,10 @@ void chcr_add_aead_dst_ent(struct aead_request *req,
 	u32 temp;
 
 	dsgl_walk_init(&dsgl_walk, phys_cpl);
-	if (reqctx->b0_len)
-		dsgl_walk_add_page(&dsgl_walk, reqctx->b0_len, &reqctx->b0_dma);
-	dsgl_walk_add_sg(&dsgl_walk, req->dst, assoclen, 0);
-	dsgl_walk_add_page(&dsgl_walk, IV, &reqctx->iv_dma);
-	temp = req->cryptlen + (reqctx->op ? -authsize : authsize);
-	dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, req->assoclen);
+	dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, &reqctx->iv_dma);
+	temp = req->assoclen + req->cryptlen +
+		(reqctx->op ? -authsize : authsize);
+	dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, 0);
 	dsgl_walk_end(&dsgl_walk, qid, ctx->pci_chan_id);
 }
 
@@ -2689,8 +2672,7 @@ static int set_msg_len(u8 *block, unsigned int msglen, int csize)
 	return 0;
 }
 
-static void generate_b0(struct aead_request *req,
-			struct chcr_aead_ctx *aeadctx,
+static void generate_b0(struct aead_request *req, u8 *ivptr,
 			unsigned short op_type)
 {
 	unsigned int l, lp, m;
@@ -2701,7 +2683,7 @@ static void generate_b0(struct aead_request *req,
 
 	m = crypto_aead_authsize(aead);
 
-	memcpy(b0, reqctx->iv, 16);
+	memcpy(b0, ivptr, 16);
 
 	lp = b0[0];
 	l = lp + 1;
@@ -2727,29 +2709,31 @@ static inline int crypto_ccm_check_iv(const u8 *iv)
 }
 
 static int ccm_format_packet(struct aead_request *req,
-			     struct chcr_aead_ctx *aeadctx,
+			     u8 *ivptr,
 			     unsigned int sub_type,
 			     unsigned short op_type,
 			     unsigned int assoclen)
 {
 	struct chcr_aead_reqctx *reqctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm));
 	int rc = 0;
 
 	if (sub_type == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4309) {
-		reqctx->iv[0] = 3;
-		memcpy(reqctx->iv + 1, &aeadctx->salt[0], 3);
-		memcpy(reqctx->iv + 4, req->iv, 8);
-		memset(reqctx->iv + 12, 0, 4);
+		ivptr[0] = 3;
+		memcpy(ivptr + 1, &aeadctx->salt[0], 3);
+		memcpy(ivptr + 4, req->iv, 8);
+		memset(ivptr + 12, 0, 4);
 	} else {
-		memcpy(reqctx->iv, req->iv, 16);
+		memcpy(ivptr, req->iv, 16);
 	}
 	if (assoclen)
 		*((unsigned short *)(reqctx->scratch_pad + 16)) =
 				htons(assoclen);
 
-	generate_b0(req, aeadctx, op_type);
+	generate_b0(req, ivptr, op_type);
 	/* zero the ctr value */
-	memset(reqctx->iv + 15 - reqctx->iv[0], 0, reqctx->iv[0] + 1);
+	memset(ivptr + 15 - ivptr[0], 0, ivptr[0] + 1);
 	return rc;
 }
 
@@ -2775,7 +2759,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 		((assoclen) ? CCM_AAD_FIELD_SIZE : 0);
 
 	auth_offset = req->cryptlen ?
-		(assoclen + IV + 1 + ccm_xtra) : 0;
+		(req->assoclen + IV + 1 + ccm_xtra) : 0;
 	if (op_type == CHCR_DECRYPT_OP) {
 		if (crypto_aead_authsize(tfm) != req->cryptlen)
 			tag_offset = crypto_aead_authsize(tfm);
@@ -2785,13 +2769,13 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 
 
 	sec_cpl->op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(c_id,
-					 2, assoclen + 1 + ccm_xtra);
+					 2, 1);
 	sec_cpl->pldlen =
-		htonl(assoclen + IV + req->cryptlen + ccm_xtra);
+		htonl(req->assoclen + IV + req->cryptlen + ccm_xtra);
 	/* For CCM there wil be b0 always. So AAD start will be 1 always */
 	sec_cpl->aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-					1, assoclen + ccm_xtra, assoclen
-					+ IV + 1 + ccm_xtra, 0);
+				1 + IV,	IV + assoclen + ccm_xtra,
+				req->assoclen + IV + 1 + ccm_xtra, 0);
 
 	sec_cpl->cipherstop_lo_authinsert = FILL_SEC_CPL_AUTHINSERT(0,
 					auth_offset, tag_offset,
@@ -2838,10 +2822,11 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req,
 	struct cpl_rx_phys_dsgl *phys_cpl;
 	struct ulptx_sgl *ulptx;
 	unsigned int transhdr_len;
-	unsigned int dst_size = 0, kctx_len, dnents, temp;
+	unsigned int dst_size = 0, kctx_len, dnents, temp, snents;
 	unsigned int sub_type, assoclen = req->assoclen;
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	int error = -EINVAL;
+	u8 *ivptr;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		GFP_ATOMIC;
 	struct adapter *adap = padap(a_ctx(tfm)->dev);
@@ -2857,37 +2842,38 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req,
 	error = aead_ccm_validate_input(reqctx->op, req, aeadctx, sub_type);
 	if (error)
 		goto err;
-	dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0);
-	dnents += sg_nents_xlen(req->dst, req->cryptlen
+	dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen
 			+ (reqctx->op ? -authsize : authsize),
-			CHCR_DST_SG_SIZE, req->assoclen);
+			CHCR_DST_SG_SIZE, 0);
 	dnents += MIN_CCM_SG; // For IV and B0
 	dst_size = get_space_for_phys_dsgl(dnents);
+	snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen,
+			       CHCR_SRC_SG_SIZE, 0);
+	snents += MIN_CCM_SG; //For B0
 	kctx_len = roundup(aeadctx->enckey_len, 16) * 2;
 	transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size);
-	reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen +
+	reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen +
 		       reqctx->b0_len) <= SGE_MAX_WR_LEN;
-	temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen +
+	temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen +
 				     reqctx->b0_len, 16) :
-		(sgl_len(reqctx->src_nents + reqctx->aad_nents +
-				    MIN_CCM_SG) *  8);
+		(sgl_len(snents) *  8);
 	transhdr_len += temp;
 	transhdr_len = roundup(transhdr_len, 16);
 
 	if (chcr_aead_need_fallback(req, dnents, T6_MAX_AAD_SIZE -
-				    reqctx->b0_len, transhdr_len, reqctx->op)) {
+				reqctx->b0_len, transhdr_len, reqctx->op)) {
 		atomic_inc(&adap->chcr_stats.fallback);
 		chcr_aead_common_exit(req);
 		return ERR_PTR(chcr_aead_fallback(req, reqctx->op));
 	}
-	skb = alloc_skb(SGE_MAX_WR_LEN,  flags);
+	skb = alloc_skb(transhdr_len,  flags);
 
 	if (!skb) {
 		error = -ENOMEM;
 		goto err;
 	}
 
-	chcr_req = (struct chcr_wr *) __skb_put_zero(skb, transhdr_len);
+	chcr_req = __skb_put_zero(skb, transhdr_len);
 
 	fill_sec_cpl_for_aead(&chcr_req->sec_cpl, dst_size, req, reqctx->op);
 
@@ -2897,16 +2883,17 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req,
 			aeadctx->key, aeadctx->enckey_len);
 
 	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
-	ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size);
-	error = ccm_format_packet(req, aeadctx, sub_type, reqctx->op, assoclen);
+	ivptr = (u8 *)(phys_cpl + 1) + dst_size;
+	ulptx = (struct ulptx_sgl *)(ivptr + IV);
+	error = ccm_format_packet(req, ivptr, sub_type, reqctx->op, assoclen);
 	if (error)
 		goto dstmap_fail;
-	chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid);
-	chcr_add_aead_src_ent(req, ulptx, assoclen);
+	chcr_add_aead_dst_ent(req, phys_cpl, qid);
+	chcr_add_aead_src_ent(req, ulptx);
 
 	atomic_inc(&adap->chcr_stats.aead_rqst);
-	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size +
-		kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen +
+	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV +
+		kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen +
 		reqctx->b0_len) : 0);
 	create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, 0,
 		    transhdr_len, temp, 0);
@@ -2931,10 +2918,11 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	struct chcr_wr *chcr_req;
 	struct cpl_rx_phys_dsgl *phys_cpl;
 	struct ulptx_sgl *ulptx;
-	unsigned int transhdr_len, dnents = 0;
+	unsigned int transhdr_len, dnents = 0, snents;
 	unsigned int dst_size = 0, temp = 0, kctx_len, assoclen = req->assoclen;
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	int error = -EINVAL;
+	u8 *ivptr;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		GFP_ATOMIC;
 	struct adapter *adap = padap(a_ctx(tfm)->dev);
@@ -2946,19 +2934,19 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	error = chcr_aead_common_init(req);
 	if (error)
 		return ERR_PTR(error);
-	dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0);
-	dnents += sg_nents_xlen(req->dst, req->cryptlen +
+	dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen +
 				(reqctx->op ? -authsize : authsize),
-				CHCR_DST_SG_SIZE, req->assoclen);
+				CHCR_DST_SG_SIZE, 0);
+	snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen,
+			       CHCR_SRC_SG_SIZE, 0);
 	dnents += MIN_GCM_SG; // For IV
 	dst_size = get_space_for_phys_dsgl(dnents);
 	kctx_len = roundup(aeadctx->enckey_len, 16) + AEAD_H_SIZE;
 	transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size);
-	reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen) <=
+	reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen) <=
 			SGE_MAX_WR_LEN;
-	temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen, 16) :
-		(sgl_len(reqctx->src_nents +
-		reqctx->aad_nents + MIN_GCM_SG) * 8);
+	temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen, 16) :
+		(sgl_len(snents) * 8);
 	transhdr_len += temp;
 	transhdr_len = roundup(transhdr_len, 16);
 	if (chcr_aead_need_fallback(req, dnents, T6_MAX_AAD_SIZE,
@@ -2968,7 +2956,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 		chcr_aead_common_exit(req);
 		return ERR_PTR(chcr_aead_fallback(req, reqctx->op));
 	}
-	skb = alloc_skb(SGE_MAX_WR_LEN, flags);
+	skb = alloc_skb(transhdr_len, flags);
 	if (!skb) {
 		error = -ENOMEM;
 		goto err;
@@ -2979,15 +2967,15 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	//Offset of tag from end
 	temp = (reqctx->op == CHCR_ENCRYPT_OP) ? 0 : authsize;
 	chcr_req->sec_cpl.op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(
-					a_ctx(tfm)->dev->rx_channel_id, 2,
-					(assoclen + 1));
+					a_ctx(tfm)->dev->rx_channel_id, 2, 1);
 	chcr_req->sec_cpl.pldlen =
-		htonl(assoclen + IV + req->cryptlen);
+		htonl(req->assoclen + IV + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-					assoclen ? 1 : 0, assoclen,
-					assoclen + IV + 1, 0);
+					assoclen ? 1 + IV : 0,
+					assoclen ? IV + assoclen : 0,
+					req->assoclen + IV + 1, 0);
 	chcr_req->sec_cpl.cipherstop_lo_authinsert =
-			FILL_SEC_CPL_AUTHINSERT(0, assoclen + IV + 1,
+			FILL_SEC_CPL_AUTHINSERT(0, req->assoclen + IV + 1,
 						temp, temp);
 	chcr_req->sec_cpl.seqno_numivs =
 			FILL_SEC_CPL_SCMD0_SEQNO(reqctx->op, (reqctx->op ==
@@ -3002,25 +2990,26 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	memcpy(chcr_req->key_ctx.key + roundup(aeadctx->enckey_len, 16),
 	       GCM_CTX(aeadctx)->ghash_h, AEAD_H_SIZE);
 
+	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
+	ivptr = (u8 *)(phys_cpl + 1) + dst_size;
 	/* prepare a 16 byte iv */
 	/* S   A   L  T |  IV | 0x00000001 */
 	if (get_aead_subtype(tfm) ==
 	    CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) {
-		memcpy(reqctx->iv, aeadctx->salt, 4);
-		memcpy(reqctx->iv + 4, req->iv, GCM_RFC4106_IV_SIZE);
+		memcpy(ivptr, aeadctx->salt, 4);
+		memcpy(ivptr + 4, req->iv, GCM_RFC4106_IV_SIZE);
 	} else {
-		memcpy(reqctx->iv, req->iv, GCM_AES_IV_SIZE);
+		memcpy(ivptr, req->iv, GCM_AES_IV_SIZE);
 	}
-	*((unsigned int *)(reqctx->iv + 12)) = htonl(0x01);
+	*((unsigned int *)(ivptr + 12)) = htonl(0x01);
 
-	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
-	ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size);
+	ulptx = (struct ulptx_sgl *)(ivptr + 16);
 
-	chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid);
-	chcr_add_aead_src_ent(req, ulptx, assoclen);
+	chcr_add_aead_dst_ent(req, phys_cpl, qid);
+	chcr_add_aead_src_ent(req, ulptx);
 	atomic_inc(&adap->chcr_stats.aead_rqst);
-	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size +
-		kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen) : 0);
+	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV +
+		kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen) : 0);
 	create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, size,
 		    transhdr_len, temp, reqctx->verify);
 	reqctx->skb = skb;
@@ -4178,7 +4167,6 @@ static struct chcr_alg_template driver_algs[] = {
 			.setauthsize = chcr_authenc_null_setauthsize,
 		}
 	},
-
 };
 
 /*
diff --git a/drivers/crypto/chelsio/chcr_algo.h b/drivers/crypto/chelsio/chcr_algo.h
index 1871500309e2..ee20dd899e83 100644
--- a/drivers/crypto/chelsio/chcr_algo.h
+++ b/drivers/crypto/chelsio/chcr_algo.h
@@ -262,7 +262,7 @@
 #define MIN_AUTH_SG			1 /* IV */
 #define MIN_GCM_SG			1 /* IV */
 #define MIN_DIGEST_SG			1 /*Partial Buffer*/
-#define MIN_CCM_SG			2 /*IV+B0*/
+#define MIN_CCM_SG			1 /*IV+B0*/
 #define CIP_SPACE_LEFT(len) \
 	((SGE_MAX_WR_LEN - CIP_WR_MIN_LEN - (len)))
 #define HASH_SPACE_LEFT(len) \
diff --git a/drivers/crypto/chelsio/chcr_crypto.h b/drivers/crypto/chelsio/chcr_crypto.h
index d37ef41f9ebe..655606f2e4d0 100644
--- a/drivers/crypto/chelsio/chcr_crypto.h
+++ b/drivers/crypto/chelsio/chcr_crypto.h
@@ -41,7 +41,8 @@
 
 #define CCM_B0_SIZE             16
 #define CCM_AAD_FIELD_SIZE      2
-#define T6_MAX_AAD_SIZE 511
+// 511 - 16(For IV)
+#define T6_MAX_AAD_SIZE 495
 
 
 /* Define following if h/w is not dropping the AAD and IV data before
@@ -185,9 +186,6 @@ struct chcr_aead_reqctx {
 	dma_addr_t b0_dma;
 	unsigned int b0_len;
 	unsigned int op;
-	short int aad_nents;
-	short int src_nents;
-	short int dst_nents;
 	u16 imm;
 	u16 verify;
 	u8 iv[CHCR_MAX_CRYPTO_IV_LEN + MAX_SCRATCH_PAD_SIZE];
@@ -322,10 +320,8 @@ void chcr_aead_dma_unmap(struct device *dev, struct aead_request *req,
 			 unsigned short op_type);
 void chcr_add_aead_dst_ent(struct aead_request *req,
 			   struct cpl_rx_phys_dsgl *phys_cpl,
-			   unsigned int assoclen,
 			   unsigned short qid);
-void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx,
-			   unsigned int assoclen);
+void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx);
 void chcr_add_cipher_src_ent(struct ablkcipher_request *req,
 			     void *ulptx,
 			     struct  cipher_wr_param *wrparam);
-- 
cgit v1.2.3-59-g8ed1b


From d5a4dfbdaf54cbd845755a5415cff57688bb983c Mon Sep 17 00:00:00 2001
From: Harsh Jain <harsh@chelsio.com>
Date: Tue, 11 Dec 2018 16:21:38 +0530
Subject: crypto: chelsio - Use same value for both channel in single WR

Use tx_channel_id instead of rx_channel_id.

Signed-off-by: Harsh Jain <harsh@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_algo.c | 13 ++++++-------
 drivers/crypto/chelsio/chcr_core.h |  1 -
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index 2b26735fd122..ee985ad69039 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -717,7 +717,7 @@ static inline void create_wreq(struct chcr_context *ctx,
 		htonl(FW_CRYPTO_LOOKASIDE_WR_LEN16_V(DIV_ROUND_UP(len16, 16)));
 	chcr_req->wreq.cookie = cpu_to_be64((uintptr_t)req);
 	chcr_req->wreq.rx_chid_to_rx_q_id =
-		FILL_WR_RX_Q_ID(ctx->dev->rx_channel_id, qid,
+		FILL_WR_RX_Q_ID(ctx->tx_chan_id, qid,
 				!!lcb, ctx->tx_qidx);
 
 	chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->tx_chan_id,
@@ -773,7 +773,7 @@ static struct sk_buff *create_cipher_wr(struct cipher_wr_param *wrparam)
 	}
 	chcr_req = __skb_put_zero(skb, transhdr_len);
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(c_ctx(tfm)->dev->rx_channel_id, 2, 1);
+		FILL_SEC_CPL_OP_IVINSR(c_ctx(tfm)->tx_chan_id, 2, 1);
 
 	chcr_req->sec_cpl.pldlen = htonl(IV + wrparam->bytes);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi =
@@ -1344,7 +1344,6 @@ static int chcr_device_init(struct chcr_context *ctx)
 		spin_lock(&ctx->dev->lock_chcr_dev);
 		ctx->tx_chan_id = ctx->dev->tx_channel_id;
 		ctx->dev->tx_channel_id = !ctx->dev->tx_channel_id;
-		ctx->dev->rx_channel_id = 0;
 		spin_unlock(&ctx->dev->lock_chcr_dev);
 		rxq_idx = ctx->tx_chan_id * rxq_perchan;
 		rxq_idx += id % rxq_perchan;
@@ -1498,7 +1497,7 @@ static struct sk_buff *create_hash_wr(struct ahash_request *req,
 	chcr_req = __skb_put_zero(skb, transhdr_len);
 
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(h_ctx(tfm)->dev->rx_channel_id, 2, 0);
+		FILL_SEC_CPL_OP_IVINSR(h_ctx(tfm)->tx_chan_id, 2, 0);
 	chcr_req->sec_cpl.pldlen = htonl(param->bfr_len + param->sg_len);
 
 	chcr_req->sec_cpl.aadstart_cipherstop_hi =
@@ -2324,7 +2323,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	 * to the hardware spec
 	 */
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2, 1);
+		FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->tx_chan_id, 2, 1);
 	chcr_req->sec_cpl.pldlen = htonl(req->assoclen + IV + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
 					null ? 0 : 1 + IV,
@@ -2746,7 +2745,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 	struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm));
 	unsigned int cipher_mode = CHCR_SCMD_CIPHER_MODE_AES_CCM;
 	unsigned int mac_mode = CHCR_SCMD_AUTH_MODE_CBCMAC;
-	unsigned int c_id = a_ctx(tfm)->dev->rx_channel_id;
+	unsigned int c_id = a_ctx(tfm)->tx_chan_id;
 	unsigned int ccm_xtra;
 	unsigned char tag_offset = 0, auth_offset = 0;
 	unsigned int assoclen;
@@ -2967,7 +2966,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	//Offset of tag from end
 	temp = (reqctx->op == CHCR_ENCRYPT_OP) ? 0 : authsize;
 	chcr_req->sec_cpl.op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(
-					a_ctx(tfm)->dev->rx_channel_id, 2, 1);
+					a_ctx(tfm)->tx_chan_id, 2, 1);
 	chcr_req->sec_cpl.pldlen =
 		htonl(req->assoclen + IV + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h
index 4616663e25d7..a50a24f39607 100644
--- a/drivers/crypto/chelsio/chcr_core.h
+++ b/drivers/crypto/chelsio/chcr_core.h
@@ -133,7 +133,6 @@ struct chcr_dev {
 	spinlock_t lock_chcr_dev;
 	struct uld_ctx *u_ctx;
 	unsigned char tx_channel_id;
-	unsigned char rx_channel_id;
 };
 
 struct uld_ctx {
-- 
cgit v1.2.3-59-g8ed1b


From c4f6d44d774eff382b6fc79a9fe1ff376b5ac6d7 Mon Sep 17 00:00:00 2001
From: Harsh Jain <harsh@chelsio.com>
Date: Tue, 11 Dec 2018 16:21:39 +0530
Subject: crypto: chelsio - cleanup:send addr as value in function argument

Send dma address as value to function arguments instead of pointer.

Signed-off-by: Harsh Jain <harsh@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_algo.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index ee985ad69039..f94364b097dc 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -391,7 +391,7 @@ static inline void dsgl_walk_end(struct dsgl_walk *walk, unsigned short qid,
 
 static inline void dsgl_walk_add_page(struct dsgl_walk *walk,
 					size_t size,
-					dma_addr_t *addr)
+					dma_addr_t addr)
 {
 	int j;
 
@@ -399,7 +399,7 @@ static inline void dsgl_walk_add_page(struct dsgl_walk *walk,
 		return;
 	j = walk->nents;
 	walk->to->len[j % 8] = htons(size);
-	walk->to->addr[j % 8] = cpu_to_be64(*addr);
+	walk->to->addr[j % 8] = cpu_to_be64(addr);
 	j++;
 	if ((j % 8) == 0)
 		walk->to++;
@@ -473,16 +473,16 @@ static inline void ulptx_walk_end(struct ulptx_walk *walk)
 
 static inline void ulptx_walk_add_page(struct ulptx_walk *walk,
 					size_t size,
-					dma_addr_t *addr)
+					dma_addr_t addr)
 {
 	if (!size)
 		return;
 
 	if (walk->nents == 0) {
 		walk->sgl->len0 = cpu_to_be32(size);
-		walk->sgl->addr0 = cpu_to_be64(*addr);
+		walk->sgl->addr0 = cpu_to_be64(addr);
 	} else {
-		walk->pair->addr[walk->pair_idx] = cpu_to_be64(*addr);
+		walk->pair->addr[walk->pair_idx] = cpu_to_be64(addr);
 		walk->pair->len[walk->pair_idx] = cpu_to_be32(size);
 		walk->pair_idx = !walk->pair_idx;
 		if (!walk->pair_idx)
@@ -2481,7 +2481,7 @@ void chcr_add_aead_src_ent(struct aead_request *req,
 		ulptx_walk_init(&ulp_walk, ulptx);
 		if (reqctx->b0_len)
 			ulptx_walk_add_page(&ulp_walk, reqctx->b0_len,
-					    &reqctx->b0_dma);
+					    reqctx->b0_dma);
 		ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen +
 				  req->assoclen,  0);
 		ulptx_walk_end(&ulp_walk);
@@ -2500,7 +2500,7 @@ void chcr_add_aead_dst_ent(struct aead_request *req,
 	u32 temp;
 
 	dsgl_walk_init(&dsgl_walk, phys_cpl);
-	dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, &reqctx->iv_dma);
+	dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, reqctx->iv_dma);
 	temp = req->assoclen + req->cryptlen +
 		(reqctx->op ? -authsize : authsize);
 	dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, 0);
@@ -2571,7 +2571,7 @@ void chcr_add_hash_src_ent(struct ahash_request *req,
 		ulptx_walk_init(&ulp_walk, ulptx);
 		if (param->bfr_len)
 			ulptx_walk_add_page(&ulp_walk, param->bfr_len,
-					    &reqctx->hctx_wr.dma_addr);
+					    reqctx->hctx_wr.dma_addr);
 		ulptx_walk_add_sg(&ulp_walk, reqctx->hctx_wr.srcsg,
 				  param->sg_len, reqctx->hctx_wr.src_ofst);
 		reqctx->hctx_wr.srcsg = ulp_walk.last_sg;
-- 
cgit v1.2.3-59-g8ed1b


From fef4912b66d6246d958d97382d20d0dd23bcf0bc Mon Sep 17 00:00:00 2001
From: Harsh Jain <harsh@chelsio.com>
Date: Tue, 11 Dec 2018 16:21:40 +0530
Subject: crypto: chelsio - Handle PCI shutdown event

chcr receives "CXGB4_STATE_DETACH" event on PCI Shutdown.
Wait for processing of inflight request and Mark the device unavailable.

Signed-off-by: Harsh Jain <harsh@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_algo.c | 157 ++++++++++++++++++++++++++------
 drivers/crypto/chelsio/chcr_core.c | 180 ++++++++++++++++++++++++-------------
 drivers/crypto/chelsio/chcr_core.h |  34 +++++--
 3 files changed, 278 insertions(+), 93 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index f94364b097dc..df526414f03f 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -123,7 +123,7 @@ static inline struct chcr_authenc_ctx *AUTHENC_CTX(struct chcr_aead_ctx *gctx)
 
 static inline struct uld_ctx *ULD_CTX(struct chcr_context *ctx)
 {
-	return ctx->dev->u_ctx;
+	return container_of(ctx->dev, struct uld_ctx, dev);
 }
 
 static inline int is_ofld_imm(const struct sk_buff *skb)
@@ -198,17 +198,40 @@ void chcr_verify_tag(struct aead_request *req, u8 *input, int *err)
 		*err = 0;
 }
 
+static int chcr_inc_wrcount(struct chcr_dev *dev)
+{
+	int err = 0;
+
+	spin_lock_bh(&dev->lock_chcr_dev);
+	if (dev->state == CHCR_DETACH)
+		err = 1;
+	else
+		atomic_inc(&dev->inflight);
+
+	spin_unlock_bh(&dev->lock_chcr_dev);
+
+	return err;
+}
+
+static inline void chcr_dec_wrcount(struct chcr_dev *dev)
+{
+	atomic_dec(&dev->inflight);
+}
+
 static inline void chcr_handle_aead_resp(struct aead_request *req,
 					 unsigned char *input,
 					 int err)
 {
 	struct chcr_aead_reqctx *reqctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct chcr_dev *dev = a_ctx(tfm)->dev;
 
 	chcr_aead_common_exit(req);
 	if (reqctx->verify == VERIFY_SW) {
 		chcr_verify_tag(req, input, &err);
 		reqctx->verify = VERIFY_HW;
 	}
+	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
 }
 
@@ -1100,6 +1123,7 @@ static int chcr_handle_cipher_resp(struct ablkcipher_request *req,
 	struct cpl_fw6_pld *fw6_pld = (struct cpl_fw6_pld *)input;
 	struct chcr_blkcipher_req_ctx *reqctx = ablkcipher_request_ctx(req);
 	struct  cipher_wr_param wrparam;
+	struct chcr_dev *dev = c_ctx(tfm)->dev;
 	int bytes;
 
 	if (err)
@@ -1161,6 +1185,7 @@ static int chcr_handle_cipher_resp(struct ablkcipher_request *req,
 unmap:
 	chcr_cipher_dma_unmap(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req);
 complete:
+	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
 	return err;
 }
@@ -1187,7 +1212,10 @@ static int process_cipher(struct ablkcipher_request *req,
 		       ablkctx->enckey_len, req->nbytes, ivsize);
 		goto error;
 	}
-	chcr_cipher_dma_map(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req);
+
+	err = chcr_cipher_dma_map(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req);
+	if (err)
+		goto error;
 	if (req->nbytes < (SGE_MAX_WR_LEN - (sizeof(struct chcr_wr) +
 					    AES_MIN_KEY_SIZE +
 					    sizeof(struct cpl_rx_phys_dsgl) +
@@ -1276,15 +1304,21 @@ error:
 static int chcr_aes_encrypt(struct ablkcipher_request *req)
 {
 	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct chcr_dev *dev = c_ctx(tfm)->dev;
 	struct sk_buff *skb = NULL;
 	int err, isfull = 0;
 	struct uld_ctx *u_ctx = ULD_CTX(c_ctx(tfm));
 
+	err = chcr_inc_wrcount(dev);
+	if (err)
+		return -ENXIO;
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    c_ctx(tfm)->tx_qidx))) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			err = -ENOSPC;
+			goto error;
+		}
 	}
 
 	err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx],
@@ -1295,15 +1329,23 @@ static int chcr_aes_encrypt(struct ablkcipher_request *req)
 	set_wr_txq(skb, CPL_PRIORITY_DATA, c_ctx(tfm)->tx_qidx);
 	chcr_send_wr(skb);
 	return isfull ? -EBUSY : -EINPROGRESS;
+error:
+	chcr_dec_wrcount(dev);
+	return err;
 }
 
 static int chcr_aes_decrypt(struct ablkcipher_request *req)
 {
 	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
 	struct uld_ctx *u_ctx = ULD_CTX(c_ctx(tfm));
+	struct chcr_dev *dev = c_ctx(tfm)->dev;
 	struct sk_buff *skb = NULL;
 	int err, isfull = 0;
 
+	err = chcr_inc_wrcount(dev);
+	if (err)
+		return -ENXIO;
+
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    c_ctx(tfm)->tx_qidx))) {
 		isfull = 1;
@@ -1333,10 +1375,11 @@ static int chcr_device_init(struct chcr_context *ctx)
 	if (!ctx->dev) {
 		u_ctx = assign_chcr_device();
 		if (!u_ctx) {
+			err = -ENXIO;
 			pr_err("chcr device assignment fails\n");
 			goto out;
 		}
-		ctx->dev = u_ctx->dev;
+		ctx->dev = &u_ctx->dev;
 		adap = padap(ctx->dev);
 		ntxq = u_ctx->lldi.ntxq;
 		rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan;
@@ -1561,6 +1604,7 @@ static int chcr_ahash_update(struct ahash_request *req)
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
 	struct uld_ctx *u_ctx = NULL;
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct sk_buff *skb;
 	u8 remainder = 0, bs;
 	unsigned int nbytes = req->nbytes;
@@ -1569,12 +1613,6 @@ static int chcr_ahash_update(struct ahash_request *req)
 
 	bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
 	u_ctx = ULD_CTX(h_ctx(rtfm));
-	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
-					    h_ctx(rtfm)->tx_qidx))) {
-		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
-	}
 
 	if (nbytes + req_ctx->reqlen >= bs) {
 		remainder = (nbytes + req_ctx->reqlen) % bs;
@@ -1585,10 +1623,27 @@ static int chcr_ahash_update(struct ahash_request *req)
 		req_ctx->reqlen += nbytes;
 		return 0;
 	}
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
+	/* Detach state for CHCR means lldi or padap is freed. Increasing
+	 * inflight count for dev guarantees that lldi and padap is valid
+	 */
+	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
+					    h_ctx(rtfm)->tx_qidx))) {
+		isfull = 1;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			error = -ENOSPC;
+			goto err;
+		}
+	}
+
 	chcr_init_hctx_per_wr(req_ctx);
 	error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req);
-	if (error)
-		return -ENOMEM;
+	if (error) {
+		error = -ENOMEM;
+		goto err;
+	}
 	get_alg_config(&params.alg_prm, crypto_ahash_digestsize(rtfm));
 	params.kctx_len = roundup(params.alg_prm.result_size, 16);
 	params.sg_len = chcr_hash_ent_in_wr(req->src, !!req_ctx->reqlen,
@@ -1628,6 +1683,8 @@ static int chcr_ahash_update(struct ahash_request *req)
 	return isfull ? -EBUSY : -EINPROGRESS;
 unmap:
 	chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req);
+err:
+	chcr_dec_wrcount(dev);
 	return error;
 }
 
@@ -1645,10 +1702,16 @@ static int chcr_ahash_final(struct ahash_request *req)
 {
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct hash_wr_param params;
 	struct sk_buff *skb;
 	struct uld_ctx *u_ctx = NULL;
 	u8 bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
+	int error = -EINVAL;
+
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
 
 	chcr_init_hctx_per_wr(req_ctx);
 	u_ctx = ULD_CTX(h_ctx(rtfm));
@@ -1685,19 +1748,25 @@ static int chcr_ahash_final(struct ahash_request *req)
 	}
 	params.hash_size = crypto_ahash_digestsize(rtfm);
 	skb = create_hash_wr(req, &params);
-	if (IS_ERR(skb))
-		return PTR_ERR(skb);
+	if (IS_ERR(skb)) {
+		error = PTR_ERR(skb);
+		goto err;
+	}
 	req_ctx->reqlen = 0;
 	skb->dev = u_ctx->lldi.ports[0];
 	set_wr_txq(skb, CPL_PRIORITY_DATA, h_ctx(rtfm)->tx_qidx);
 	chcr_send_wr(skb);
 	return -EINPROGRESS;
+err:
+	chcr_dec_wrcount(dev);
+	return error;
 }
 
 static int chcr_ahash_finup(struct ahash_request *req)
 {
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct uld_ctx *u_ctx = NULL;
 	struct sk_buff *skb;
 	struct hash_wr_param params;
@@ -1706,17 +1775,24 @@ static int chcr_ahash_finup(struct ahash_request *req)
 
 	bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
 	u_ctx = ULD_CTX(h_ctx(rtfm));
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
 
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    h_ctx(rtfm)->tx_qidx))) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			error = -ENOSPC;
+			goto err;
+		}
 	}
 	chcr_init_hctx_per_wr(req_ctx);
 	error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req);
-	if (error)
-		return -ENOMEM;
+	if (error) {
+		error = -ENOMEM;
+		goto err;
+	}
 
 	get_alg_config(&params.alg_prm, crypto_ahash_digestsize(rtfm));
 	params.kctx_len = roundup(params.alg_prm.result_size, 16);
@@ -1773,6 +1849,8 @@ static int chcr_ahash_finup(struct ahash_request *req)
 	return isfull ? -EBUSY : -EINPROGRESS;
 unmap:
 	chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req);
+err:
+	chcr_dec_wrcount(dev);
 	return error;
 }
 
@@ -1780,6 +1858,7 @@ static int chcr_ahash_digest(struct ahash_request *req)
 {
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct uld_ctx *u_ctx = NULL;
 	struct sk_buff *skb;
 	struct hash_wr_param params;
@@ -1788,19 +1867,26 @@ static int chcr_ahash_digest(struct ahash_request *req)
 
 	rtfm->init(req);
 	bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
 
 	u_ctx = ULD_CTX(h_ctx(rtfm));
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    h_ctx(rtfm)->tx_qidx))) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			error = -ENOSPC;
+			goto err;
+		}
 	}
 
 	chcr_init_hctx_per_wr(req_ctx);
 	error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req);
-	if (error)
-		return -ENOMEM;
+	if (error) {
+		error = -ENOMEM;
+		goto err;
+	}
 
 	get_alg_config(&params.alg_prm, crypto_ahash_digestsize(rtfm));
 	params.kctx_len = roundup(params.alg_prm.result_size, 16);
@@ -1853,6 +1939,8 @@ static int chcr_ahash_digest(struct ahash_request *req)
 	return isfull ? -EBUSY : -EINPROGRESS;
 unmap:
 	chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req);
+err:
+	chcr_dec_wrcount(dev);
 	return error;
 }
 
@@ -1924,6 +2012,7 @@ static inline void chcr_handle_ahash_resp(struct ahash_request *req,
 	int digestsize, updated_digestsize;
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 	struct uld_ctx *u_ctx = ULD_CTX(h_ctx(tfm));
+	struct chcr_dev *dev = h_ctx(tfm)->dev;
 
 	if (input == NULL)
 		goto out;
@@ -1966,6 +2055,7 @@ unmap:
 
 
 out:
+	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
 }
 
@@ -3553,27 +3643,42 @@ static int chcr_aead_op(struct aead_request *req,
 			create_wr_t create_wr_fn)
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct chcr_aead_reqctx  *reqctx = aead_request_ctx(req);
 	struct uld_ctx *u_ctx;
 	struct sk_buff *skb;
 	int isfull = 0;
+	struct chcr_dev *cdev;
 
-	if (!a_ctx(tfm)->dev) {
+	cdev = a_ctx(tfm)->dev;
+	if (!cdev) {
 		pr_err("chcr : %s : No crypto device.\n", __func__);
 		return -ENXIO;
 	}
+
+	if (chcr_inc_wrcount(cdev)) {
+	/* Detach state for CHCR means lldi or padap is freed.
+	 * We cannot increment fallback here.
+	 */
+		return chcr_aead_fallback(req, reqctx->op);
+	}
+
 	u_ctx = ULD_CTX(a_ctx(tfm));
 	if (cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 				   a_ctx(tfm)->tx_qidx)) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			chcr_dec_wrcount(cdev);
 			return -ENOSPC;
+		}
 	}
 
 	/* Form a WR from req */
 	skb = create_wr_fn(req, u_ctx->lldi.rxq_ids[a_ctx(tfm)->rx_qidx], size);
 
-	if (IS_ERR(skb) || !skb)
+	if (IS_ERR(skb) || !skb) {
+		chcr_dec_wrcount(cdev);
 		return PTR_ERR(skb);
+	}
 
 	skb->dev = u_ctx->lldi.ports[0];
 	set_wr_txq(skb, CPL_PRIORITY_DATA, a_ctx(tfm)->tx_qidx);
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index 2c472e3c6aeb..f71a97939418 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -26,10 +26,7 @@
 #include "chcr_core.h"
 #include "cxgb4_uld.h"
 
-static LIST_HEAD(uld_ctx_list);
-static DEFINE_MUTEX(dev_mutex);
-static atomic_t dev_count;
-static struct uld_ctx *ctx_rr;
+static struct chcr_driver_data drv_data;
 
 typedef int (*chcr_handler_func)(struct chcr_dev *dev, unsigned char *input);
 static int cpl_fw6_pld_handler(struct chcr_dev *dev, unsigned char *input);
@@ -53,6 +50,29 @@ static struct cxgb4_uld_info chcr_uld_info = {
 #endif /* CONFIG_CHELSIO_IPSEC_INLINE */
 };
 
+static void detach_work_fn(struct work_struct *work)
+{
+	struct chcr_dev *dev;
+
+	dev = container_of(work, struct chcr_dev, detach_work.work);
+
+	if (atomic_read(&dev->inflight)) {
+		dev->wqretry--;
+		if (dev->wqretry) {
+			pr_debug("Request Inflight Count %d\n",
+				atomic_read(&dev->inflight));
+
+			schedule_delayed_work(&dev->detach_work, WQ_DETACH_TM);
+		} else {
+			WARN(1, "CHCR:%d request Still Pending\n",
+				atomic_read(&dev->inflight));
+			complete(&dev->detach_comp);
+		}
+	} else {
+		complete(&dev->detach_comp);
+	}
+}
+
 struct uld_ctx *assign_chcr_device(void)
 {
 	struct uld_ctx *u_ctx = NULL;
@@ -63,56 +83,70 @@ struct uld_ctx *assign_chcr_device(void)
 	 * Although One session must use the same device to
 	 * maintain request-response ordering.
 	 */
-	mutex_lock(&dev_mutex);
-	if (!list_empty(&uld_ctx_list)) {
-		u_ctx = ctx_rr;
-		if (list_is_last(&ctx_rr->entry, &uld_ctx_list))
-			ctx_rr = list_first_entry(&uld_ctx_list,
-						  struct uld_ctx,
-						  entry);
+	mutex_lock(&drv_data.drv_mutex);
+	if (!list_empty(&drv_data.act_dev)) {
+		u_ctx = drv_data.last_dev;
+		if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev))
+			drv_data.last_dev = list_first_entry(&drv_data.act_dev,
+						  struct uld_ctx, entry);
 		else
-			ctx_rr = list_next_entry(ctx_rr, entry);
+			drv_data.last_dev =
+				list_next_entry(drv_data.last_dev, entry);
 	}
-	mutex_unlock(&dev_mutex);
+	mutex_unlock(&drv_data.drv_mutex);
 	return u_ctx;
 }
 
-static int chcr_dev_add(struct uld_ctx *u_ctx)
+static void chcr_dev_add(struct uld_ctx *u_ctx)
 {
 	struct chcr_dev *dev;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return -ENXIO;
+	dev = &u_ctx->dev;
+	dev->state = CHCR_ATTACH;
+	atomic_set(&dev->inflight, 0);
+	mutex_lock(&drv_data.drv_mutex);
+	list_move(&u_ctx->entry, &drv_data.act_dev);
+	if (!drv_data.last_dev)
+		drv_data.last_dev = u_ctx;
+	mutex_unlock(&drv_data.drv_mutex);
+}
 
+static void chcr_dev_init(struct uld_ctx *u_ctx)
+{
+	struct chcr_dev *dev;
+
+	dev = &u_ctx->dev;
 	spin_lock_init(&dev->lock_chcr_dev);
-	u_ctx->dev = dev;
-	dev->u_ctx = u_ctx;
-	atomic_inc(&dev_count);
-	mutex_lock(&dev_mutex);
-	list_add_tail(&u_ctx->entry, &uld_ctx_list);
-	if (!ctx_rr)
-		ctx_rr = u_ctx;
-	mutex_unlock(&dev_mutex);
-	return 0;
+	INIT_DELAYED_WORK(&dev->detach_work, detach_work_fn);
+	init_completion(&dev->detach_comp);
+	dev->state = CHCR_INIT;
+	dev->wqretry = WQ_RETRY;
+	atomic_inc(&drv_data.dev_count);
+	atomic_set(&dev->inflight, 0);
+	mutex_lock(&drv_data.drv_mutex);
+	list_add_tail(&u_ctx->entry, &drv_data.inact_dev);
+	if (!drv_data.last_dev)
+		drv_data.last_dev = u_ctx;
+	mutex_unlock(&drv_data.drv_mutex);
 }
 
-static int chcr_dev_remove(struct uld_ctx *u_ctx)
+static int chcr_dev_move(struct uld_ctx *u_ctx)
 {
-	if (ctx_rr == u_ctx) {
-		if (list_is_last(&ctx_rr->entry, &uld_ctx_list))
-			ctx_rr = list_first_entry(&uld_ctx_list,
-						  struct uld_ctx,
-						  entry);
+	 mutex_lock(&drv_data.drv_mutex);
+	if (drv_data.last_dev == u_ctx) {
+		if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev))
+			drv_data.last_dev = list_first_entry(&drv_data.act_dev,
+						  struct uld_ctx, entry);
 		else
-			ctx_rr = list_next_entry(ctx_rr, entry);
+			drv_data.last_dev =
+				list_next_entry(drv_data.last_dev, entry);
 	}
-	list_del(&u_ctx->entry);
-	if (list_empty(&uld_ctx_list))
-		ctx_rr = NULL;
-	kfree(u_ctx->dev);
-	u_ctx->dev = NULL;
-	atomic_dec(&dev_count);
+	list_move(&u_ctx->entry, &drv_data.inact_dev);
+	if (list_empty(&drv_data.act_dev))
+		drv_data.last_dev = NULL;
+	atomic_dec(&drv_data.dev_count);
+	mutex_unlock(&drv_data.drv_mutex);
+
 	return 0;
 }
 
@@ -167,6 +201,7 @@ static void *chcr_uld_add(const struct cxgb4_lld_info *lld)
 		goto out;
 	}
 	u_ctx->lldi = *lld;
+	chcr_dev_init(u_ctx);
 #ifdef CONFIG_CHELSIO_IPSEC_INLINE
 	if (lld->crypto & ULP_CRYPTO_IPSEC_INLINE)
 		chcr_add_xfrmops(lld);
@@ -179,7 +214,7 @@ int chcr_uld_rx_handler(void *handle, const __be64 *rsp,
 			const struct pkt_gl *pgl)
 {
 	struct uld_ctx *u_ctx = (struct uld_ctx *)handle;
-	struct chcr_dev *dev = u_ctx->dev;
+	struct chcr_dev *dev = &u_ctx->dev;
 	const struct cpl_fw6_pld *rpl = (struct cpl_fw6_pld *)rsp;
 
 	if (rpl->opcode != CPL_FW6_PLD) {
@@ -201,6 +236,28 @@ int chcr_uld_tx_handler(struct sk_buff *skb, struct net_device *dev)
 }
 #endif /* CONFIG_CHELSIO_IPSEC_INLINE */
 
+static void chcr_detach_device(struct uld_ctx *u_ctx)
+{
+	struct chcr_dev *dev = &u_ctx->dev;
+
+	spin_lock_bh(&dev->lock_chcr_dev);
+	if (dev->state == CHCR_DETACH) {
+		spin_unlock_bh(&dev->lock_chcr_dev);
+		pr_debug("Detached Event received for already detach device\n");
+		return;
+	}
+	dev->state = CHCR_DETACH;
+	spin_unlock_bh(&dev->lock_chcr_dev);
+
+	if (atomic_read(&dev->inflight) != 0) {
+		schedule_delayed_work(&dev->detach_work, WQ_DETACH_TM);
+		wait_for_completion(&dev->detach_comp);
+	}
+
+	// Move u_ctx to inactive_dev list
+	chcr_dev_move(u_ctx);
+}
+
 static int chcr_uld_state_change(void *handle, enum cxgb4_state state)
 {
 	struct uld_ctx *u_ctx = handle;
@@ -208,23 +265,16 @@ static int chcr_uld_state_change(void *handle, enum cxgb4_state state)
 
 	switch (state) {
 	case CXGB4_STATE_UP:
-		if (!u_ctx->dev) {
-			ret = chcr_dev_add(u_ctx);
-			if (ret != 0)
-				return ret;
+		if (u_ctx->dev.state != CHCR_INIT) {
+			// ALready Initialised.
+			return 0;
 		}
-		if (atomic_read(&dev_count) == 1)
-			ret = start_crypto();
+		chcr_dev_add(u_ctx);
+		ret = start_crypto();
 		break;
 
 	case CXGB4_STATE_DETACH:
-		if (u_ctx->dev) {
-			mutex_lock(&dev_mutex);
-			chcr_dev_remove(u_ctx);
-			mutex_unlock(&dev_mutex);
-		}
-		if (!atomic_read(&dev_count))
-			stop_crypto();
+		chcr_detach_device(u_ctx);
 		break;
 
 	case CXGB4_STATE_START_RECOVERY:
@@ -237,7 +287,13 @@ static int chcr_uld_state_change(void *handle, enum cxgb4_state state)
 
 static int __init chcr_crypto_init(void)
 {
+	INIT_LIST_HEAD(&drv_data.act_dev);
+	INIT_LIST_HEAD(&drv_data.inact_dev);
+	atomic_set(&drv_data.dev_count, 0);
+	mutex_init(&drv_data.drv_mutex);
+	drv_data.last_dev = NULL;
 	cxgb4_register_uld(CXGB4_ULD_CRYPTO, &chcr_uld_info);
+
 	return 0;
 }
 
@@ -245,18 +301,20 @@ static void __exit chcr_crypto_exit(void)
 {
 	struct uld_ctx *u_ctx, *tmp;
 
-	if (atomic_read(&dev_count))
-		stop_crypto();
+	stop_crypto();
 
+	cxgb4_unregister_uld(CXGB4_ULD_CRYPTO);
 	/* Remove all devices from list */
-	mutex_lock(&dev_mutex);
-	list_for_each_entry_safe(u_ctx, tmp, &uld_ctx_list, entry) {
-		if (u_ctx->dev)
-			chcr_dev_remove(u_ctx);
+	mutex_lock(&drv_data.drv_mutex);
+	list_for_each_entry_safe(u_ctx, tmp, &drv_data.act_dev, entry) {
+		list_del(&u_ctx->entry);
 		kfree(u_ctx);
 	}
-	mutex_unlock(&dev_mutex);
-	cxgb4_unregister_uld(CXGB4_ULD_CRYPTO);
+	list_for_each_entry_safe(u_ctx, tmp, &drv_data.inact_dev, entry) {
+		list_del(&u_ctx->entry);
+		kfree(u_ctx);
+	}
+	mutex_unlock(&drv_data.drv_mutex);
 }
 
 module_init(chcr_crypto_init);
diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h
index a50a24f39607..1159dee964ed 100644
--- a/drivers/crypto/chelsio/chcr_core.h
+++ b/drivers/crypto/chelsio/chcr_core.h
@@ -47,7 +47,7 @@
 
 #define MAX_PENDING_REQ_TO_HW 20
 #define CHCR_TEST_RESPONSE_TIMEOUT 1000
-
+#define WQ_DETACH_TM	(msecs_to_jiffies(50))
 #define PAD_ERROR_BIT		1
 #define CHK_PAD_ERR_BIT(x)	(((x) >> PAD_ERROR_BIT) & 1)
 
@@ -61,9 +61,6 @@
 #define HASH_WR_MIN_LEN (sizeof(struct chcr_wr) + \
 			DUMMY_BYTES + \
 		    sizeof(struct ulptx_sgl))
-
-#define padap(dev) pci_get_drvdata(dev->u_ctx->lldi.pdev)
-
 struct uld_ctx;
 
 struct _key_ctx {
@@ -121,6 +118,20 @@ struct _key_ctx {
 #define KEYCTX_TX_WR_AUTHIN_G(x) \
 	(((x) >> KEYCTX_TX_WR_AUTHIN_S) & KEYCTX_TX_WR_AUTHIN_M)
 
+#define WQ_RETRY	5
+struct chcr_driver_data {
+	struct list_head act_dev;
+	struct list_head inact_dev;
+	atomic_t dev_count;
+	struct mutex drv_mutex;
+	struct uld_ctx *last_dev;
+};
+
+enum chcr_state {
+	CHCR_INIT = 0,
+	CHCR_ATTACH,
+	CHCR_DETACH,
+};
 struct chcr_wr {
 	struct fw_crypto_lookaside_wr wreq;
 	struct ulp_txpkt ulptx;
@@ -131,14 +142,18 @@ struct chcr_wr {
 
 struct chcr_dev {
 	spinlock_t lock_chcr_dev;
-	struct uld_ctx *u_ctx;
+	enum chcr_state state;
+	atomic_t inflight;
+	int wqretry;
+	struct delayed_work detach_work;
+	struct completion detach_comp;
 	unsigned char tx_channel_id;
 };
 
 struct uld_ctx {
 	struct list_head entry;
 	struct cxgb4_lld_info lldi;
-	struct chcr_dev *dev;
+	struct chcr_dev dev;
 };
 
 struct sge_opaque_hdr {
@@ -189,6 +204,13 @@ static inline unsigned int sgl_len(unsigned int n)
 	return (3 * n) / 2 + (n & 1) + 2;
 }
 
+static inline void *padap(struct chcr_dev *dev)
+{
+	struct uld_ctx *u_ctx = container_of(dev, struct uld_ctx, dev);
+
+	return pci_get_drvdata(u_ctx->lldi.pdev);
+}
+
 struct uld_ctx *assign_chcr_device(void);
 int chcr_send_wr(struct sk_buff *skb);
 int start_crypto(void);
-- 
cgit v1.2.3-59-g8ed1b


From 6501ab5ed4d925cce4c2a1c49b63583c42e65bd8 Mon Sep 17 00:00:00 2001
From: Harsh Jain <harsh@chelsio.com>
Date: Tue, 11 Dec 2018 16:21:41 +0530
Subject: crypto: chelsio - Reset counters on cxgb4 Detach

Reset the counters on receiving detach from Cxgb4.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index f71a97939418..e04b3e8fa623 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -132,6 +132,8 @@ static void chcr_dev_init(struct uld_ctx *u_ctx)
 
 static int chcr_dev_move(struct uld_ctx *u_ctx)
 {
+	struct adapter *adap;
+
 	 mutex_lock(&drv_data.drv_mutex);
 	if (drv_data.last_dev == u_ctx) {
 		if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev))
@@ -144,6 +146,8 @@ static int chcr_dev_move(struct uld_ctx *u_ctx)
 	list_move(&u_ctx->entry, &drv_data.inact_dev);
 	if (list_empty(&drv_data.act_dev))
 		drv_data.last_dev = NULL;
+	adap = padap(&u_ctx->dev);
+	memset(&adap->chcr_stats, 0, sizeof(adap->chcr_stats));
 	atomic_dec(&drv_data.dev_count);
 	mutex_unlock(&drv_data.drv_mutex);
 
-- 
cgit v1.2.3-59-g8ed1b


From f31ba0f95f1998118098978dbfb25ecbec6b0891 Mon Sep 17 00:00:00 2001
From: Harsh Jain <harsh@chelsio.com>
Date: Tue, 11 Dec 2018 16:21:42 +0530
Subject: crypto: chelsio - Fix wrong error counter increments

Fix error counter increment in AEAD decrypt operation when
validation of tag is done in Driver instead of H/W.

Signed-off-by: Harsh Jain <harsh@chelsio.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/chelsio/chcr_algo.c |  9 +++++----
 drivers/crypto/chelsio/chcr_core.c | 11 +++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index df526414f03f..eedc33128da4 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -218,7 +218,7 @@ static inline void chcr_dec_wrcount(struct chcr_dev *dev)
 	atomic_dec(&dev->inflight);
 }
 
-static inline void chcr_handle_aead_resp(struct aead_request *req,
+static inline int chcr_handle_aead_resp(struct aead_request *req,
 					 unsigned char *input,
 					 int err)
 {
@@ -233,6 +233,8 @@ static inline void chcr_handle_aead_resp(struct aead_request *req,
 	}
 	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
+
+	return err;
 }
 
 static void get_aes_decrypt_key(unsigned char *dec_key,
@@ -2072,14 +2074,13 @@ int chcr_handle_resp(struct crypto_async_request *req, unsigned char *input,
 
 	switch (tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
 	case CRYPTO_ALG_TYPE_AEAD:
-		chcr_handle_aead_resp(aead_request_cast(req), input, err);
+		err = chcr_handle_aead_resp(aead_request_cast(req), input, err);
 		break;
 
 	case CRYPTO_ALG_TYPE_ABLKCIPHER:
-		 err = chcr_handle_cipher_resp(ablkcipher_request_cast(req),
+		 chcr_handle_cipher_resp(ablkcipher_request_cast(req),
 					       input, err);
 		break;
-
 	case CRYPTO_ALG_TYPE_AHASH:
 		chcr_handle_ahash_resp(ahash_request_cast(req), input, err);
 		}
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index e04b3e8fa623..239b933d6df6 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -169,12 +169,8 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev,
 
 	ack_err_status =
 		ntohl(*(__be32 *)((unsigned char *)&fw6_pld->data[0] + 4));
-	if (ack_err_status) {
-		if (CHK_MAC_ERR_BIT(ack_err_status) ||
-		    CHK_PAD_ERR_BIT(ack_err_status))
-			error_status = -EBADMSG;
-		atomic_inc(&adap->chcr_stats.error);
-	}
+	if (CHK_MAC_ERR_BIT(ack_err_status) || CHK_PAD_ERR_BIT(ack_err_status))
+		error_status = -EBADMSG;
 	/* call completion callback with failure status */
 	if (req) {
 		error_status = chcr_handle_resp(req, input, error_status);
@@ -182,6 +178,9 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev,
 		pr_err("Incorrect request address from the firmware\n");
 		return -EFAULT;
 	}
+	if (error_status)
+		atomic_inc(&adap->chcr_stats.error);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0c99c2a087c60b71e1fc90d070e2e16ca52defbe Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 13 Dec 2018 08:36:37 +0000
Subject: crypto: user - remove unused dump functions

This patch removes unused dump functions for crypto_user_stats.
There are remains of the copy/paste of crypto_user_base to
crypto_user_stat and I forgot to remove them.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/crypto_user_base.c            |  4 +---
 crypto/crypto_user_stat.c            | 33 ---------------------------------
 include/crypto/internal/cryptouser.h | 12 ------------
 3 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c
index 5311fd7fae34..f25d3f32c9c2 100644
--- a/crypto/crypto_user_base.c
+++ b/crypto/crypto_user_base.c
@@ -423,9 +423,7 @@ static const struct crypto_link {
 						       .dump = crypto_dump_report,
 						       .done = crypto_dump_report_done},
 	[CRYPTO_MSG_DELRNG	- CRYPTO_MSG_BASE] = { .doit = crypto_del_rng },
-	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = { .doit = crypto_reportstat,
-						       .dump = crypto_dump_reportstat,
-						       .done = crypto_dump_reportstat_done},
+	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = { .doit = crypto_reportstat},
 };
 
 static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 0ba00aaeb810..3e9a53233d80 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -336,37 +336,4 @@ drop_alg:
 	return nlmsg_unicast(crypto_nlsk, skb, NETLINK_CB(in_skb).portid);
 }
 
-int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct crypto_alg *alg;
-	struct crypto_dump_info info;
-	int err;
-
-	if (cb->args[0])
-		goto out;
-
-	cb->args[0] = 1;
-
-	info.in_skb = cb->skb;
-	info.out_skb = skb;
-	info.nlmsg_seq = cb->nlh->nlmsg_seq;
-	info.nlmsg_flags = NLM_F_MULTI;
-
-	list_for_each_entry(alg, &crypto_alg_list, cra_list) {
-		err = crypto_reportstat_alg(alg, &info);
-		if (err)
-			goto out_err;
-	}
-
-out:
-	return skb->len;
-out_err:
-	return err;
-}
-
-int crypto_dump_reportstat_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
 MODULE_LICENSE("GPL");
diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h
index 3492ab42eefb..40623f4457df 100644
--- a/include/crypto/internal/cryptouser.h
+++ b/include/crypto/internal/cryptouser.h
@@ -4,22 +4,10 @@
 struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact);
 
 #ifdef CONFIG_CRYPTO_STATS
-int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb);
 int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs);
-int crypto_dump_reportstat_done(struct netlink_callback *cb);
 #else
-static int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	return -ENOTSUPP;
-}
-
 static int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs)
 {
 	return -ENOTSUPP;
 }
-
-static int crypto_dump_reportstat_done(struct netlink_callback *cb)
-{
-	return -ENOTSUPP;
-}
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From bfad6cb3f8295559216690e1eb9c99003a79b3a0 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 13 Dec 2018 08:36:38 +0000
Subject: crypto: api - document missing stats member

This patchs adds missing member of stats documentation.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 9850b41e38ae..81e178fb9ed8 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -564,6 +564,13 @@ struct crypto_istat_rng {
  * @cra_destroy: internally used
  *
  * @stats: union of all possible crypto_istat_xxx structures
+ * @stats.aead:		statistics for AEAD algorithm
+ * @stats.akcipher:	statistics for akcipher algorithm
+ * @stats.cipher:	statistics for cipher algorithm
+ * @stats.compress:	statistics for compress algorithm
+ * @stats.hash:		statistics for hash algorithm
+ * @stats.rng:		statistics for rng algorithm
+ * @stats.kpp:		statistics for KPP algorithm
  *
  * The struct crypto_alg describes a generic Crypto API algorithm and is common
  * for all of the transformations. Any variable not documented here shall not
-- 
cgit v1.2.3-59-g8ed1b


From 2326828ee40357b3d2b1359b8ca7526af201495b Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Thu, 13 Dec 2018 07:52:32 -0200
Subject: crypto: mxc-scc - fix build warnings on ARM64

The following build warnings are seen when building for ARM64 allmodconfig:

drivers/crypto/mxc-scc.c:181:20: warning: format '%d' expects argument of type 'int', but argument 5 has type 'size_t' {aka 'long unsigned int'} [-Wformat=]
drivers/crypto/mxc-scc.c:186:21: warning: format '%d' expects argument of type 'int', but argument 4 has type 'size_t' {aka 'long unsigned int'} [-Wformat=]
drivers/crypto/mxc-scc.c:277:21: warning: format '%d' expects argument of type 'int', but argument 4 has type 'size_t' {aka 'long unsigned int'} [-Wformat=]
drivers/crypto/mxc-scc.c:339:3: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
drivers/crypto/mxc-scc.c:340:3: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]

Fix them by using the %zu specifier to print a size_t variable and using
a plain %x to print the result of a readl().

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/mxc-scc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/mxc-scc.c b/drivers/crypto/mxc-scc.c
index e01c46387df8..519086730791 100644
--- a/drivers/crypto/mxc-scc.c
+++ b/drivers/crypto/mxc-scc.c
@@ -178,12 +178,12 @@ static int mxc_scc_get_data(struct mxc_scc_ctx *ctx,
 	else
 		from = scc->black_memory;
 
-	dev_dbg(scc->dev, "pcopy: from 0x%p %d bytes\n", from,
+	dev_dbg(scc->dev, "pcopy: from 0x%p %zu bytes\n", from,
 		ctx->dst_nents * 8);
 	len = sg_pcopy_from_buffer(ablkreq->dst, ctx->dst_nents,
 				   from, ctx->size, ctx->offset);
 	if (!len) {
-		dev_err(scc->dev, "pcopy err from 0x%p (len=%d)\n", from, len);
+		dev_err(scc->dev, "pcopy err from 0x%p (len=%zu)\n", from, len);
 		return -EINVAL;
 	}
 
@@ -274,7 +274,7 @@ static int mxc_scc_put_data(struct mxc_scc_ctx *ctx,
 	len = sg_pcopy_to_buffer(req->src, ctx->src_nents,
 				 to, len, ctx->offset);
 	if (!len) {
-		dev_err(scc->dev, "pcopy err to 0x%p (len=%d)\n", to, len);
+		dev_err(scc->dev, "pcopy err to 0x%p (len=%zu)\n", to, len);
 		return -EINVAL;
 	}
 
@@ -335,9 +335,9 @@ static void mxc_scc_ablkcipher_next(struct mxc_scc_ctx *ctx,
 		return;
 	}
 
-	dev_dbg(scc->dev, "Start encryption (0x%p/0x%p)\n",
-		(void *)readl(scc->base + SCC_SCM_RED_START),
-		(void *)readl(scc->base + SCC_SCM_BLACK_START));
+	dev_dbg(scc->dev, "Start encryption (0x%x/0x%x)\n",
+		readl(scc->base + SCC_SCM_RED_START),
+		readl(scc->base + SCC_SCM_BLACK_START));
 
 	/* clear interrupt control registers */
 	writel(SCC_SCM_INTR_CTRL_CLR_INTR,
-- 
cgit v1.2.3-59-g8ed1b


From c9613335bf4fe259a654aa0e9701f0c4cddc12ba Mon Sep 17 00:00:00 2001
From: Nagadheeraj Rottela <rnagadheeraj@marvell.com>
Date: Fri, 14 Dec 2018 11:19:51 +0000
Subject: crypto: cavium/nitrox - Added AEAD cipher support

Added support to offload AEAD ciphers to NITROX. Currently supported
AEAD cipher is 'gcm(aes)'.

Signed-off-by: Nagadheeraj Rottela <rnagadheeraj@marvell.com>
Reviewed-by: Srikanth Jampala <jsrikanth@marvell.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/Makefile          |   4 +-
 drivers/crypto/cavium/nitrox/nitrox_aead.c     | 364 ++++++++++++++++
 drivers/crypto/cavium/nitrox/nitrox_algs.c     | 559 +------------------------
 drivers/crypto/cavium/nitrox/nitrox_common.h   |   6 +-
 drivers/crypto/cavium/nitrox/nitrox_req.h      | 239 +++++++++--
 drivers/crypto/cavium/nitrox/nitrox_reqmgr.c   |  38 +-
 drivers/crypto/cavium/nitrox/nitrox_skcipher.c | 498 ++++++++++++++++++++++
 7 files changed, 1103 insertions(+), 605 deletions(-)
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_aead.c
 create mode 100644 drivers/crypto/cavium/nitrox/nitrox_skcipher.c

diff --git a/drivers/crypto/cavium/nitrox/Makefile b/drivers/crypto/cavium/nitrox/Makefile
index ad0546630ad8..f83991aaf820 100644
--- a/drivers/crypto/cavium/nitrox/Makefile
+++ b/drivers/crypto/cavium/nitrox/Makefile
@@ -7,7 +7,9 @@ n5pf-objs := nitrox_main.o \
 	nitrox_hal.o \
 	nitrox_reqmgr.o \
 	nitrox_algs.o	\
-	nitrox_mbx.o
+	nitrox_mbx.o	\
+	nitrox_skcipher.o \
+	nitrox_aead.o
 
 n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o
 n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o
diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c
new file mode 100644
index 000000000000..4f43eacd2557
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/crypto.h>
+#include <linux/rtnetlink.h>
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <crypto/des.h>
+#include <crypto/sha.h>
+#include <crypto/internal/aead.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/gcm.h>
+
+#include "nitrox_dev.h"
+#include "nitrox_common.h"
+#include "nitrox_req.h"
+
+#define GCM_AES_SALT_SIZE	4
+
+/**
+ * struct nitrox_crypt_params - Params to set nitrox crypto request.
+ * @cryptlen: Encryption/Decryption data length
+ * @authlen: Assoc data length + Cryptlen
+ * @srclen: Input buffer length
+ * @dstlen: Output buffer length
+ * @iv: IV data
+ * @ivsize: IV data length
+ * @ctrl_arg: Identifies the request type (ENCRYPT/DECRYPT)
+ */
+struct nitrox_crypt_params {
+	unsigned int cryptlen;
+	unsigned int authlen;
+	unsigned int srclen;
+	unsigned int dstlen;
+	u8 *iv;
+	int ivsize;
+	u8 ctrl_arg;
+};
+
+union gph_p3 {
+	struct {
+#ifdef __BIG_ENDIAN_BITFIELD
+		u16 iv_offset : 8;
+		u16 auth_offset	: 8;
+#else
+		u16 auth_offset	: 8;
+		u16 iv_offset : 8;
+#endif
+	};
+	u16 param;
+};
+
+static int nitrox_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
+				 unsigned int keylen)
+{
+	int aes_keylen;
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct flexi_crypto_context *fctx;
+	union fc_ctx_flags flags;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	/* fill crypto context */
+	fctx = nctx->u.fctx;
+	flags.f = be64_to_cpu(fctx->flags.f);
+	flags.w0.aes_keylen = aes_keylen;
+	fctx->flags.f = cpu_to_be64(flags.f);
+
+	/* copy enc key to context */
+	memset(&fctx->crypto, 0, sizeof(fctx->crypto));
+	memcpy(fctx->crypto.u.key, key, keylen);
+
+	return 0;
+}
+
+static int nitrox_aead_setauthsize(struct crypto_aead *aead,
+				   unsigned int authsize)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct flexi_crypto_context *fctx = nctx->u.fctx;
+	union fc_ctx_flags flags;
+
+	flags.f = be64_to_cpu(fctx->flags.f);
+	flags.w0.mac_len = authsize;
+	fctx->flags.f = cpu_to_be64(flags.f);
+
+	aead->authsize = authsize;
+
+	return 0;
+}
+
+static int alloc_src_sglist(struct aead_request *areq, char *iv, int ivsize,
+			    int buflen)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	int nents = sg_nents_for_len(areq->src, buflen) + 1;
+	int ret;
+
+	if (nents < 0)
+		return nents;
+
+	/* Allocate buffer to hold IV and input scatterlist array */
+	ret = alloc_src_req_buf(nkreq, nents, ivsize);
+	if (ret)
+		return ret;
+
+	nitrox_creq_copy_iv(nkreq->src, iv, ivsize);
+	nitrox_creq_set_src_sg(nkreq, nents, ivsize, areq->src, buflen);
+
+	return 0;
+}
+
+static int alloc_dst_sglist(struct aead_request *areq, int ivsize, int buflen)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	int nents = sg_nents_for_len(areq->dst, buflen) + 3;
+	int ret;
+
+	if (nents < 0)
+		return nents;
+
+	/* Allocate buffer to hold ORH, COMPLETION and output scatterlist
+	 * array
+	 */
+	ret = alloc_dst_req_buf(nkreq, nents);
+	if (ret)
+		return ret;
+
+	nitrox_creq_set_orh(nkreq);
+	nitrox_creq_set_comp(nkreq);
+	nitrox_creq_set_dst_sg(nkreq, nents, ivsize, areq->dst, buflen);
+
+	return 0;
+}
+
+static void free_src_sglist(struct aead_request *areq)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+
+	kfree(nkreq->src);
+}
+
+static void free_dst_sglist(struct aead_request *areq)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+
+	kfree(nkreq->dst);
+}
+
+static int nitrox_set_creq(struct aead_request *areq,
+			   struct nitrox_crypt_params *params)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+	union gph_p3 param3;
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	int ret;
+
+	creq->flags = areq->base.flags;
+	creq->gfp = (areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+		GFP_KERNEL : GFP_ATOMIC;
+
+	creq->ctrl.value = 0;
+	creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC;
+	creq->ctrl.s.arg = params->ctrl_arg;
+
+	creq->gph.param0 = cpu_to_be16(params->cryptlen);
+	creq->gph.param1 = cpu_to_be16(params->authlen);
+	creq->gph.param2 = cpu_to_be16(params->ivsize + areq->assoclen);
+	param3.iv_offset = 0;
+	param3.auth_offset = params->ivsize;
+	creq->gph.param3 = cpu_to_be16(param3.param);
+
+	creq->ctx_handle = nctx->u.ctx_handle;
+	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
+
+	ret = alloc_src_sglist(areq, params->iv, params->ivsize,
+			       params->srclen);
+	if (ret)
+		return ret;
+
+	ret = alloc_dst_sglist(areq, params->ivsize, params->dstlen);
+	if (ret) {
+		free_src_sglist(areq);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void nitrox_aead_callback(void *arg, int err)
+{
+	struct aead_request *areq = arg;
+
+	free_src_sglist(areq);
+	free_dst_sglist(areq);
+	if (err) {
+		pr_err_ratelimited("request failed status 0x%0x\n", err);
+		err = -EINVAL;
+	}
+
+	areq->base.complete(&areq->base, err);
+}
+
+static int nitrox_aes_gcm_enc(struct aead_request *areq)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct flexi_crypto_context *fctx = nctx->u.fctx;
+	struct nitrox_crypt_params params;
+	int ret;
+
+	memcpy(fctx->crypto.iv, areq->iv, GCM_AES_SALT_SIZE);
+
+	memset(&params, 0, sizeof(params));
+	params.cryptlen = areq->cryptlen;
+	params.authlen = areq->assoclen + params.cryptlen;
+	params.srclen = params.authlen;
+	params.dstlen = params.srclen + aead->authsize;
+	params.iv = &areq->iv[GCM_AES_SALT_SIZE];
+	params.ivsize = GCM_AES_IV_SIZE - GCM_AES_SALT_SIZE;
+	params.ctrl_arg = ENCRYPT;
+	ret = nitrox_set_creq(areq, &params);
+	if (ret)
+		return ret;
+
+	/* send the crypto request */
+	return nitrox_process_se_request(nctx->ndev, creq, nitrox_aead_callback,
+					 areq);
+}
+
+static int nitrox_aes_gcm_dec(struct aead_request *areq)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct flexi_crypto_context *fctx = nctx->u.fctx;
+	struct nitrox_crypt_params params;
+	int ret;
+
+	memcpy(fctx->crypto.iv, areq->iv, GCM_AES_SALT_SIZE);
+
+	memset(&params, 0, sizeof(params));
+	params.cryptlen = areq->cryptlen - aead->authsize;
+	params.authlen = areq->assoclen + params.cryptlen;
+	params.srclen = areq->cryptlen + areq->assoclen;
+	params.dstlen = params.srclen - aead->authsize;
+	params.iv = &areq->iv[GCM_AES_SALT_SIZE];
+	params.ivsize = GCM_AES_IV_SIZE - GCM_AES_SALT_SIZE;
+	params.ctrl_arg = DECRYPT;
+	ret = nitrox_set_creq(areq, &params);
+	if (ret)
+		return ret;
+
+	/* send the crypto request */
+	return nitrox_process_se_request(nctx->ndev, creq, nitrox_aead_callback,
+					 areq);
+}
+
+static int nitrox_aead_init(struct crypto_aead *aead)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct crypto_ctx_hdr *chdr;
+
+	/* get the first device */
+	nctx->ndev = nitrox_get_first_device();
+	if (!nctx->ndev)
+		return -ENODEV;
+
+	/* allocate nitrox crypto context */
+	chdr = crypto_alloc_context(nctx->ndev);
+	if (!chdr) {
+		nitrox_put_device(nctx->ndev);
+		return -ENOMEM;
+	}
+	nctx->chdr = chdr;
+	nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr +
+					 sizeof(struct ctx_hdr));
+	nctx->u.fctx->flags.f = 0;
+
+	return 0;
+}
+
+static int nitrox_aes_gcm_init(struct crypto_aead *aead)
+{
+	int ret;
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	union fc_ctx_flags *flags;
+
+	ret = nitrox_aead_init(aead);
+	if (ret)
+		return ret;
+
+	flags = &nctx->u.fctx->flags;
+	flags->w0.cipher_type = CIPHER_AES_GCM;
+	flags->w0.hash_type = AUTH_NULL;
+	flags->w0.iv_source = IV_FROM_DPTR;
+	/* ask microcode to calculate ipad/opad */
+	flags->w0.auth_input_type = 1;
+	flags->f = be64_to_cpu(flags->f);
+
+	crypto_aead_set_reqsize(aead, sizeof(struct aead_request) +
+				sizeof(struct nitrox_kcrypt_request));
+
+	return 0;
+}
+
+static void nitrox_aead_exit(struct crypto_aead *aead)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+
+	/* free the nitrox crypto context */
+	if (nctx->u.ctx_handle) {
+		struct flexi_crypto_context *fctx = nctx->u.fctx;
+
+		memzero_explicit(&fctx->crypto, sizeof(struct crypto_keys));
+		memzero_explicit(&fctx->auth, sizeof(struct auth_keys));
+		crypto_free_context((void *)nctx->chdr);
+	}
+	nitrox_put_device(nctx->ndev);
+
+	nctx->u.ctx_handle = 0;
+	nctx->ndev = NULL;
+}
+
+static struct aead_alg nitrox_aeads[] = { {
+	.base = {
+		.cra_name = "gcm(aes)",
+		.cra_driver_name = "n5_aes_gcm",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.setkey = nitrox_aes_gcm_setkey,
+	.setauthsize = nitrox_aead_setauthsize,
+	.encrypt = nitrox_aes_gcm_enc,
+	.decrypt = nitrox_aes_gcm_dec,
+	.init = nitrox_aes_gcm_init,
+	.exit = nitrox_aead_exit,
+	.ivsize = GCM_AES_IV_SIZE,
+	.maxauthsize = AES_BLOCK_SIZE,
+} };
+
+int nitrox_register_aeads(void)
+{
+	return crypto_register_aeads(nitrox_aeads, ARRAY_SIZE(nitrox_aeads));
+}
+
+void nitrox_unregister_aeads(void)
+{
+	crypto_unregister_aeads(nitrox_aeads, ARRAY_SIZE(nitrox_aeads));
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c
index 10075a97ff0d..d646ae5f29b0 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_algs.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c
@@ -1,561 +1,24 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-
-#include <crypto/aes.h>
-#include <crypto/skcipher.h>
-#include <crypto/ctr.h>
-#include <crypto/des.h>
-#include <crypto/xts.h>
-
-#include "nitrox_dev.h"
 #include "nitrox_common.h"
-#include "nitrox_req.h"
-
-#define PRIO 4001
-
-struct nitrox_cipher {
-	const char *name;
-	enum flexi_cipher value;
-};
-
-/**
- * supported cipher list
- */
-static const struct nitrox_cipher flexi_cipher_table[] = {
-	{ "null",		CIPHER_NULL },
-	{ "cbc(des3_ede)",	CIPHER_3DES_CBC },
-	{ "ecb(des3_ede)",	CIPHER_3DES_ECB },
-	{ "cbc(aes)",		CIPHER_AES_CBC },
-	{ "ecb(aes)",		CIPHER_AES_ECB },
-	{ "cfb(aes)",		CIPHER_AES_CFB },
-	{ "rfc3686(ctr(aes))",	CIPHER_AES_CTR },
-	{ "xts(aes)",		CIPHER_AES_XTS },
-	{ "cts(cbc(aes))",	CIPHER_AES_CBC_CTS },
-	{ NULL,			CIPHER_INVALID }
-};
-
-static enum flexi_cipher flexi_cipher_type(const char *name)
-{
-	const struct nitrox_cipher *cipher = flexi_cipher_table;
-
-	while (cipher->name) {
-		if (!strcmp(cipher->name, name))
-			break;
-		cipher++;
-	}
-	return cipher->value;
-}
-
-static int flexi_aes_keylen(int keylen)
-{
-	int aes_keylen;
-
-	switch (keylen) {
-	case AES_KEYSIZE_128:
-		aes_keylen = 1;
-		break;
-	case AES_KEYSIZE_192:
-		aes_keylen = 2;
-		break;
-	case AES_KEYSIZE_256:
-		aes_keylen = 3;
-		break;
-	default:
-		aes_keylen = -EINVAL;
-		break;
-	}
-	return aes_keylen;
-}
-
-static int nitrox_skcipher_init(struct crypto_skcipher *tfm)
-{
-	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
-	struct crypto_ctx_hdr *chdr;
-
-	/* get the first device */
-	nctx->ndev = nitrox_get_first_device();
-	if (!nctx->ndev)
-		return -ENODEV;
-
-	/* allocate nitrox crypto context */
-	chdr = crypto_alloc_context(nctx->ndev);
-	if (!chdr) {
-		nitrox_put_device(nctx->ndev);
-		return -ENOMEM;
-	}
-	nctx->chdr = chdr;
-	nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr +
-					 sizeof(struct ctx_hdr));
-	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) +
-				    sizeof(struct nitrox_kcrypt_request));
-	return 0;
-}
-
-static void nitrox_skcipher_exit(struct crypto_skcipher *tfm)
-{
-	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
-
-	/* free the nitrox crypto context */
-	if (nctx->u.ctx_handle) {
-		struct flexi_crypto_context *fctx = nctx->u.fctx;
-
-		memset(&fctx->crypto, 0, sizeof(struct crypto_keys));
-		memset(&fctx->auth, 0, sizeof(struct auth_keys));
-		crypto_free_context((void *)nctx->chdr);
-	}
-	nitrox_put_device(nctx->ndev);
-
-	nctx->u.ctx_handle = 0;
-	nctx->ndev = NULL;
-}
-
-static inline int nitrox_skcipher_setkey(struct crypto_skcipher *cipher,
-					 int aes_keylen, const u8 *key,
-					 unsigned int keylen)
-{
-	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
-	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
-	struct flexi_crypto_context *fctx;
-	enum flexi_cipher cipher_type;
-	const char *name;
-
-	name = crypto_tfm_alg_name(tfm);
-	cipher_type = flexi_cipher_type(name);
-	if (unlikely(cipher_type == CIPHER_INVALID)) {
-		pr_err("unsupported cipher: %s\n", name);
-		return -EINVAL;
-	}
-
-	/* fill crypto context */
-	fctx = nctx->u.fctx;
-	fctx->flags = 0;
-	fctx->w0.cipher_type = cipher_type;
-	fctx->w0.aes_keylen = aes_keylen;
-	fctx->w0.iv_source = IV_FROM_DPTR;
-	fctx->flags = cpu_to_be64(*(u64 *)&fctx->w0);
-	/* copy the key to context */
-	memcpy(fctx->crypto.u.key, key, keylen);
-
-	return 0;
-}
-
-static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key,
-			     unsigned int keylen)
-{
-	int aes_keylen;
-
-	aes_keylen = flexi_aes_keylen(keylen);
-	if (aes_keylen < 0) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
-}
-
-static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize)
-{
-	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
-	int nents = sg_nents(skreq->src) + 1;
-	struct se_crypto_request *creq = &nkreq->creq;
-	char *iv;
-	struct scatterlist *sg;
-
-	/* Allocate buffer to hold IV and input scatterlist array */
-	nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp);
-	if (!nkreq->src)
-		return -ENOMEM;
-
-	/* copy iv */
-	iv = nkreq->src;
-	memcpy(iv, skreq->iv, ivsize);
-
-	sg = (struct scatterlist *)(iv + ivsize);
-	creq->src = sg;
-	sg_init_table(sg, nents);
-
-	/* Input format:
-	 * +----+----------------+
-	 * | IV | SRC sg entries |
-	 * +----+----------------+
-	 */
-
-	/* IV */
-	sg = create_single_sg(sg, iv, ivsize);
-	/* SRC entries */
-	create_multi_sg(sg, skreq->src);
-
-	return 0;
-}
-
-static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize)
-{
-	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
-	int nents = sg_nents(skreq->dst) + 3;
-	int extralen = ORH_HLEN + COMP_HLEN;
-	struct se_crypto_request *creq = &nkreq->creq;
-	struct scatterlist *sg;
-	char *iv = nkreq->src;
-
-	/* Allocate buffer to hold ORH, COMPLETION and output scatterlist
-	 * array
-	 */
-	nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp);
-	if (!nkreq->dst)
-		return -ENOMEM;
-
-	creq->orh = (u64 *)(nkreq->dst);
-	set_orh_value(creq->orh);
-
-	creq->comp = (u64 *)(nkreq->dst + ORH_HLEN);
-	set_comp_value(creq->comp);
-
-	sg = (struct scatterlist *)(nkreq->dst + ORH_HLEN + COMP_HLEN);
-	creq->dst = sg;
-	sg_init_table(sg, nents);
-
-	/* Output format:
-	 * +-----+----+----------------+-----------------+
-	 * | ORH | IV | DST sg entries | COMPLETION Bytes|
-	 * +-----+----+----------------+-----------------+
-	 */
-
-	/* ORH */
-	sg = create_single_sg(sg, creq->orh, ORH_HLEN);
-	/* IV */
-	sg = create_single_sg(sg, iv, ivsize);
-	/* DST entries */
-	sg = create_multi_sg(sg, skreq->dst);
-	/* COMPLETION Bytes */
-	create_single_sg(sg, creq->comp, COMP_HLEN);
-
-	return 0;
-}
-
-static void free_src_sglist(struct skcipher_request *skreq)
-{
-	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
-
-	kfree(nkreq->src);
-}
 
-static void free_dst_sglist(struct skcipher_request *skreq)
+int nitrox_crypto_register(void)
 {
-	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int err;
 
-	kfree(nkreq->dst);
-}
+	err = nitrox_register_skciphers();
+	if (err)
+		return err;
 
-static void nitrox_skcipher_callback(struct skcipher_request *skreq,
-				     int err)
-{
-	free_src_sglist(skreq);
-	free_dst_sglist(skreq);
+	err = nitrox_register_aeads();
 	if (err) {
-		pr_err_ratelimited("request failed status 0x%0x\n", err);
-		err = -EINVAL;
-	}
-
-	skcipher_request_complete(skreq, err);
-}
-
-static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc)
-{
-	struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(skreq);
-	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher);
-	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
-	int ivsize = crypto_skcipher_ivsize(cipher);
-	struct se_crypto_request *creq;
-	int ret;
-
-	creq = &nkreq->creq;
-	creq->flags = skreq->base.flags;
-	creq->gfp = (skreq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
-		     GFP_KERNEL : GFP_ATOMIC;
-
-	/* fill the request */
-	creq->ctrl.value = 0;
-	creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC;
-	creq->ctrl.s.arg = (enc ? ENCRYPT : DECRYPT);
-	/* param0: length of the data to be encrypted */
-	creq->gph.param0 = cpu_to_be16(skreq->cryptlen);
-	creq->gph.param1 = 0;
-	/* param2: encryption data offset */
-	creq->gph.param2 = cpu_to_be16(ivsize);
-	creq->gph.param3 = 0;
-
-	creq->ctx_handle = nctx->u.ctx_handle;
-	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
-
-	ret = alloc_src_sglist(skreq, ivsize);
-	if (ret)
-		return ret;
-
-	ret = alloc_dst_sglist(skreq, ivsize);
-	if (ret) {
-		free_src_sglist(skreq);
-		return ret;
-	}
-
-	nkreq->nctx = nctx;
-	nkreq->skreq = skreq;
-
-	/* send the crypto request */
-	return nitrox_process_se_request(nctx->ndev, creq,
-					 nitrox_skcipher_callback, skreq);
-}
-
-static int nitrox_aes_encrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, true);
-}
-
-static int nitrox_aes_decrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, false);
-}
-
-static int nitrox_3des_setkey(struct crypto_skcipher *cipher,
-			      const u8 *key, unsigned int keylen)
-{
-	if (keylen != DES3_EDE_KEY_SIZE) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-
-	return nitrox_skcipher_setkey(cipher, 0, key, keylen);
-}
-
-static int nitrox_3des_encrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, true);
-}
-
-static int nitrox_3des_decrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, false);
-}
-
-static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher,
-				 const u8 *key, unsigned int keylen)
-{
-	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
-	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
-	struct flexi_crypto_context *fctx;
-	int aes_keylen, ret;
-
-	ret = xts_check_key(tfm, key, keylen);
-	if (ret)
-		return ret;
-
-	keylen /= 2;
-
-	aes_keylen = flexi_aes_keylen(keylen);
-	if (aes_keylen < 0) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
+		nitrox_unregister_skciphers();
+		return err;
 	}
 
-	fctx = nctx->u.fctx;
-	/* copy KEY2 */
-	memcpy(fctx->auth.u.key2, (key + keylen), keylen);
-
-	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
-}
-
-static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher,
-					 const u8 *key, unsigned int keylen)
-{
-	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
-	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
-	struct flexi_crypto_context *fctx;
-	int aes_keylen;
-
-	if (keylen < CTR_RFC3686_NONCE_SIZE)
-		return -EINVAL;
-
-	fctx = nctx->u.fctx;
-
-	memcpy(fctx->crypto.iv, key + (keylen - CTR_RFC3686_NONCE_SIZE),
-	       CTR_RFC3686_NONCE_SIZE);
-
-	keylen -= CTR_RFC3686_NONCE_SIZE;
-
-	aes_keylen = flexi_aes_keylen(keylen);
-	if (aes_keylen < 0) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
-}
-
-static struct skcipher_alg nitrox_skciphers[] = { {
-	.base = {
-		.cra_name = "cbc(aes)",
-		.cra_driver_name = "n5_cbc(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "ecb(aes)",
-		.cra_driver_name = "n5_ecb(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "cfb(aes)",
-		.cra_driver_name = "n5_cfb(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "xts(aes)",
-		.cra_driver_name = "n5_xts(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = 2 * AES_MIN_KEY_SIZE,
-	.max_keysize = 2 * AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_xts_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "rfc3686(ctr(aes))",
-		.cra_driver_name = "n5_rfc3686(ctr(aes))",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = 1,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
-	.ivsize = CTR_RFC3686_IV_SIZE,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-	.setkey = nitrox_aes_ctr_rfc3686_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-}, {
-	.base = {
-		.cra_name = "cts(cbc(aes))",
-		.cra_driver_name = "n5_cts(cbc(aes))",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_type = &crypto_ablkcipher_type,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "cbc(des3_ede)",
-		.cra_driver_name = "n5_cbc(des3_ede)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = DES3_EDE_KEY_SIZE,
-	.max_keysize = DES3_EDE_KEY_SIZE,
-	.ivsize = DES3_EDE_BLOCK_SIZE,
-	.setkey = nitrox_3des_setkey,
-	.encrypt = nitrox_3des_encrypt,
-	.decrypt = nitrox_3des_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "ecb(des3_ede)",
-		.cra_driver_name = "n5_ecb(des3_ede)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = DES3_EDE_KEY_SIZE,
-	.max_keysize = DES3_EDE_KEY_SIZE,
-	.ivsize = DES3_EDE_BLOCK_SIZE,
-	.setkey = nitrox_3des_setkey,
-	.encrypt = nitrox_3des_encrypt,
-	.decrypt = nitrox_3des_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}
-
-};
-
-int nitrox_crypto_register(void)
-{
-	return crypto_register_skciphers(nitrox_skciphers,
-					 ARRAY_SIZE(nitrox_skciphers));
+	return 0;
 }
 
 void nitrox_crypto_unregister(void)
 {
-	crypto_unregister_skciphers(nitrox_skciphers,
-				    ARRAY_SIZE(nitrox_skciphers));
+	nitrox_unregister_aeads();
+	nitrox_unregister_skciphers();
 }
diff --git a/drivers/crypto/cavium/nitrox/nitrox_common.h b/drivers/crypto/cavium/nitrox/nitrox_common.h
index 863143a8336b..e4be69d7e6e5 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_common.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_common.h
@@ -7,6 +7,10 @@
 
 int nitrox_crypto_register(void);
 void nitrox_crypto_unregister(void);
+int nitrox_register_aeads(void);
+void nitrox_unregister_aeads(void);
+int nitrox_register_skciphers(void);
+void nitrox_unregister_skciphers(void);
 void *crypto_alloc_context(struct nitrox_device *ndev);
 void crypto_free_context(void *ctx);
 struct nitrox_device *nitrox_get_first_device(void);
@@ -19,7 +23,7 @@ void pkt_slc_resp_tasklet(unsigned long data);
 int nitrox_process_se_request(struct nitrox_device *ndev,
 			      struct se_crypto_request *req,
 			      completion_t cb,
-			      struct skcipher_request *skreq);
+			      void *cb_arg);
 void backlog_qflush_work(struct work_struct *work);
 
 
diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h
index d45ff91c19a9..76c0f0be7233 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_req.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_req.h
@@ -8,6 +8,7 @@
 #include "nitrox_dev.h"
 
 #define PENDING_SIG	0xFFFFFFFFFFFFFFFFUL
+#define PRIO 4001
 
 /**
  * struct gphdr - General purpose Header
@@ -106,6 +107,18 @@ enum flexi_cipher {
 	CIPHER_INVALID
 };
 
+enum flexi_auth {
+	AUTH_NULL = 0,
+	AUTH_MD5,
+	AUTH_SHA1,
+	AUTH_SHA2_SHA224,
+	AUTH_SHA2_SHA256,
+	AUTH_SHA2_SHA384,
+	AUTH_SHA2_SHA512,
+	AUTH_GMAC,
+	AUTH_INVALID
+};
+
 /**
  * struct crypto_keys - Crypto keys
  * @key: Encryption key or KEY1 for AES-XTS
@@ -132,6 +145,32 @@ struct auth_keys {
 	u8 opad[64];
 };
 
+union fc_ctx_flags {
+	__be64 f;
+	struct {
+#if defined(__BIG_ENDIAN_BITFIELD)
+		u64 cipher_type	: 4;
+		u64 reserved_59	: 1;
+		u64 aes_keylen : 2;
+		u64 iv_source : 1;
+		u64 hash_type : 4;
+		u64 reserved_49_51 : 3;
+		u64 auth_input_type: 1;
+		u64 mac_len : 8;
+		u64 reserved_0_39 : 40;
+#else
+		u64 reserved_0_39 : 40;
+		u64 mac_len : 8;
+		u64 auth_input_type: 1;
+		u64 reserved_49_51 : 3;
+		u64 hash_type : 4;
+		u64 iv_source : 1;
+		u64 aes_keylen : 2;
+		u64 reserved_59	: 1;
+		u64 cipher_type	: 4;
+#endif
+	} w0;
+};
 /**
  * struct flexi_crypto_context - Crypto context
  * @cipher_type: Encryption cipher type
@@ -146,33 +185,7 @@ struct auth_keys {
  * @auth: Authentication keys
  */
 struct flexi_crypto_context {
-	union {
-		__be64 flags;
-		struct {
-#if defined(__BIG_ENDIAN_BITFIELD)
-			u64 cipher_type	: 4;
-			u64 reserved_59	: 1;
-			u64 aes_keylen : 2;
-			u64 iv_source : 1;
-			u64 hash_type : 4;
-			u64 reserved_49_51 : 3;
-			u64 auth_input_type: 1;
-			u64 mac_len : 8;
-			u64 reserved_0_39 : 40;
-#else
-			u64 reserved_0_39 : 40;
-			u64 mac_len : 8;
-			u64 auth_input_type: 1;
-			u64 reserved_49_51 : 3;
-			u64 hash_type : 4;
-			u64 iv_source : 1;
-			u64 aes_keylen : 2;
-			u64 reserved_59	: 1;
-			u64 cipher_type	: 4;
-#endif
-		} w0;
-	};
-
+	union fc_ctx_flags flags;
 	struct crypto_keys crypto;
 	struct auth_keys auth;
 };
@@ -194,8 +207,6 @@ struct nitrox_crypto_ctx {
 
 struct nitrox_kcrypt_request {
 	struct se_crypto_request creq;
-	struct nitrox_crypto_ctx *nctx;
-	struct skcipher_request *skreq;
 	u8 *src;
 	u8 *dst;
 };
@@ -400,7 +411,7 @@ struct resp_hdr {
 	u64 *completion;
 };
 
-typedef void (*completion_t)(struct skcipher_request *skreq, int err);
+typedef void (*completion_t)(void *arg, int err);
 
 /**
  * struct nitrox_softreq - Represents the NIROX Request.
@@ -435,9 +446,30 @@ struct nitrox_softreq {
 	unsigned long tstamp;
 
 	completion_t callback;
-	struct skcipher_request *skreq;
+	void *cb_arg;
 };
 
+static inline int flexi_aes_keylen(int keylen)
+{
+	int aes_keylen;
+
+	switch (keylen) {
+	case AES_KEYSIZE_128:
+		aes_keylen = 1;
+		break;
+	case AES_KEYSIZE_192:
+		aes_keylen = 2;
+		break;
+	case AES_KEYSIZE_256:
+		aes_keylen = 3;
+		break;
+	default:
+		aes_keylen = -EINVAL;
+		break;
+	}
+	return aes_keylen;
+}
+
 static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp)
 {
 	size_t size;
@@ -448,6 +480,14 @@ static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp)
 	return kzalloc(size, gfp);
 }
 
+/**
+ * create_single_sg - Point SG entry to the data
+ * @sg:		Destination SG list
+ * @buf:	Data
+ * @buflen:	Data length
+ *
+ * Returns next free entry in the destination SG list
+ **/
 static inline struct scatterlist *create_single_sg(struct scatterlist *sg,
 						   void *buf, int buflen)
 {
@@ -456,18 +496,33 @@ static inline struct scatterlist *create_single_sg(struct scatterlist *sg,
 	return sg;
 }
 
+/**
+ * create_multi_sg - Create multiple sg entries with buflen data length from
+ *		     source sglist
+ * @to_sg:	Destination SG list
+ * @from_sg:	Source SG list
+ * @buflen:	Data length
+ *
+ * Returns next free entry in the destination SG list
+ **/
 static inline struct scatterlist *create_multi_sg(struct scatterlist *to_sg,
-						  struct scatterlist *from_sg)
+						  struct scatterlist *from_sg,
+						  int buflen)
 {
-	struct scatterlist *sg;
-	int i;
+	struct scatterlist *sg = to_sg;
+	unsigned int sglen;
+
+	for (; buflen; buflen -= sglen) {
+		sglen = from_sg->length;
+		if (sglen > buflen)
+			sglen = buflen;
 
-	for_each_sg(from_sg, sg, sg_nents(from_sg), i) {
-		sg_set_buf(to_sg, sg_virt(sg), sg->length);
-		to_sg++;
+		sg_set_buf(sg, sg_virt(from_sg), sglen);
+		from_sg = sg_next(from_sg);
+		sg++;
 	}
 
-	return to_sg;
+	return sg;
 }
 
 static inline void set_orh_value(u64 *orh)
@@ -480,4 +535,112 @@ static inline void set_comp_value(u64 *comp)
 	WRITE_ONCE(*comp, PENDING_SIG);
 }
 
+static inline int alloc_src_req_buf(struct nitrox_kcrypt_request *nkreq,
+				    int nents, int ivsize)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp);
+	if (!nkreq->src)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void nitrox_creq_copy_iv(char *dst, char *src, int size)
+{
+	memcpy(dst, src, size);
+}
+
+static inline struct scatterlist *nitrox_creq_src_sg(char *iv, int ivsize)
+{
+	return (struct scatterlist *)(iv + ivsize);
+}
+
+static inline void nitrox_creq_set_src_sg(struct nitrox_kcrypt_request *nkreq,
+					  int nents, int ivsize,
+					  struct scatterlist *src, int buflen)
+{
+	char *iv = nkreq->src;
+	struct scatterlist *sg;
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	creq->src = nitrox_creq_src_sg(iv, ivsize);
+	sg = creq->src;
+	sg_init_table(sg, nents);
+
+	/* Input format:
+	 * +----+----------------+
+	 * | IV | SRC sg entries |
+	 * +----+----------------+
+	 */
+
+	/* IV */
+	sg = create_single_sg(sg, iv, ivsize);
+	/* SRC entries */
+	create_multi_sg(sg, src, buflen);
+}
+
+static inline int alloc_dst_req_buf(struct nitrox_kcrypt_request *nkreq,
+				    int nents)
+{
+	int extralen = ORH_HLEN + COMP_HLEN;
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp);
+	if (!nkreq->dst)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void nitrox_creq_set_orh(struct nitrox_kcrypt_request *nkreq)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	creq->orh = (u64 *)(nkreq->dst);
+	set_orh_value(creq->orh);
+}
+
+static inline void nitrox_creq_set_comp(struct nitrox_kcrypt_request *nkreq)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	creq->comp = (u64 *)(nkreq->dst + ORH_HLEN);
+	set_comp_value(creq->comp);
+}
+
+static inline struct scatterlist *nitrox_creq_dst_sg(char *dst)
+{
+	return (struct scatterlist *)(dst + ORH_HLEN + COMP_HLEN);
+}
+
+static inline void nitrox_creq_set_dst_sg(struct nitrox_kcrypt_request *nkreq,
+					  int nents, int ivsize,
+					  struct scatterlist *dst, int buflen)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct scatterlist *sg;
+	char *iv = nkreq->src;
+
+	creq->dst = nitrox_creq_dst_sg(nkreq->dst);
+	sg = creq->dst;
+	sg_init_table(sg, nents);
+
+	/* Output format:
+	 * +-----+----+----------------+-----------------+
+	 * | ORH | IV | DST sg entries | COMPLETION Bytes|
+	 * +-----+----+----------------+-----------------+
+	 */
+
+	/* ORH */
+	sg = create_single_sg(sg, creq->orh, ORH_HLEN);
+	/* IV */
+	sg = create_single_sg(sg, iv, ivsize);
+	/* DST entries */
+	sg = create_multi_sg(sg, dst, buflen);
+	/* COMPLETION Bytes */
+	create_single_sg(sg, creq->comp, COMP_HLEN);
+}
+
 #endif /* __NITROX_REQ_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
index d566bb904ec2..e34e4df8fd24 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
@@ -269,6 +269,8 @@ static inline bool cmdq_full(struct nitrox_cmdq *cmdq, int qlen)
 		smp_mb__after_atomic();
 		return true;
 	}
+	/* sync with other cpus */
+	smp_mb__after_atomic();
 	return false;
 }
 
@@ -324,8 +326,6 @@ static int post_backlog_cmds(struct nitrox_cmdq *cmdq)
 	spin_lock_bh(&cmdq->backlog_qlock);
 
 	list_for_each_entry_safe(sr, tmp, &cmdq->backlog_head, backlog) {
-		struct skcipher_request *skreq;
-
 		/* submit until space available */
 		if (unlikely(cmdq_full(cmdq, ndev->qlen))) {
 			ret = -ENOSPC;
@@ -337,12 +337,8 @@ static int post_backlog_cmds(struct nitrox_cmdq *cmdq)
 		/* sync with other cpus */
 		smp_mb__after_atomic();
 
-		skreq = sr->skreq;
 		/* post the command */
 		post_se_instr(sr, cmdq);
-
-		/* backlog requests are posted, wakeup with -EINPROGRESS */
-		skcipher_request_complete(skreq, -EINPROGRESS);
 	}
 	spin_unlock_bh(&cmdq->backlog_qlock);
 
@@ -365,7 +361,7 @@ static int nitrox_enqueue_request(struct nitrox_softreq *sr)
 		}
 		/* add to backlog list */
 		backlog_list_add(sr, cmdq);
-		return -EBUSY;
+		return -EINPROGRESS;
 	}
 	post_se_instr(sr, cmdq);
 
@@ -382,7 +378,7 @@ static int nitrox_enqueue_request(struct nitrox_softreq *sr)
 int nitrox_process_se_request(struct nitrox_device *ndev,
 			      struct se_crypto_request *req,
 			      completion_t callback,
-			      struct skcipher_request *skreq)
+			      void *cb_arg)
 {
 	struct nitrox_softreq *sr;
 	dma_addr_t ctx_handle = 0;
@@ -399,7 +395,7 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 	sr->flags = req->flags;
 	sr->gfp = req->gfp;
 	sr->callback = callback;
-	sr->skreq = skreq;
+	sr->cb_arg = cb_arg;
 
 	atomic_set(&sr->status, REQ_NOT_POSTED);
 
@@ -513,7 +509,20 @@ void backlog_qflush_work(struct work_struct *work)
 
 static bool sr_completed(struct nitrox_softreq *sr)
 {
-	return (READ_ONCE(*sr->resp.orh) != READ_ONCE(*sr->resp.completion));
+	u64 orh = READ_ONCE(*sr->resp.orh);
+	unsigned long timeout = jiffies + msecs_to_jiffies(1);
+
+	if ((orh != PENDING_SIG) && (orh & 0xff))
+		return true;
+
+	while (READ_ONCE(*sr->resp.completion) == PENDING_SIG) {
+		if (time_after(jiffies, timeout)) {
+			pr_err("comp not done\n");
+			return false;
+		}
+	}
+
+	return true;
 }
 
 /**
@@ -527,8 +536,6 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 {
 	struct nitrox_device *ndev = cmdq->ndev;
 	struct nitrox_softreq *sr;
-	struct skcipher_request *skreq;
-	completion_t callback;
 	int req_completed = 0, err = 0, budget;
 
 	/* check all pending requests */
@@ -558,15 +565,12 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 		/* remove from response list */
 		response_list_del(sr, cmdq);
 
-		callback = sr->callback;
-		skreq = sr->skreq;
-
 		/* ORH error code */
 		err = READ_ONCE(*sr->resp.orh) & 0xff;
 		softreq_destroy(sr);
 
-		if (callback)
-			callback(skreq, err);
+		if (sr->callback)
+			sr->callback(sr->cb_arg, err);
 
 		req_completed++;
 	}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c
new file mode 100644
index 000000000000..d4935d6cefdd
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+#include <crypto/aes.h>
+#include <crypto/skcipher.h>
+#include <crypto/ctr.h>
+#include <crypto/des.h>
+#include <crypto/xts.h>
+
+#include "nitrox_dev.h"
+#include "nitrox_common.h"
+#include "nitrox_req.h"
+
+struct nitrox_cipher {
+	const char *name;
+	enum flexi_cipher value;
+};
+
+/**
+ * supported cipher list
+ */
+static const struct nitrox_cipher flexi_cipher_table[] = {
+	{ "null",		CIPHER_NULL },
+	{ "cbc(des3_ede)",	CIPHER_3DES_CBC },
+	{ "ecb(des3_ede)",	CIPHER_3DES_ECB },
+	{ "cbc(aes)",		CIPHER_AES_CBC },
+	{ "ecb(aes)",		CIPHER_AES_ECB },
+	{ "cfb(aes)",		CIPHER_AES_CFB },
+	{ "rfc3686(ctr(aes))",	CIPHER_AES_CTR },
+	{ "xts(aes)",		CIPHER_AES_XTS },
+	{ "cts(cbc(aes))",	CIPHER_AES_CBC_CTS },
+	{ NULL,			CIPHER_INVALID }
+};
+
+static enum flexi_cipher flexi_cipher_type(const char *name)
+{
+	const struct nitrox_cipher *cipher = flexi_cipher_table;
+
+	while (cipher->name) {
+		if (!strcmp(cipher->name, name))
+			break;
+		cipher++;
+	}
+	return cipher->value;
+}
+
+static int nitrox_skcipher_init(struct crypto_skcipher *tfm)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
+	struct crypto_ctx_hdr *chdr;
+
+	/* get the first device */
+	nctx->ndev = nitrox_get_first_device();
+	if (!nctx->ndev)
+		return -ENODEV;
+
+	/* allocate nitrox crypto context */
+	chdr = crypto_alloc_context(nctx->ndev);
+	if (!chdr) {
+		nitrox_put_device(nctx->ndev);
+		return -ENOMEM;
+	}
+	nctx->chdr = chdr;
+	nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr +
+					 sizeof(struct ctx_hdr));
+	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) +
+				    sizeof(struct nitrox_kcrypt_request));
+	return 0;
+}
+
+static void nitrox_skcipher_exit(struct crypto_skcipher *tfm)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
+
+	/* free the nitrox crypto context */
+	if (nctx->u.ctx_handle) {
+		struct flexi_crypto_context *fctx = nctx->u.fctx;
+
+		memzero_explicit(&fctx->crypto, sizeof(struct crypto_keys));
+		memzero_explicit(&fctx->auth, sizeof(struct auth_keys));
+		crypto_free_context((void *)nctx->chdr);
+	}
+	nitrox_put_device(nctx->ndev);
+
+	nctx->u.ctx_handle = 0;
+	nctx->ndev = NULL;
+}
+
+static inline int nitrox_skcipher_setkey(struct crypto_skcipher *cipher,
+					 int aes_keylen, const u8 *key,
+					 unsigned int keylen)
+{
+	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
+	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
+	struct flexi_crypto_context *fctx;
+	union fc_ctx_flags *flags;
+	enum flexi_cipher cipher_type;
+	const char *name;
+
+	name = crypto_tfm_alg_name(tfm);
+	cipher_type = flexi_cipher_type(name);
+	if (unlikely(cipher_type == CIPHER_INVALID)) {
+		pr_err("unsupported cipher: %s\n", name);
+		return -EINVAL;
+	}
+
+	/* fill crypto context */
+	fctx = nctx->u.fctx;
+	flags = &fctx->flags;
+	flags->f = 0;
+	flags->w0.cipher_type = cipher_type;
+	flags->w0.aes_keylen = aes_keylen;
+	flags->w0.iv_source = IV_FROM_DPTR;
+	flags->f = cpu_to_be64(*(u64 *)&flags->w0);
+	/* copy the key to context */
+	memcpy(fctx->crypto.u.key, key, keylen);
+
+	return 0;
+}
+
+static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key,
+			     unsigned int keylen)
+{
+	int aes_keylen;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
+}
+
+static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int nents = sg_nents(skreq->src) + 1;
+	int ret;
+
+	/* Allocate buffer to hold IV and input scatterlist array */
+	ret = alloc_src_req_buf(nkreq, nents, ivsize);
+	if (ret)
+		return ret;
+
+	nitrox_creq_copy_iv(nkreq->src, skreq->iv, ivsize);
+	nitrox_creq_set_src_sg(nkreq, nents, ivsize, skreq->src,
+			       skreq->cryptlen);
+
+	return 0;
+}
+
+static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int nents = sg_nents(skreq->dst) + 3;
+	int ret;
+
+	/* Allocate buffer to hold ORH, COMPLETION and output scatterlist
+	 * array
+	 */
+	ret = alloc_dst_req_buf(nkreq, nents);
+	if (ret)
+		return ret;
+
+	nitrox_creq_set_orh(nkreq);
+	nitrox_creq_set_comp(nkreq);
+	nitrox_creq_set_dst_sg(nkreq, nents, ivsize, skreq->dst,
+			       skreq->cryptlen);
+
+	return 0;
+}
+
+static void free_src_sglist(struct skcipher_request *skreq)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+
+	kfree(nkreq->src);
+}
+
+static void free_dst_sglist(struct skcipher_request *skreq)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+
+	kfree(nkreq->dst);
+}
+
+static void nitrox_skcipher_callback(void *arg, int err)
+{
+	struct skcipher_request *skreq = arg;
+
+	free_src_sglist(skreq);
+	free_dst_sglist(skreq);
+	if (err) {
+		pr_err_ratelimited("request failed status 0x%0x\n", err);
+		err = -EINVAL;
+	}
+
+	skcipher_request_complete(skreq, err);
+}
+
+static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc)
+{
+	struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(skreq);
+	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher);
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int ivsize = crypto_skcipher_ivsize(cipher);
+	struct se_crypto_request *creq;
+	int ret;
+
+	creq = &nkreq->creq;
+	creq->flags = skreq->base.flags;
+	creq->gfp = (skreq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+		     GFP_KERNEL : GFP_ATOMIC;
+
+	/* fill the request */
+	creq->ctrl.value = 0;
+	creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC;
+	creq->ctrl.s.arg = (enc ? ENCRYPT : DECRYPT);
+	/* param0: length of the data to be encrypted */
+	creq->gph.param0 = cpu_to_be16(skreq->cryptlen);
+	creq->gph.param1 = 0;
+	/* param2: encryption data offset */
+	creq->gph.param2 = cpu_to_be16(ivsize);
+	creq->gph.param3 = 0;
+
+	creq->ctx_handle = nctx->u.ctx_handle;
+	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
+
+	ret = alloc_src_sglist(skreq, ivsize);
+	if (ret)
+		return ret;
+
+	ret = alloc_dst_sglist(skreq, ivsize);
+	if (ret) {
+		free_src_sglist(skreq);
+		return ret;
+	}
+
+	/* send the crypto request */
+	return nitrox_process_se_request(nctx->ndev, creq,
+					 nitrox_skcipher_callback, skreq);
+}
+
+static int nitrox_aes_encrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, true);
+}
+
+static int nitrox_aes_decrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, false);
+}
+
+static int nitrox_3des_setkey(struct crypto_skcipher *cipher,
+			      const u8 *key, unsigned int keylen)
+{
+	if (keylen != DES3_EDE_KEY_SIZE) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	return nitrox_skcipher_setkey(cipher, 0, key, keylen);
+}
+
+static int nitrox_3des_encrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, true);
+}
+
+static int nitrox_3des_decrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, false);
+}
+
+static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher,
+				 const u8 *key, unsigned int keylen)
+{
+	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
+	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
+	struct flexi_crypto_context *fctx;
+	int aes_keylen, ret;
+
+	ret = xts_check_key(tfm, key, keylen);
+	if (ret)
+		return ret;
+
+	keylen /= 2;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	fctx = nctx->u.fctx;
+	/* copy KEY2 */
+	memcpy(fctx->auth.u.key2, (key + keylen), keylen);
+
+	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
+}
+
+static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher,
+					 const u8 *key, unsigned int keylen)
+{
+	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
+	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
+	struct flexi_crypto_context *fctx;
+	int aes_keylen;
+
+	if (keylen < CTR_RFC3686_NONCE_SIZE)
+		return -EINVAL;
+
+	fctx = nctx->u.fctx;
+
+	memcpy(fctx->crypto.iv, key + (keylen - CTR_RFC3686_NONCE_SIZE),
+	       CTR_RFC3686_NONCE_SIZE);
+
+	keylen -= CTR_RFC3686_NONCE_SIZE;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
+}
+
+static struct skcipher_alg nitrox_skciphers[] = { {
+	.base = {
+		.cra_name = "cbc(aes)",
+		.cra_driver_name = "n5_cbc(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "ecb(aes)",
+		.cra_driver_name = "n5_ecb(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "cfb(aes)",
+		.cra_driver_name = "n5_cfb(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "xts(aes)",
+		.cra_driver_name = "n5_xts(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = 2 * AES_MIN_KEY_SIZE,
+	.max_keysize = 2 * AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_xts_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "rfc3686(ctr(aes))",
+		.cra_driver_name = "n5_rfc3686(ctr(aes))",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = 1,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
+	.ivsize = CTR_RFC3686_IV_SIZE,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+	.setkey = nitrox_aes_ctr_rfc3686_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+}, {
+	.base = {
+		.cra_name = "cts(cbc(aes))",
+		.cra_driver_name = "n5_cts(cbc(aes))",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_type = &crypto_ablkcipher_type,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "cbc(des3_ede)",
+		.cra_driver_name = "n5_cbc(des3_ede)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = DES3_EDE_KEY_SIZE,
+	.max_keysize = DES3_EDE_KEY_SIZE,
+	.ivsize = DES3_EDE_BLOCK_SIZE,
+	.setkey = nitrox_3des_setkey,
+	.encrypt = nitrox_3des_encrypt,
+	.decrypt = nitrox_3des_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "ecb(des3_ede)",
+		.cra_driver_name = "n5_ecb(des3_ede)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = DES3_EDE_KEY_SIZE,
+	.max_keysize = DES3_EDE_KEY_SIZE,
+	.ivsize = DES3_EDE_BLOCK_SIZE,
+	.setkey = nitrox_3des_setkey,
+	.encrypt = nitrox_3des_encrypt,
+	.decrypt = nitrox_3des_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}
+
+};
+
+int nitrox_register_skciphers(void)
+{
+	return crypto_register_skciphers(nitrox_skciphers,
+					 ARRAY_SIZE(nitrox_skciphers));
+}
+
+void nitrox_unregister_skciphers(void)
+{
+	crypto_unregister_skciphers(nitrox_skciphers,
+				    ARRAY_SIZE(nitrox_skciphers));
+}
-- 
cgit v1.2.3-59-g8ed1b


From f9c9bdb5131eee60dc3b92e5126d4c0e291703e2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 15 Dec 2018 12:40:17 -0800
Subject: crypto: x86/chacha - avoid sleeping under kernel_fpu_begin()

Passing atomic=true to skcipher_walk_virt() only makes the later
skcipher_walk_done() calls use atomic memory allocations, not
skcipher_walk_virt() itself.  Thus, we have to move it outside of the
preemption-disabled region (kernel_fpu_begin()/kernel_fpu_end()).

(skcipher_walk_virt() only allocates memory for certain layouts of the
input scatterlist, hence why I didn't notice this earlier...)

Reported-by: syzbot+9bf843c33f782d73ae7d@syzkaller.appspotmail.com
Fixes: 4af78261870a ("crypto: x86/chacha20 - add XChaCha20 support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/chacha_glue.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index 9b1d3fac4943..45c1c4143176 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -127,30 +127,27 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
-static int chacha_simd_stream_xor(struct skcipher_request *req,
+static int chacha_simd_stream_xor(struct skcipher_walk *walk,
 				  struct chacha_ctx *ctx, u8 *iv)
 {
 	u32 *state, state_buf[16 + 2] __aligned(8);
-	struct skcipher_walk walk;
 	int next_yield = 4096; /* bytes until next FPU yield */
-	int err;
+	int err = 0;
 
 	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
 	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 
-	err = skcipher_walk_virt(&walk, req, true);
-
 	crypto_chacha_init(state, ctx, iv);
 
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
+	while (walk->nbytes > 0) {
+		unsigned int nbytes = walk->nbytes;
 
-		if (nbytes < walk.total) {
-			nbytes = round_down(nbytes, walk.stride);
+		if (nbytes < walk->total) {
+			nbytes = round_down(nbytes, walk->stride);
 			next_yield -= nbytes;
 		}
 
-		chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+		chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
 			      nbytes, ctx->nrounds);
 
 		if (next_yield <= 0) {
@@ -160,7 +157,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req,
 			next_yield = 4096;
 		}
 
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+		err = skcipher_walk_done(walk, walk->nbytes - nbytes);
 	}
 
 	return err;
@@ -170,13 +167,18 @@ static int chacha_simd(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	int err;
 
 	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
 		return crypto_chacha_crypt(req);
 
+	err = skcipher_walk_virt(&walk, req, true);
+	if (err)
+		return err;
+
 	kernel_fpu_begin();
-	err = chacha_simd_stream_xor(req, ctx, req->iv);
+	err = chacha_simd_stream_xor(&walk, ctx, req->iv);
 	kernel_fpu_end();
 	return err;
 }
@@ -185,6 +187,7 @@ static int xchacha_simd(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	struct chacha_ctx subctx;
 	u32 *state, state_buf[16 + 2] __aligned(8);
 	u8 real_iv[16];
@@ -193,6 +196,10 @@ static int xchacha_simd(struct skcipher_request *req)
 	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
 		return crypto_xchacha_crypt(req);
 
+	err = skcipher_walk_virt(&walk, req, true);
+	if (err)
+		return err;
+
 	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
 	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
 	crypto_chacha_init(state, ctx, req->iv);
@@ -204,7 +211,7 @@ static int xchacha_simd(struct skcipher_request *req)
 
 	memcpy(&real_iv[0], req->iv + 24, 8);
 	memcpy(&real_iv[8], req->iv + 16, 8);
-	err = chacha_simd_stream_xor(req, &subctx, real_iv);
+	err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
 
 	kernel_fpu_end();
 
-- 
cgit v1.2.3-59-g8ed1b


From bb648291fc04c49197561939b8bfea0ada42bce3 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 15 Dec 2018 12:41:53 -0800
Subject: crypto: skcipher - add might_sleep() to skcipher_walk_virt()

skcipher_walk_virt() can still sleep even with atomic=true, since that
only affects the later calls to skcipher_walk_done().  But,
skcipher_walk_virt() only has to allocate memory for some input data
layouts, so incorrectly calling it with preemption disabled can go
undetected.  Use might_sleep() so that it's detected reliably.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/skcipher.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 17be8d9c714e..41b4f7f27f45 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -474,6 +474,8 @@ int skcipher_walk_virt(struct skcipher_walk *walk,
 {
 	int err;
 
+	might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP);
+
 	walk->flags &= ~SKCIPHER_WALK_PHYS;
 
 	err = skcipher_walk_skcipher(walk, req);
-- 
cgit v1.2.3-59-g8ed1b


From 101b53d91d57ebcc13cb5fbd437b1230457ba9e2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 15 Dec 2018 12:42:52 -0800
Subject: crypto: salsa20-generic - don't unnecessarily use atomic walk

salsa20-generic doesn't use SIMD instructions or otherwise disable
preemption, so passing atomic=true to skcipher_walk_virt() is
unnecessary.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/salsa20_generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/salsa20_generic.c b/crypto/salsa20_generic.c
index 8c77bc78a09f..00fce32ae17a 100644
--- a/crypto/salsa20_generic.c
+++ b/crypto/salsa20_generic.c
@@ -159,7 +159,7 @@ static int salsa20_crypt(struct skcipher_request *req)
 	u32 state[16];
 	int err;
 
-	err = skcipher_walk_virt(&walk, req, true);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	salsa20_init(state, ctx, walk.iv);
 
-- 
cgit v1.2.3-59-g8ed1b


From dec5d0db0de7271c7616d713f6c434c6366c9bfb Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 16 Dec 2018 15:00:30 -0800
Subject: crypto: cavium/nitrox - Fix build with !CONFIG_DEBUG_FS

Fixes: cf718eaa8f9b ("crypto: cavium/nitrox - Enabled Mailbox support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/cavium/nitrox/nitrox_debugfs.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
index 7b701ea6227a..a8d85ffa619c 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
@@ -13,9 +13,8 @@ static inline int nitrox_debugfs_init(struct nitrox_device *ndev)
 	return 0;
 }
 
-static inline int nitrox_sriov_debugfs_init(struct nitrox_device *ndev)
+static inline void nitrox_debugfs_exit(struct nitrox_device *ndev)
 {
-	return 0;
 }
 #endif /* !CONFIG_DEBUG_FS */
 
-- 
cgit v1.2.3-59-g8ed1b


From c79b411eaa7257204f89c30651c45cea22278769 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 16 Dec 2018 15:55:06 -0800
Subject: crypto: skcipher - remove remnants of internal IV generators

Remove dead code related to internal IV generators, which are no longer
used since they've been replaced with the "seqiv" and "echainiv"
templates.  The removed code includes:

- The "givcipher" (GIVCIPHER) algorithm type.  No algorithms are
  registered with this type anymore, so it's unneeded.

- The "const char *geniv" member of aead_alg, ablkcipher_alg, and
  blkcipher_alg.  A few algorithms still set this, but it isn't used
  anymore except to show via /proc/crypto and CRYPTO_MSG_GETALG.
  Just hardcode "<default>" or "<none>" in those cases.

- The 'skcipher_givcrypt_request' structure, which is never used.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/api.rst          |  9 -----
 Documentation/crypto/architecture.rst | 31 +++-----------
 crypto/ablkcipher.c                   | 76 +----------------------------------
 crypto/blkcipher.c                    |  6 +--
 crypto/cryptd.c                       |  4 +-
 crypto/ctr.c                          |  2 -
 crypto/skcipher.c                     |  6 +--
 drivers/crypto/bcm/cipher.c           |  1 -
 drivers/crypto/chelsio/chcr_algo.c    |  1 -
 drivers/crypto/ixp4xx_crypto.c        |  5 ---
 drivers/crypto/nx/nx-aes-ctr.c        |  1 -
 drivers/crypto/omap-aes.c             |  1 -
 drivers/crypto/picoxcell_crypto.c     |  3 +-
 drivers/crypto/talitos.c              |  1 -
 include/crypto/aead.h                 |  3 --
 include/crypto/internal/skcipher.h    |  2 -
 include/crypto/skcipher.h             | 13 ------
 include/linux/crypto.h                | 34 ++--------------
 18 files changed, 17 insertions(+), 182 deletions(-)

diff --git a/Documentation/crypto/api.rst b/Documentation/crypto/api.rst
index 2e519193ab4a..b91b31736df8 100644
--- a/Documentation/crypto/api.rst
+++ b/Documentation/crypto/api.rst
@@ -1,15 +1,6 @@
 Programming Interface
 =====================
 
-Please note that the kernel crypto API contains the AEAD givcrypt API
-(crypto_aead_giv\* and aead_givcrypt\* function calls in
-include/crypto/aead.h). This API is obsolete and will be removed in the
-future. To obtain the functionality of an AEAD cipher with internal IV
-generation, use the IV generator as a regular cipher. For example,
-rfc4106(gcm(aes)) is the AEAD cipher with external IV generation and
-seqniv(rfc4106(gcm(aes))) implies that the kernel crypto API generates
-the IV. Different IV generators are available.
-
 .. class:: toc-title
 
 	   Table of contents
diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index ca2d09b991f5..ee8ff0762d7f 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -157,10 +157,6 @@ applicable to a cipher, it is not displayed:
 
    -  rng for random number generator
 
-   -  givcipher for cipher with associated IV generator (see the geniv
-      entry below for the specification of the IV generator type used by
-      the cipher implementation)
-
    -  kpp for a Key-agreement Protocol Primitive (KPP) cipher such as
       an ECDH or DH implementation
 
@@ -174,16 +170,7 @@ applicable to a cipher, it is not displayed:
 
 -  digestsize: output size of the message digest
 
--  geniv: IV generation type:
-
-   -  eseqiv for encrypted sequence number based IV generation
-
-   -  seqiv for sequence number based IV generation
-
-   -  chainiv for chain iv generation
-
-   -  <builtin> is a marker that the cipher implements IV generation and
-      handling as it is specific to the given cipher
+-  geniv: IV generator (obsolete)
 
 Key Sizes
 ---------
@@ -218,10 +205,6 @@ the aforementioned cipher types:
 
 -  CRYPTO_ALG_TYPE_ABLKCIPHER Asynchronous multi-block cipher
 
--  CRYPTO_ALG_TYPE_GIVCIPHER Asynchronous multi-block cipher packed
-   together with an IV generator (see geniv field in the /proc/crypto
-   listing for the known IV generators)
-
 -  CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
    an ECDH or DH implementation
 
@@ -338,18 +321,14 @@ uses the API applicable to the cipher type specified for the block.
 
 The following call sequence is applicable when the IPSEC layer triggers
 an encryption operation with the esp_output function. During
-configuration, the administrator set up the use of rfc4106(gcm(aes)) as
-the cipher for ESP. The following call sequence is now depicted in the
-ASCII art above:
+configuration, the administrator set up the use of seqiv(rfc4106(gcm(aes)))
+as the cipher for ESP. The following call sequence is now depicted in
+the ASCII art above:
 
 1. esp_output() invokes crypto_aead_encrypt() to trigger an
    encryption operation of the AEAD cipher with IV generator.
 
-   In case of GCM, the SEQIV implementation is registered as GIVCIPHER
-   in crypto_rfc4106_alloc().
-
-   The SEQIV performs its operation to generate an IV where the core
-   function is seqiv_geniv().
+   The SEQIV generates the IV.
 
 2. Now, SEQIV uses the AEAD API function calls to invoke the associated
    AEAD cipher. In our case, during the instantiation of SEQIV, the
diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index b5e9ce19d324..b339587073c3 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -368,8 +368,7 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 	memset(&rblkcipher, 0, sizeof(rblkcipher));
 
 	strscpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type));
-	strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<default>",
-		sizeof(rblkcipher.geniv));
+	strscpy(rblkcipher.geniv, "<default>", sizeof(rblkcipher.geniv));
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize;
@@ -399,7 +398,7 @@ static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "min keysize  : %u\n", ablkcipher->min_keysize);
 	seq_printf(m, "max keysize  : %u\n", ablkcipher->max_keysize);
 	seq_printf(m, "ivsize       : %u\n", ablkcipher->ivsize);
-	seq_printf(m, "geniv        : %s\n", ablkcipher->geniv ?: "<default>");
+	seq_printf(m, "geniv        : <default>\n");
 }
 
 const struct crypto_type crypto_ablkcipher_type = {
@@ -411,74 +410,3 @@ const struct crypto_type crypto_ablkcipher_type = {
 	.report = crypto_ablkcipher_report,
 };
 EXPORT_SYMBOL_GPL(crypto_ablkcipher_type);
-
-static int crypto_init_givcipher_ops(struct crypto_tfm *tfm, u32 type,
-				      u32 mask)
-{
-	struct ablkcipher_alg *alg = &tfm->__crt_alg->cra_ablkcipher;
-	struct ablkcipher_tfm *crt = &tfm->crt_ablkcipher;
-
-	if (alg->ivsize > PAGE_SIZE / 8)
-		return -EINVAL;
-
-	crt->setkey = tfm->__crt_alg->cra_flags & CRYPTO_ALG_GENIV ?
-		      alg->setkey : setkey;
-	crt->encrypt = alg->encrypt;
-	crt->decrypt = alg->decrypt;
-	crt->base = __crypto_ablkcipher_cast(tfm);
-	crt->ivsize = alg->ivsize;
-
-	return 0;
-}
-
-#ifdef CONFIG_NET
-static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_blkcipher rblkcipher;
-
-	memset(&rblkcipher, 0, sizeof(rblkcipher));
-
-	strscpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type));
-	strscpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<built-in>",
-		sizeof(rblkcipher.geniv));
-
-	rblkcipher.blocksize = alg->cra_blocksize;
-	rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize;
-	rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize;
-	rblkcipher.ivsize = alg->cra_ablkcipher.ivsize;
-
-	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		       sizeof(rblkcipher), &rblkcipher);
-}
-#else
-static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return -ENOSYS;
-}
-#endif
-
-static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__maybe_unused;
-static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-{
-	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
-
-	seq_printf(m, "type         : givcipher\n");
-	seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
-					     "yes" : "no");
-	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
-	seq_printf(m, "min keysize  : %u\n", ablkcipher->min_keysize);
-	seq_printf(m, "max keysize  : %u\n", ablkcipher->max_keysize);
-	seq_printf(m, "ivsize       : %u\n", ablkcipher->ivsize);
-	seq_printf(m, "geniv        : %s\n", ablkcipher->geniv ?: "<built-in>");
-}
-
-const struct crypto_type crypto_givcipher_type = {
-	.ctxsize = crypto_ablkcipher_ctxsize,
-	.init = crypto_init_givcipher_ops,
-#ifdef CONFIG_PROC_FS
-	.show = crypto_givcipher_show,
-#endif
-	.report = crypto_givcipher_report,
-};
-EXPORT_SYMBOL_GPL(crypto_givcipher_type);
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index 193237514e90..c5398bd54942 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -510,8 +510,7 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 	memset(&rblkcipher, 0, sizeof(rblkcipher));
 
 	strscpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type));
-	strscpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "<default>",
-		sizeof(rblkcipher.geniv));
+	strscpy(rblkcipher.geniv, "<default>", sizeof(rblkcipher.geniv));
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
@@ -537,8 +536,7 @@ static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "min keysize  : %u\n", alg->cra_blkcipher.min_keysize);
 	seq_printf(m, "max keysize  : %u\n", alg->cra_blkcipher.max_keysize);
 	seq_printf(m, "ivsize       : %u\n", alg->cra_blkcipher.ivsize);
-	seq_printf(m, "geniv        : %s\n", alg->cra_blkcipher.geniv ?:
-					     "<default>");
+	seq_printf(m, "geniv        : <default>\n");
 }
 
 const struct crypto_type crypto_blkcipher_type = {
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index 7118fb5efbaa..5640e5db7bdb 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -422,8 +422,6 @@ static int cryptd_create_blkcipher(struct crypto_template *tmpl,
 	inst->alg.cra_ablkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
 	inst->alg.cra_ablkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
 
-	inst->alg.cra_ablkcipher.geniv = alg->cra_blkcipher.geniv;
-
 	inst->alg.cra_ctxsize = sizeof(struct cryptd_blkcipher_ctx);
 
 	inst->alg.cra_init = cryptd_blkcipher_init_tfm;
@@ -1174,7 +1172,7 @@ struct cryptd_ablkcipher *cryptd_alloc_ablkcipher(const char *alg_name,
 		return ERR_PTR(-EINVAL);
 	type = crypto_skcipher_type(type);
 	mask &= ~CRYPTO_ALG_TYPE_MASK;
-	mask |= (CRYPTO_ALG_GENIV | CRYPTO_ALG_TYPE_BLKCIPHER_MASK);
+	mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK;
 	tfm = crypto_alloc_base(cryptd_alg_name, type, mask);
 	if (IS_ERR(tfm))
 		return ERR_CAST(tfm);
diff --git a/crypto/ctr.c b/crypto/ctr.c
index 435b75bd619e..30f3946efc6d 100644
--- a/crypto/ctr.c
+++ b/crypto/ctr.c
@@ -233,8 +233,6 @@ static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb)
 	inst->alg.cra_blkcipher.encrypt = crypto_ctr_crypt;
 	inst->alg.cra_blkcipher.decrypt = crypto_ctr_crypt;
 
-	inst->alg.cra_blkcipher.geniv = "chainiv";
-
 out:
 	crypto_mod_put(alg);
 	return inst;
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 41b4f7f27f45..2a969296bc24 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -579,8 +579,7 @@ static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg)
 	if (alg->cra_type == &crypto_blkcipher_type)
 		return sizeof(struct crypto_blkcipher *);
 
-	if (alg->cra_type == &crypto_ablkcipher_type ||
-	    alg->cra_type == &crypto_givcipher_type)
+	if (alg->cra_type == &crypto_ablkcipher_type)
 		return sizeof(struct crypto_ablkcipher *);
 
 	return crypto_alg_extsize(alg);
@@ -844,8 +843,7 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
 	if (tfm->__crt_alg->cra_type == &crypto_blkcipher_type)
 		return crypto_init_skcipher_ops_blkcipher(tfm);
 
-	if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type ||
-	    tfm->__crt_alg->cra_type == &crypto_givcipher_type)
+	if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type)
 		return crypto_init_skcipher_ops_ablkcipher(tfm);
 
 	skcipher->setkey = skcipher_setkey;
diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c
index 2ce3a16d3d10..c9393ffb70ed 100644
--- a/drivers/crypto/bcm/cipher.c
+++ b/drivers/crypto/bcm/cipher.c
@@ -3868,7 +3868,6 @@ static struct iproc_alg_s driver_algs[] = {
 			.cra_driver_name = "ctr-aes-iproc",
 			.cra_blocksize = AES_BLOCK_SIZE,
 			.cra_ablkcipher = {
-					   /* .geniv = "chainiv", */
 					   .min_keysize = AES_MIN_KEY_SIZE,
 					   .max_keysize = AES_MAX_KEY_SIZE,
 					   .ivsize = AES_BLOCK_SIZE,
diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index eedc33128da4..bcef76508dfa 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -3816,7 +3816,6 @@ static struct chcr_alg_template driver_algs[] = {
 				.setkey		= chcr_aes_rfc3686_setkey,
 				.encrypt	= chcr_aes_encrypt,
 				.decrypt	= chcr_aes_decrypt,
-				.geniv          = "seqiv",
 			}
 		}
 	},
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 27f7dad2d45d..19fba998b86b 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -1194,7 +1194,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= DES_KEY_SIZE,
 			.max_keysize	= DES_KEY_SIZE,
 			.ivsize		= DES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1221,7 +1220,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= DES3_EDE_KEY_SIZE,
 			.max_keysize	= DES3_EDE_KEY_SIZE,
 			.ivsize		= DES3_EDE_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1247,7 +1245,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= AES_MIN_KEY_SIZE,
 			.max_keysize	= AES_MAX_KEY_SIZE,
 			.ivsize		= AES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1273,7 +1270,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= AES_MIN_KEY_SIZE,
 			.max_keysize	= AES_MAX_KEY_SIZE,
 			.ivsize		= AES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1287,7 +1283,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= AES_MIN_KEY_SIZE,
 			.max_keysize	= AES_MAX_KEY_SIZE,
 			.ivsize		= AES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			.setkey		= ablk_rfc3686_setkey,
 			.encrypt	= ablk_rfc3686_crypt,
 			.decrypt	= ablk_rfc3686_crypt }
diff --git a/drivers/crypto/nx/nx-aes-ctr.c b/drivers/crypto/nx/nx-aes-ctr.c
index 898c0a280511..5a26fcd75d2d 100644
--- a/drivers/crypto/nx/nx-aes-ctr.c
+++ b/drivers/crypto/nx/nx-aes-ctr.c
@@ -159,7 +159,6 @@ struct crypto_alg nx_ctr3686_aes_alg = {
 		.min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
 		.max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
 		.ivsize      = CTR_RFC3686_IV_SIZE,
-		.geniv       = "seqiv",
 		.setkey      = ctr3686_aes_nx_set_key,
 		.encrypt     = ctr3686_aes_nx_crypt,
 		.decrypt     = ctr3686_aes_nx_crypt,
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
index 4c0ea8142923..0120feb2d746 100644
--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@@ -749,7 +749,6 @@ static struct crypto_alg algs_ctr[] = {
 	.cra_u.ablkcipher = {
 		.min_keysize	= AES_MIN_KEY_SIZE,
 		.max_keysize	= AES_MAX_KEY_SIZE,
-		.geniv		= "eseqiv",
 		.ivsize		= AES_BLOCK_SIZE,
 		.setkey		= omap_aes_setkey,
 		.encrypt	= omap_aes_ctr_encrypt,
diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c
index a28f1d18fe01..17068b55fea5 100644
--- a/drivers/crypto/picoxcell_crypto.c
+++ b/drivers/crypto/picoxcell_crypto.c
@@ -1585,8 +1585,7 @@ static struct spacc_alg l2_engine_algs[] = {
 			.cra_name = "f8(kasumi)",
 			.cra_driver_name = "f8-kasumi-picoxcell",
 			.cra_priority = SPACC_CRYPTO_ALG_PRIORITY,
-			.cra_flags = CRYPTO_ALG_TYPE_GIVCIPHER |
-					CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 					CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = 8,
 			.cra_ctxsize = sizeof(struct spacc_ablk_ctx),
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 6988012deca4..45e20707cef8 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -3155,7 +3155,6 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 		alg->cra_ablkcipher.setkey = ablkcipher_setkey;
 		alg->cra_ablkcipher.encrypt = ablkcipher_encrypt;
 		alg->cra_ablkcipher.decrypt = ablkcipher_decrypt;
-		alg->cra_ablkcipher.geniv = "eseqiv";
 		break;
 	case CRYPTO_ALG_TYPE_AEAD:
 		alg = &t_alg->algt.alg.aead.base;
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index b7b8d24cf765..9ad595f97c65 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -115,7 +115,6 @@ struct aead_request {
  * @setkey: see struct skcipher_alg
  * @encrypt: see struct skcipher_alg
  * @decrypt: see struct skcipher_alg
- * @geniv: see struct skcipher_alg
  * @ivsize: see struct skcipher_alg
  * @chunksize: see struct skcipher_alg
  * @init: Initialize the cryptographic transformation object. This function
@@ -142,8 +141,6 @@ struct aead_alg {
 	int (*init)(struct crypto_aead *tfm);
 	void (*exit)(struct crypto_aead *tfm);
 
-	const char *geniv;
-
 	unsigned int ivsize;
 	unsigned int maxauthsize;
 	unsigned int chunksize;
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index e42f7063f245..453e867b4bd9 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -70,8 +70,6 @@ struct skcipher_walk {
 	unsigned int alignmask;
 };
 
-extern const struct crypto_type crypto_givcipher_type;
-
 static inline struct crypto_instance *skcipher_crypto_instance(
 	struct skcipher_instance *inst)
 {
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 480f8301a47d..e555294ed77f 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -39,19 +39,6 @@ struct skcipher_request {
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-/**
- *	struct skcipher_givcrypt_request - Crypto request with IV generation
- *	@seq: Sequence number for IV generation
- *	@giv: Space for generated IV
- *	@creq: The crypto request itself
- */
-struct skcipher_givcrypt_request {
-	u64 seq;
-	u8 *giv;
-
-	struct ablkcipher_request creq;
-};
-
 struct crypto_skcipher {
 	int (*setkey)(struct crypto_skcipher *tfm, const u8 *key,
 	              unsigned int keylen);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 81e178fb9ed8..902ec171fc6d 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -49,7 +49,6 @@
 #define CRYPTO_ALG_TYPE_BLKCIPHER	0x00000004
 #define CRYPTO_ALG_TYPE_ABLKCIPHER	0x00000005
 #define CRYPTO_ALG_TYPE_SKCIPHER	0x00000005
-#define CRYPTO_ALG_TYPE_GIVCIPHER	0x00000006
 #define CRYPTO_ALG_TYPE_KPP		0x00000008
 #define CRYPTO_ALG_TYPE_ACOMPRESS	0x0000000a
 #define CRYPTO_ALG_TYPE_SCOMPRESS	0x0000000b
@@ -76,12 +75,6 @@
  */
 #define CRYPTO_ALG_NEED_FALLBACK	0x00000100
 
-/*
- * This bit is set for symmetric key ciphers that have already been wrapped
- * with a generic IV generator to prevent them from being wrapped again.
- */
-#define CRYPTO_ALG_GENIV		0x00000200
-
 /*
  * Set if the algorithm has passed automated run-time testing.  Note that
  * if there is no run-time testing for a given algorithm it is considered
@@ -157,7 +150,6 @@ struct crypto_async_request;
 struct crypto_blkcipher;
 struct crypto_tfm;
 struct crypto_type;
-struct skcipher_givcrypt_request;
 
 typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err);
 
@@ -246,31 +238,16 @@ struct cipher_desc {
  *	     be called in parallel with the same transformation object.
  * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt
  *	     and the conditions are exactly the same.
- * @givencrypt: Update the IV for encryption. With this function, a cipher
- *	        implementation may provide the function on how to update the IV
- *	        for encryption.
- * @givdecrypt: Update the IV for decryption. This is the reverse of
- *	        @givencrypt .
- * @geniv: The transformation implementation may use an "IV generator" provided
- *	   by the kernel crypto API. Several use cases have a predefined
- *	   approach how IVs are to be updated. For such use cases, the kernel
- *	   crypto API provides ready-to-use implementations that can be
- *	   referenced with this variable.
  * @ivsize: IV size applicable for transformation. The consumer must provide an
  *	    IV of exactly that size to perform the encrypt or decrypt operation.
  *
- * All fields except @givencrypt , @givdecrypt , @geniv and @ivsize are
- * mandatory and must be filled.
+ * All fields except @ivsize are mandatory and must be filled.
  */
 struct ablkcipher_alg {
 	int (*setkey)(struct crypto_ablkcipher *tfm, const u8 *key,
 	              unsigned int keylen);
 	int (*encrypt)(struct ablkcipher_request *req);
 	int (*decrypt)(struct ablkcipher_request *req);
-	int (*givencrypt)(struct skcipher_givcrypt_request *req);
-	int (*givdecrypt)(struct skcipher_givcrypt_request *req);
-
-	const char *geniv;
 
 	unsigned int min_keysize;
 	unsigned int max_keysize;
@@ -284,10 +261,9 @@ struct ablkcipher_alg {
  * @setkey: see struct ablkcipher_alg
  * @encrypt: see struct ablkcipher_alg
  * @decrypt: see struct ablkcipher_alg
- * @geniv: see struct ablkcipher_alg
  * @ivsize: see struct ablkcipher_alg
  *
- * All fields except @geniv and @ivsize are mandatory and must be filled.
+ * All fields except @ivsize are mandatory and must be filled.
  */
 struct blkcipher_alg {
 	int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
@@ -299,8 +275,6 @@ struct blkcipher_alg {
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes);
 
-	const char *geniv;
-
 	unsigned int min_keysize;
 	unsigned int max_keysize;
 	unsigned int ivsize;
@@ -931,14 +905,14 @@ static inline struct crypto_ablkcipher *__crypto_ablkcipher_cast(
 
 static inline u32 crypto_skcipher_type(u32 type)
 {
-	type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+	type &= ~CRYPTO_ALG_TYPE_MASK;
 	type |= CRYPTO_ALG_TYPE_BLKCIPHER;
 	return type;
 }
 
 static inline u32 crypto_skcipher_mask(u32 mask)
 {
-	mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+	mask &= ~CRYPTO_ALG_TYPE_MASK;
 	mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK;
 	return mask;
 }
-- 
cgit v1.2.3-59-g8ed1b